{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 22.0,
  "eval_steps": 500,
  "global_step": 6930,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.31746031746031744,
      "grad_norm": 2.4596855640411377,
      "learning_rate": 4.9500000000000004e-05,
      "loss": 12.5365,
      "step": 100
    },
    {
      "epoch": 0.6349206349206349,
      "grad_norm": 1.3071926832199097,
      "learning_rate": 4.927525622254759e-05,
      "loss": 3.0185,
      "step": 200
    },
    {
      "epoch": 0.9523809523809523,
      "grad_norm": 1.17218816280365,
      "learning_rate": 4.8543191800878476e-05,
      "loss": 2.4162,
      "step": 300
    },
    {
      "epoch": 1.2698412698412698,
      "grad_norm": 1.175925374031067,
      "learning_rate": 4.7811127379209374e-05,
      "loss": 2.2303,
      "step": 400
    },
    {
      "epoch": 1.5873015873015874,
      "grad_norm": 1.0879369974136353,
      "learning_rate": 4.7079062957540264e-05,
      "loss": 2.1309,
      "step": 500
    },
    {
      "epoch": 1.5873015873015874,
      "eval_loss": 1.667625904083252,
      "eval_runtime": 0.8881,
      "eval_samples_per_second": 1126.062,
      "eval_steps_per_second": 6.756,
      "step": 500
    },
    {
      "epoch": 1.9047619047619047,
      "grad_norm": 1.031023621559143,
      "learning_rate": 4.634699853587116e-05,
      "loss": 2.047,
      "step": 600
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 1.00222647190094,
      "learning_rate": 4.561493411420205e-05,
      "loss": 1.9886,
      "step": 700
    },
    {
      "epoch": 2.5396825396825395,
      "grad_norm": 0.9519165754318237,
      "learning_rate": 4.488286969253295e-05,
      "loss": 1.927,
      "step": 800
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 0.9294150471687317,
      "learning_rate": 4.4150805270863835e-05,
      "loss": 1.9148,
      "step": 900
    },
    {
      "epoch": 3.1746031746031744,
      "grad_norm": 0.9498035311698914,
      "learning_rate": 4.341874084919473e-05,
      "loss": 1.8698,
      "step": 1000
    },
    {
      "epoch": 3.1746031746031744,
      "eval_loss": 1.5154762268066406,
      "eval_runtime": 1.1843,
      "eval_samples_per_second": 844.397,
      "eval_steps_per_second": 5.066,
      "step": 1000
    },
    {
      "epoch": 3.492063492063492,
      "grad_norm": 1.019351601600647,
      "learning_rate": 4.268667642752562e-05,
      "loss": 1.839,
      "step": 1100
    },
    {
      "epoch": 3.8095238095238093,
      "grad_norm": 0.9561589360237122,
      "learning_rate": 4.195461200585652e-05,
      "loss": 1.8173,
      "step": 1200
    },
    {
      "epoch": 4.1269841269841265,
      "grad_norm": 0.8755239248275757,
      "learning_rate": 4.122254758418741e-05,
      "loss": 1.7851,
      "step": 1300
    },
    {
      "epoch": 4.444444444444445,
      "grad_norm": 0.9118295907974243,
      "learning_rate": 4.049048316251831e-05,
      "loss": 1.7639,
      "step": 1400
    },
    {
      "epoch": 4.761904761904762,
      "grad_norm": 0.9037424921989441,
      "learning_rate": 3.975841874084919e-05,
      "loss": 1.749,
      "step": 1500
    },
    {
      "epoch": 4.761904761904762,
      "eval_loss": 1.44046950340271,
      "eval_runtime": 0.8726,
      "eval_samples_per_second": 1146.026,
      "eval_steps_per_second": 6.876,
      "step": 1500
    },
    {
      "epoch": 5.079365079365079,
      "grad_norm": 0.8880825042724609,
      "learning_rate": 3.902635431918009e-05,
      "loss": 1.7312,
      "step": 1600
    },
    {
      "epoch": 5.396825396825397,
      "grad_norm": 0.8245419263839722,
      "learning_rate": 3.829428989751098e-05,
      "loss": 1.7068,
      "step": 1700
    },
    {
      "epoch": 5.714285714285714,
      "grad_norm": 0.7877866625785828,
      "learning_rate": 3.756222547584188e-05,
      "loss": 1.6987,
      "step": 1800
    },
    {
      "epoch": 6.031746031746032,
      "grad_norm": 0.8793995380401611,
      "learning_rate": 3.683016105417277e-05,
      "loss": 1.6946,
      "step": 1900
    },
    {
      "epoch": 6.349206349206349,
      "grad_norm": 0.8199811577796936,
      "learning_rate": 3.609809663250366e-05,
      "loss": 1.6516,
      "step": 2000
    },
    {
      "epoch": 6.349206349206349,
      "eval_loss": 1.3722814321517944,
      "eval_runtime": 0.9172,
      "eval_samples_per_second": 1090.295,
      "eval_steps_per_second": 6.542,
      "step": 2000
    },
    {
      "epoch": 6.666666666666667,
      "grad_norm": 0.91376131772995,
      "learning_rate": 3.536603221083455e-05,
      "loss": 1.6641,
      "step": 2100
    },
    {
      "epoch": 6.984126984126984,
      "grad_norm": 0.8099491000175476,
      "learning_rate": 3.463396778916545e-05,
      "loss": 1.6527,
      "step": 2200
    },
    {
      "epoch": 7.301587301587301,
      "grad_norm": 0.847446620464325,
      "learning_rate": 3.390190336749634e-05,
      "loss": 1.6255,
      "step": 2300
    },
    {
      "epoch": 7.619047619047619,
      "grad_norm": 0.8136289715766907,
      "learning_rate": 3.316983894582724e-05,
      "loss": 1.6239,
      "step": 2400
    },
    {
      "epoch": 7.936507936507937,
      "grad_norm": 0.9612435102462769,
      "learning_rate": 3.243777452415813e-05,
      "loss": 1.6153,
      "step": 2500
    },
    {
      "epoch": 7.936507936507937,
      "eval_loss": 1.3314929008483887,
      "eval_runtime": 0.8885,
      "eval_samples_per_second": 1125.515,
      "eval_steps_per_second": 6.753,
      "step": 2500
    },
    {
      "epoch": 8.253968253968253,
      "grad_norm": 0.7829731702804565,
      "learning_rate": 3.170571010248902e-05,
      "loss": 1.599,
      "step": 2600
    },
    {
      "epoch": 8.571428571428571,
      "grad_norm": 0.8344002962112427,
      "learning_rate": 3.097364568081991e-05,
      "loss": 1.5986,
      "step": 2700
    },
    {
      "epoch": 8.88888888888889,
      "grad_norm": 0.7536396384239197,
      "learning_rate": 3.0241581259150808e-05,
      "loss": 1.5854,
      "step": 2800
    },
    {
      "epoch": 9.206349206349206,
      "grad_norm": 0.8793928623199463,
      "learning_rate": 2.95095168374817e-05,
      "loss": 1.5609,
      "step": 2900
    },
    {
      "epoch": 9.523809523809524,
      "grad_norm": 0.8234173655509949,
      "learning_rate": 2.8777452415812596e-05,
      "loss": 1.5621,
      "step": 3000
    },
    {
      "epoch": 9.523809523809524,
      "eval_loss": 1.2933802604675293,
      "eval_runtime": 0.8982,
      "eval_samples_per_second": 1113.3,
      "eval_steps_per_second": 6.68,
      "step": 3000
    },
    {
      "epoch": 9.841269841269842,
      "grad_norm": 0.8043637275695801,
      "learning_rate": 2.8045387994143483e-05,
      "loss": 1.57,
      "step": 3100
    },
    {
      "epoch": 10.158730158730158,
      "grad_norm": 0.8538019061088562,
      "learning_rate": 2.731332357247438e-05,
      "loss": 1.5425,
      "step": 3200
    },
    {
      "epoch": 10.476190476190476,
      "grad_norm": 0.8133682608604431,
      "learning_rate": 2.6581259150805272e-05,
      "loss": 1.54,
      "step": 3300
    },
    {
      "epoch": 10.793650793650794,
      "grad_norm": 0.8028839826583862,
      "learning_rate": 2.5849194729136166e-05,
      "loss": 1.5422,
      "step": 3400
    },
    {
      "epoch": 11.11111111111111,
      "grad_norm": 0.8318579196929932,
      "learning_rate": 2.5117130307467057e-05,
      "loss": 1.5322,
      "step": 3500
    },
    {
      "epoch": 11.11111111111111,
      "eval_loss": 1.2671520709991455,
      "eval_runtime": 0.8988,
      "eval_samples_per_second": 1112.597,
      "eval_steps_per_second": 6.676,
      "step": 3500
    },
    {
      "epoch": 11.428571428571429,
      "grad_norm": 0.80867600440979,
      "learning_rate": 2.438506588579795e-05,
      "loss": 1.5208,
      "step": 3600
    },
    {
      "epoch": 11.746031746031747,
      "grad_norm": 0.843559980392456,
      "learning_rate": 2.3653001464128842e-05,
      "loss": 1.5205,
      "step": 3700
    },
    {
      "epoch": 12.063492063492063,
      "grad_norm": 0.8322616815567017,
      "learning_rate": 2.2920937042459736e-05,
      "loss": 1.5138,
      "step": 3800
    },
    {
      "epoch": 12.380952380952381,
      "grad_norm": 0.8656408190727234,
      "learning_rate": 2.218887262079063e-05,
      "loss": 1.5013,
      "step": 3900
    },
    {
      "epoch": 12.698412698412698,
      "grad_norm": 0.7414558529853821,
      "learning_rate": 2.145680819912152e-05,
      "loss": 1.5031,
      "step": 4000
    },
    {
      "epoch": 12.698412698412698,
      "eval_loss": 1.2427836656570435,
      "eval_runtime": 0.906,
      "eval_samples_per_second": 1103.725,
      "eval_steps_per_second": 6.622,
      "step": 4000
    },
    {
      "epoch": 13.015873015873016,
      "grad_norm": 0.8400952816009521,
      "learning_rate": 2.0724743777452415e-05,
      "loss": 1.5012,
      "step": 4100
    },
    {
      "epoch": 13.333333333333334,
      "grad_norm": 0.7461639642715454,
      "learning_rate": 1.999267935578331e-05,
      "loss": 1.4839,
      "step": 4200
    },
    {
      "epoch": 13.65079365079365,
      "grad_norm": 0.9285081028938293,
      "learning_rate": 1.92606149341142e-05,
      "loss": 1.4839,
      "step": 4300
    },
    {
      "epoch": 13.968253968253968,
      "grad_norm": 0.7774894833564758,
      "learning_rate": 1.8528550512445095e-05,
      "loss": 1.489,
      "step": 4400
    },
    {
      "epoch": 14.285714285714286,
      "grad_norm": 0.7801647782325745,
      "learning_rate": 1.779648609077599e-05,
      "loss": 1.477,
      "step": 4500
    },
    {
      "epoch": 14.285714285714286,
      "eval_loss": 1.22147798538208,
      "eval_runtime": 0.9123,
      "eval_samples_per_second": 1096.1,
      "eval_steps_per_second": 6.577,
      "step": 4500
    },
    {
      "epoch": 14.603174603174603,
      "grad_norm": 0.799261748790741,
      "learning_rate": 1.706442166910688e-05,
      "loss": 1.4712,
      "step": 4600
    },
    {
      "epoch": 14.920634920634921,
      "grad_norm": 0.7967658042907715,
      "learning_rate": 1.6332357247437774e-05,
      "loss": 1.4681,
      "step": 4700
    },
    {
      "epoch": 15.238095238095237,
      "grad_norm": 0.7955309152603149,
      "learning_rate": 1.5600292825768668e-05,
      "loss": 1.4566,
      "step": 4800
    },
    {
      "epoch": 15.555555555555555,
      "grad_norm": 0.9011721014976501,
      "learning_rate": 1.4868228404099562e-05,
      "loss": 1.4571,
      "step": 4900
    },
    {
      "epoch": 15.873015873015873,
      "grad_norm": 0.8107950687408447,
      "learning_rate": 1.4136163982430456e-05,
      "loss": 1.4607,
      "step": 5000
    },
    {
      "epoch": 15.873015873015873,
      "eval_loss": 1.204552173614502,
      "eval_runtime": 0.8938,
      "eval_samples_per_second": 1118.839,
      "eval_steps_per_second": 6.713,
      "step": 5000
    },
    {
      "epoch": 16.19047619047619,
      "grad_norm": 0.796794056892395,
      "learning_rate": 1.3404099560761349e-05,
      "loss": 1.4541,
      "step": 5100
    },
    {
      "epoch": 16.507936507936506,
      "grad_norm": 0.8143286108970642,
      "learning_rate": 1.2672035139092242e-05,
      "loss": 1.4518,
      "step": 5200
    },
    {
      "epoch": 16.825396825396826,
      "grad_norm": 0.8341066241264343,
      "learning_rate": 1.1939970717423134e-05,
      "loss": 1.4492,
      "step": 5300
    },
    {
      "epoch": 17.142857142857142,
      "grad_norm": 0.7626970410346985,
      "learning_rate": 1.1207906295754027e-05,
      "loss": 1.4451,
      "step": 5400
    },
    {
      "epoch": 17.46031746031746,
      "grad_norm": 0.8455602526664734,
      "learning_rate": 1.0475841874084919e-05,
      "loss": 1.4398,
      "step": 5500
    },
    {
      "epoch": 17.46031746031746,
      "eval_loss": 1.194530963897705,
      "eval_runtime": 0.9698,
      "eval_samples_per_second": 1031.168,
      "eval_steps_per_second": 6.187,
      "step": 5500
    },
    {
      "epoch": 17.77777777777778,
      "grad_norm": 0.8057031035423279,
      "learning_rate": 9.743777452415813e-06,
      "loss": 1.4361,
      "step": 5600
    },
    {
      "epoch": 18.095238095238095,
      "grad_norm": 0.7198513746261597,
      "learning_rate": 9.011713030746706e-06,
      "loss": 1.4358,
      "step": 5700
    },
    {
      "epoch": 18.41269841269841,
      "grad_norm": 0.8185477256774902,
      "learning_rate": 8.279648609077598e-06,
      "loss": 1.435,
      "step": 5800
    },
    {
      "epoch": 18.73015873015873,
      "grad_norm": 1.1578043699264526,
      "learning_rate": 7.5475841874084925e-06,
      "loss": 1.4316,
      "step": 5900
    },
    {
      "epoch": 19.047619047619047,
      "grad_norm": 0.7790259718894958,
      "learning_rate": 6.815519765739385e-06,
      "loss": 1.434,
      "step": 6000
    },
    {
      "epoch": 19.047619047619047,
      "eval_loss": 1.1841422319412231,
      "eval_runtime": 0.8601,
      "eval_samples_per_second": 1162.639,
      "eval_steps_per_second": 6.976,
      "step": 6000
    },
    {
      "epoch": 19.365079365079364,
      "grad_norm": 0.8408913612365723,
      "learning_rate": 6.083455344070278e-06,
      "loss": 1.4288,
      "step": 6100
    },
    {
      "epoch": 19.682539682539684,
      "grad_norm": 0.8236083388328552,
      "learning_rate": 5.351390922401171e-06,
      "loss": 1.426,
      "step": 6200
    },
    {
      "epoch": 20.0,
      "grad_norm": 3.35252046585083,
      "learning_rate": 4.619326500732064e-06,
      "loss": 1.4277,
      "step": 6300
    },
    {
      "epoch": 20.317460317460316,
      "grad_norm": 0.8612999320030212,
      "learning_rate": 3.887262079062958e-06,
      "loss": 1.4182,
      "step": 6400
    },
    {
      "epoch": 20.634920634920636,
      "grad_norm": 0.820543646812439,
      "learning_rate": 3.1551976573938506e-06,
      "loss": 1.429,
      "step": 6500
    },
    {
      "epoch": 20.634920634920636,
      "eval_loss": 1.1799591779708862,
      "eval_runtime": 0.8828,
      "eval_samples_per_second": 1132.769,
      "eval_steps_per_second": 6.797,
      "step": 6500
    },
    {
      "epoch": 20.952380952380953,
      "grad_norm": 0.7831932902336121,
      "learning_rate": 2.423133235724744e-06,
      "loss": 1.4241,
      "step": 6600
    },
    {
      "epoch": 21.26984126984127,
      "grad_norm": 0.8200610280036926,
      "learning_rate": 1.6910688140556369e-06,
      "loss": 1.4248,
      "step": 6700
    },
    {
      "epoch": 21.58730158730159,
      "grad_norm": 0.8267305493354797,
      "learning_rate": 9.5900439238653e-07,
      "loss": 1.4239,
      "step": 6800
    },
    {
      "epoch": 21.904761904761905,
      "grad_norm": 0.7583746910095215,
      "learning_rate": 2.2693997071742313e-07,
      "loss": 1.4158,
      "step": 6900
    }
  ],
  "logging_steps": 100,
  "max_steps": 6930,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 22,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.400954935720919e+17,
  "train_batch_size": 384,
  "trial_name": null,
  "trial_params": null
}