| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 500, | |
| "global_step": 800, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 5.274852752685547, | |
| "learning_rate": 0.00019525, | |
| "loss": 3.313, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 8.226648330688477, | |
| "learning_rate": 0.00019025000000000002, | |
| "loss": 2.2679, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 4.02825403213501, | |
| "learning_rate": 0.00018525, | |
| "loss": 0.7816, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 12.396927833557129, | |
| "learning_rate": 0.00018025000000000002, | |
| "loss": 0.6099, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 3.590737819671631, | |
| "learning_rate": 0.00017525, | |
| "loss": 0.4067, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 3.1620190143585205, | |
| "learning_rate": 0.00017025, | |
| "loss": 0.2761, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.800012588500977, | |
| "learning_rate": 0.00016525, | |
| "loss": 0.2304, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 3.7974071502685547, | |
| "learning_rate": 0.00016025000000000002, | |
| "loss": 0.1981, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 2.664266347885132, | |
| "learning_rate": 0.00015525, | |
| "loss": 0.1579, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 9.237320899963379, | |
| "learning_rate": 0.00015025, | |
| "loss": 0.1837, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 2.3988049030303955, | |
| "learning_rate": 0.00014525, | |
| "loss": 0.1168, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 6.759559631347656, | |
| "learning_rate": 0.00014025000000000002, | |
| "loss": 0.1017, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 1.841704249382019, | |
| "learning_rate": 0.00013525, | |
| "loss": 0.093, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 1.0974807739257812, | |
| "learning_rate": 0.00013025, | |
| "loss": 0.0944, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 1.219509243965149, | |
| "learning_rate": 0.00012525, | |
| "loss": 0.0921, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.0698885917663574, | |
| "learning_rate": 0.00012025, | |
| "loss": 0.0796, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 1.5936706066131592, | |
| "learning_rate": 0.00011525000000000001, | |
| "loss": 0.0647, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 3.975679636001587, | |
| "learning_rate": 0.00011025000000000001, | |
| "loss": 0.0711, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.7344488501548767, | |
| "learning_rate": 0.00010525000000000001, | |
| "loss": 0.0598, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.47527560591697693, | |
| "learning_rate": 0.00010025, | |
| "loss": 0.0688, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.04188907518982887, | |
| "eval_runtime": 42.5156, | |
| "eval_samples_per_second": 2.352, | |
| "eval_steps_per_second": 0.306, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 2.3210883140563965, | |
| "learning_rate": 9.525000000000001e-05, | |
| "loss": 0.0683, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 0.7220126986503601, | |
| "learning_rate": 9.025e-05, | |
| "loss": 0.0629, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 3.119802236557007, | |
| "learning_rate": 8.525000000000001e-05, | |
| "loss": 0.0498, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 2.0455057621002197, | |
| "learning_rate": 8.025e-05, | |
| "loss": 0.0464, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 5.6062140464782715, | |
| "learning_rate": 7.525e-05, | |
| "loss": 0.0471, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 1.0683833360671997, | |
| "learning_rate": 7.025e-05, | |
| "loss": 0.053, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 2.467642307281494, | |
| "learning_rate": 6.525e-05, | |
| "loss": 0.0544, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 0.8878470063209534, | |
| "learning_rate": 6.025000000000001e-05, | |
| "loss": 0.055, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 1.9636881351470947, | |
| "learning_rate": 5.525e-05, | |
| "loss": 0.042, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 1.3637828826904297, | |
| "learning_rate": 5.0249999999999995e-05, | |
| "loss": 0.0425, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 0.6462461352348328, | |
| "learning_rate": 4.525e-05, | |
| "loss": 0.0509, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 0.9533060193061829, | |
| "learning_rate": 4.025e-05, | |
| "loss": 0.0416, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 1.821641206741333, | |
| "learning_rate": 3.525e-05, | |
| "loss": 0.0434, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 0.9575638771057129, | |
| "learning_rate": 3.025e-05, | |
| "loss": 0.0427, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 0.5366125106811523, | |
| "learning_rate": 2.525e-05, | |
| "loss": 0.0403, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 0.7077822685241699, | |
| "learning_rate": 2.025e-05, | |
| "loss": 0.0426, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 0.817557692527771, | |
| "learning_rate": 1.525e-05, | |
| "loss": 0.0386, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 2.0137739181518555, | |
| "learning_rate": 1.025e-05, | |
| "loss": 0.0432, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 3.582106113433838, | |
| "learning_rate": 5.25e-06, | |
| "loss": 0.0413, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.3983885645866394, | |
| "learning_rate": 2.5000000000000004e-07, | |
| "loss": 0.0366, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.029658818617463112, | |
| "eval_runtime": 42.5185, | |
| "eval_samples_per_second": 2.352, | |
| "eval_steps_per_second": 0.306, | |
| "step": 800 | |
| } | |
| ], | |
| "logging_steps": 20, | |
| "max_steps": 800, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.83501161086976e+16, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |