{ "config": { "group_size": 8, "lr": 5e-06, "epochs": 2, "max_steps_per_traj": 8, "temperature": 1.0, "temp_start": 1.2, "temp_end": 0.3, "clip_grad": 1.0, "question_batch": 4, "rft_accept_exact_only": true, "rft_diversity_boost": 3.0, "rft_diversity_threshold": 0.5, "rft_force_diverse_rollouts": true }, "rewards": { "correct": 7.0, "partial": 1.5, "wrong": -1.5, "plan_first": 0.5, "verify_finish": 1.0, "no_verify": -0.5, "sig_correct": 0.5, "sig_ignored": -1.0, "step_cost": -0.05, "max_step": -2.0, "repeat3": -0.3, "plan_late": -0.5, "diversity": 1.5, "cross_verify": 1.0, "early_exit": 1.0, "single_model": -1.5, "long_wrong": -1.0, "error_recover": 1.0, "intermediate_answer": 0.3, "clear_verification": 0.2, "stuck_same_worker": -0.3, "low_confidence_switch": 0.4 }, "epochs": [ { "epoch": 1, "avg_reward": 6.3244140625, "avg_loss": 6.204556642713502, "accuracy": 71.54947916666666, "api": { "@qwen-coder": { "tag": "@qwen-coder", "model": "mistralai/Mistral-Small-24B-Instruct-2501", "provider": "together", "calls": 3188, "in_tok": 2723761, "out_tok": 912639, "cost_usd": 2.9091 }, "@qwen-general": { "tag": "@qwen-general", "model": "Qwen/Qwen2.5-7B-Instruct-Turbo", "provider": "together", "calls": 3215, "in_tok": 2910742, "out_tok": 450239, "cost_usd": 1.0083 }, "@kimi": { "tag": "@kimi", "model": "google/gemma-3n-E4B-it", "provider": "together", "calls": 3194, "in_tok": 2831854, "out_tok": 767376, "cost_usd": 1.4397 }, "_total": { "calls": 9597, "cost_usd": 5.3571 } } } ], "api": { "@qwen-coder": { "tag": "@qwen-coder", "model": "mistralai/Mistral-Small-24B-Instruct-2501", "provider": "together", "calls": 3188, "in_tok": 2723761, "out_tok": 912639, "cost_usd": 2.9091 }, "@qwen-general": { "tag": "@qwen-general", "model": "Qwen/Qwen2.5-7B-Instruct-Turbo", "provider": "together", "calls": 3215, "in_tok": 2910742, "out_tok": 450239, "cost_usd": 1.0083 }, "@kimi": { "tag": "@kimi", "model": "google/gemma-3n-E4B-it", "provider": "together", "calls": 3194, "in_tok": 2831854, "out_tok": 767376, "cost_usd": 1.4397 }, "_total": { "calls": 9597, "cost_usd": 5.3571 } }, "peak_vram_gb": "83.2", "gpu": "NVIDIA A100-SXM4-80GB", "version": "v5", "stopped_early": true, "stop_reason": "REWARD PLATEAU: No improvement for 40 batches. Best R\u0304=9.041 at batch 8." }