LabHorizon-Model / trainer_state.json
black-yt's picture
Add LabHorizon Qwen LoRA adapter
d4ad2d7
{
"best_global_step": 700,
"best_metric": 0.44259119033813477,
"best_model_checkpoint": "/data/taoyong/LabOS/QWEN-36/checkpoints/qwen3.6-35b-a3b-lora-lf/checkpoint-700",
"epoch": 10.0,
"eval_steps": 100,
"global_step": 2500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04,
"grad_norm": 1.3030611276626587,
"learning_rate": 3.6e-06,
"loss": 1.1145790100097657,
"step": 10
},
{
"epoch": 0.08,
"grad_norm": 1.540786623954773,
"learning_rate": 7.6e-06,
"loss": 1.2167404174804688,
"step": 20
},
{
"epoch": 0.12,
"grad_norm": 1.0591915845870972,
"learning_rate": 1.16e-05,
"loss": 1.0437713623046876,
"step": 30
},
{
"epoch": 0.16,
"grad_norm": 0.6695119142532349,
"learning_rate": 1.56e-05,
"loss": 0.9282869338989258,
"step": 40
},
{
"epoch": 0.2,
"grad_norm": 0.7912387847900391,
"learning_rate": 1.9600000000000002e-05,
"loss": 0.8799624443054199,
"step": 50
},
{
"epoch": 0.24,
"grad_norm": 0.7810359001159668,
"learning_rate": 2.36e-05,
"loss": 0.7062759399414062,
"step": 60
},
{
"epoch": 0.28,
"grad_norm": 0.7185921669006348,
"learning_rate": 2.7600000000000003e-05,
"loss": 0.7228042602539062,
"step": 70
},
{
"epoch": 0.32,
"grad_norm": 0.7974339723587036,
"learning_rate": 3.16e-05,
"loss": 0.6257906913757324,
"step": 80
},
{
"epoch": 0.36,
"grad_norm": 0.7850703597068787,
"learning_rate": 3.56e-05,
"loss": 0.5399329185485839,
"step": 90
},
{
"epoch": 0.4,
"grad_norm": 0.7295215129852295,
"learning_rate": 3.960000000000001e-05,
"loss": 0.5184461116790772,
"step": 100
},
{
"epoch": 0.4,
"eval_loss": 0.5476460456848145,
"eval_runtime": 21.5181,
"eval_samples_per_second": 18.589,
"eval_steps_per_second": 3.114,
"step": 100
},
{
"epoch": 0.44,
"grad_norm": 1.0682953596115112,
"learning_rate": 4.36e-05,
"loss": 0.5210700988769531,
"step": 110
},
{
"epoch": 0.48,
"grad_norm": 0.9108087420463562,
"learning_rate": 4.76e-05,
"loss": 0.5155693531036377,
"step": 120
},
{
"epoch": 0.52,
"grad_norm": 1.0037930011749268,
"learning_rate": 5.16e-05,
"loss": 0.45534143447875974,
"step": 130
},
{
"epoch": 0.56,
"grad_norm": 0.9430785179138184,
"learning_rate": 5.560000000000001e-05,
"loss": 0.45524797439575193,
"step": 140
},
{
"epoch": 0.6,
"grad_norm": 0.9689427614212036,
"learning_rate": 5.96e-05,
"loss": 0.47152209281921387,
"step": 150
},
{
"epoch": 0.64,
"grad_norm": 0.7584393620491028,
"learning_rate": 6.36e-05,
"loss": 0.4532940864562988,
"step": 160
},
{
"epoch": 0.68,
"grad_norm": 0.7581620216369629,
"learning_rate": 6.76e-05,
"loss": 0.48988704681396483,
"step": 170
},
{
"epoch": 0.72,
"grad_norm": 0.9882776141166687,
"learning_rate": 7.16e-05,
"loss": 0.46865572929382326,
"step": 180
},
{
"epoch": 0.76,
"grad_norm": 0.743236780166626,
"learning_rate": 7.560000000000001e-05,
"loss": 0.45577139854431153,
"step": 190
},
{
"epoch": 0.8,
"grad_norm": 0.6103836894035339,
"learning_rate": 7.960000000000001e-05,
"loss": 0.4559042453765869,
"step": 200
},
{
"epoch": 0.8,
"eval_loss": 0.485470175743103,
"eval_runtime": 17.4199,
"eval_samples_per_second": 22.962,
"eval_steps_per_second": 3.846,
"step": 200
},
{
"epoch": 0.84,
"grad_norm": 0.8245580792427063,
"learning_rate": 8.36e-05,
"loss": 0.45926451683044434,
"step": 210
},
{
"epoch": 0.88,
"grad_norm": 0.6920369267463684,
"learning_rate": 8.76e-05,
"loss": 0.4545453548431396,
"step": 220
},
{
"epoch": 0.92,
"grad_norm": 0.6936920881271362,
"learning_rate": 9.16e-05,
"loss": 0.47637343406677246,
"step": 230
},
{
"epoch": 0.96,
"grad_norm": 0.6694210767745972,
"learning_rate": 9.56e-05,
"loss": 0.43120541572570803,
"step": 240
},
{
"epoch": 1.0,
"grad_norm": 0.583095133304596,
"learning_rate": 9.960000000000001e-05,
"loss": 0.4153712272644043,
"step": 250
},
{
"epoch": 1.04,
"grad_norm": 0.6926116943359375,
"learning_rate": 9.999605221019081e-05,
"loss": 0.44300012588500975,
"step": 260
},
{
"epoch": 1.08,
"grad_norm": 0.761324405670166,
"learning_rate": 9.998240632972073e-05,
"loss": 0.462084436416626,
"step": 270
},
{
"epoch": 1.12,
"grad_norm": 0.5191273093223572,
"learning_rate": 9.995901628010196e-05,
"loss": 0.39808471202850343,
"step": 280
},
{
"epoch": 1.16,
"grad_norm": 0.8463711738586426,
"learning_rate": 9.9925886621271e-05,
"loss": 0.423044490814209,
"step": 290
},
{
"epoch": 1.2,
"grad_norm": 0.8373249769210815,
"learning_rate": 9.98830238119205e-05,
"loss": 0.41622562408447267,
"step": 300
},
{
"epoch": 1.2,
"eval_loss": 0.4695434272289276,
"eval_runtime": 19.2419,
"eval_samples_per_second": 20.788,
"eval_steps_per_second": 3.482,
"step": 300
},
{
"epoch": 1.24,
"grad_norm": 0.6290304064750671,
"learning_rate": 9.983043620824005e-05,
"loss": 0.4166346549987793,
"step": 310
},
{
"epoch": 1.28,
"grad_norm": 0.6189863681793213,
"learning_rate": 9.97681340622872e-05,
"loss": 0.43734130859375,
"step": 320
},
{
"epoch": 1.32,
"grad_norm": 0.5579029321670532,
"learning_rate": 9.969612951998874e-05,
"loss": 0.3747305631637573,
"step": 330
},
{
"epoch": 1.3599999999999999,
"grad_norm": 1.1675549745559692,
"learning_rate": 9.961443661877289e-05,
"loss": 0.42578792572021484,
"step": 340
},
{
"epoch": 1.4,
"grad_norm": 0.6578675508499146,
"learning_rate": 9.952307128483256e-05,
"loss": 0.39537777900695803,
"step": 350
},
{
"epoch": 1.44,
"grad_norm": 0.8092941045761108,
"learning_rate": 9.942205133002068e-05,
"loss": 0.4084367275238037,
"step": 360
},
{
"epoch": 1.48,
"grad_norm": 0.6226063370704651,
"learning_rate": 9.931139644837754e-05,
"loss": 0.3781426906585693,
"step": 370
},
{
"epoch": 1.52,
"grad_norm": 0.7148721218109131,
"learning_rate": 9.919112821229163e-05,
"loss": 0.3952002048492432,
"step": 380
},
{
"epoch": 1.56,
"grad_norm": 0.5743547081947327,
"learning_rate": 9.906127006829384e-05,
"loss": 0.4087832927703857,
"step": 390
},
{
"epoch": 1.6,
"grad_norm": 0.6315461993217468,
"learning_rate": 9.892184733248666e-05,
"loss": 0.3861570119857788,
"step": 400
},
{
"epoch": 1.6,
"eval_loss": 0.45406103134155273,
"eval_runtime": 19.7154,
"eval_samples_per_second": 20.289,
"eval_steps_per_second": 3.398,
"step": 400
},
{
"epoch": 1.6400000000000001,
"grad_norm": 0.6243694424629211,
"learning_rate": 9.877288718560866e-05,
"loss": 0.39033331871032717,
"step": 410
},
{
"epoch": 1.6800000000000002,
"grad_norm": 0.6677294969558716,
"learning_rate": 9.861441866773564e-05,
"loss": 0.43663845062255857,
"step": 420
},
{
"epoch": 1.72,
"grad_norm": 0.6460554599761963,
"learning_rate": 9.844647267261916e-05,
"loss": 0.43364706039428713,
"step": 430
},
{
"epoch": 1.76,
"grad_norm": 0.570160984992981,
"learning_rate": 9.82690819416637e-05,
"loss": 0.409498929977417,
"step": 440
},
{
"epoch": 1.8,
"grad_norm": 0.5696760416030884,
"learning_rate": 9.808228105754376e-05,
"loss": 0.4264820098876953,
"step": 450
},
{
"epoch": 1.8399999999999999,
"grad_norm": 0.583260715007782,
"learning_rate": 9.788610643746184e-05,
"loss": 0.417040491104126,
"step": 460
},
{
"epoch": 1.88,
"grad_norm": 0.6025984287261963,
"learning_rate": 9.76805963260488e-05,
"loss": 0.3749807357788086,
"step": 470
},
{
"epoch": 1.92,
"grad_norm": 0.5953373312950134,
"learning_rate": 9.746579078790807e-05,
"loss": 0.4022481918334961,
"step": 480
},
{
"epoch": 1.96,
"grad_norm": 0.4357820153236389,
"learning_rate": 9.724173169980491e-05,
"loss": 0.38319835662841795,
"step": 490
},
{
"epoch": 2.0,
"grad_norm": 0.5152677297592163,
"learning_rate": 9.700846274250251e-05,
"loss": 0.4122174263000488,
"step": 500
},
{
"epoch": 2.0,
"eval_loss": 0.44415727257728577,
"eval_runtime": 18.9015,
"eval_samples_per_second": 21.162,
"eval_steps_per_second": 3.545,
"step": 500
},
{
"epoch": 2.04,
"grad_norm": 0.38848409056663513,
"learning_rate": 9.676602939224629e-05,
"loss": 0.3524669408798218,
"step": 510
},
{
"epoch": 2.08,
"grad_norm": 0.5285012125968933,
"learning_rate": 9.651447891189825e-05,
"loss": 0.3717231273651123,
"step": 520
},
{
"epoch": 2.12,
"grad_norm": 0.6452465653419495,
"learning_rate": 9.62538603417229e-05,
"loss": 0.40065832138061525,
"step": 530
},
{
"epoch": 2.16,
"grad_norm": 0.48196467757225037,
"learning_rate": 9.598422448982696e-05,
"loss": 0.33635973930358887,
"step": 540
},
{
"epoch": 2.2,
"grad_norm": 0.563376247882843,
"learning_rate": 9.570562392225396e-05,
"loss": 0.3708656787872314,
"step": 550
},
{
"epoch": 2.24,
"grad_norm": 0.6459429860115051,
"learning_rate": 9.541811295273656e-05,
"loss": 0.35284056663513186,
"step": 560
},
{
"epoch": 2.2800000000000002,
"grad_norm": 0.5247339606285095,
"learning_rate": 9.512174763210797e-05,
"loss": 0.3429510831832886,
"step": 570
},
{
"epoch": 2.32,
"grad_norm": 0.5456256866455078,
"learning_rate": 9.481658573737465e-05,
"loss": 0.36770102977752683,
"step": 580
},
{
"epoch": 2.36,
"grad_norm": 0.5435087084770203,
"learning_rate": 9.450268676045262e-05,
"loss": 0.3684037208557129,
"step": 590
},
{
"epoch": 2.4,
"grad_norm": 0.5584478974342346,
"learning_rate": 9.418011189656941e-05,
"loss": 0.3221792697906494,
"step": 600
},
{
"epoch": 2.4,
"eval_loss": 0.44748273491859436,
"eval_runtime": 18.8521,
"eval_samples_per_second": 21.218,
"eval_steps_per_second": 3.554,
"step": 600
},
{
"epoch": 2.44,
"grad_norm": 0.7217129468917847,
"learning_rate": 9.384892403233384e-05,
"loss": 0.40174164772033694,
"step": 610
},
{
"epoch": 2.48,
"grad_norm": 0.5068971514701843,
"learning_rate": 9.35091877334763e-05,
"loss": 0.3701002836227417,
"step": 620
},
{
"epoch": 2.52,
"grad_norm": 0.4331487715244293,
"learning_rate": 9.316096923226135e-05,
"loss": 0.3759175777435303,
"step": 630
},
{
"epoch": 2.56,
"grad_norm": 0.5161293148994446,
"learning_rate": 9.28043364145758e-05,
"loss": 0.3581662178039551,
"step": 640
},
{
"epoch": 2.6,
"grad_norm": 0.709299623966217,
"learning_rate": 9.24393588066941e-05,
"loss": 0.35065665245056155,
"step": 650
},
{
"epoch": 2.64,
"grad_norm": 0.6004891991615295,
"learning_rate": 9.206610756172402e-05,
"loss": 0.36879355907440187,
"step": 660
},
{
"epoch": 2.68,
"grad_norm": 0.4662474989891052,
"learning_rate": 9.168465544573536e-05,
"loss": 0.3592060565948486,
"step": 670
},
{
"epoch": 2.7199999999999998,
"grad_norm": 0.5826489329338074,
"learning_rate": 9.129507682357394e-05,
"loss": 0.36156315803527833,
"step": 680
},
{
"epoch": 2.76,
"grad_norm": 0.48988744616508484,
"learning_rate": 9.089744764436403e-05,
"loss": 0.34445748329162595,
"step": 690
},
{
"epoch": 2.8,
"grad_norm": 0.4443361163139343,
"learning_rate": 9.049184542670199e-05,
"loss": 0.3526463985443115,
"step": 700
},
{
"epoch": 2.8,
"eval_loss": 0.44259119033813477,
"eval_runtime": 16.8228,
"eval_samples_per_second": 23.777,
"eval_steps_per_second": 3.983,
"step": 700
},
{
"epoch": 2.84,
"grad_norm": 0.5471161007881165,
"learning_rate": 9.007834924354383e-05,
"loss": 0.3458081245422363,
"step": 710
},
{
"epoch": 2.88,
"grad_norm": 0.5264748930931091,
"learning_rate": 8.965703970678974e-05,
"loss": 0.3651163101196289,
"step": 720
},
{
"epoch": 2.92,
"grad_norm": 0.48987507820129395,
"learning_rate": 8.922799895156867e-05,
"loss": 0.3218229293823242,
"step": 730
},
{
"epoch": 2.96,
"grad_norm": 0.5640589594841003,
"learning_rate": 8.879131062022598e-05,
"loss": 0.3561582088470459,
"step": 740
},
{
"epoch": 3.0,
"grad_norm": 0.7934619784355164,
"learning_rate": 8.834705984601708e-05,
"loss": 0.36128854751586914,
"step": 750
},
{
"epoch": 3.04,
"grad_norm": 1.0869489908218384,
"learning_rate": 8.789533323651066e-05,
"loss": 0.31422438621521,
"step": 760
},
{
"epoch": 3.08,
"grad_norm": 0.4695897102355957,
"learning_rate": 8.74362188567043e-05,
"loss": 0.29355826377868655,
"step": 770
},
{
"epoch": 3.12,
"grad_norm": 0.5532680153846741,
"learning_rate": 8.696980621185602e-05,
"loss": 0.3185117721557617,
"step": 780
},
{
"epoch": 3.16,
"grad_norm": 0.5760806202888489,
"learning_rate": 8.649618623003508e-05,
"loss": 0.28971233367919924,
"step": 790
},
{
"epoch": 3.2,
"grad_norm": 0.5517900586128235,
"learning_rate": 8.601545124439535e-05,
"loss": 0.3055370092391968,
"step": 800
},
{
"epoch": 3.2,
"eval_loss": 0.4529191255569458,
"eval_runtime": 18.5382,
"eval_samples_per_second": 21.577,
"eval_steps_per_second": 3.614,
"step": 800
},
{
"epoch": 3.24,
"grad_norm": 0.5356678366661072,
"learning_rate": 8.552769497517482e-05,
"loss": 0.28035550117492675,
"step": 810
},
{
"epoch": 3.2800000000000002,
"grad_norm": 0.5985352993011475,
"learning_rate": 8.503301251142459e-05,
"loss": 0.3199602603912354,
"step": 820
},
{
"epoch": 3.32,
"grad_norm": 0.5187913179397583,
"learning_rate": 8.453150029247114e-05,
"loss": 0.29444499015808107,
"step": 830
},
{
"epoch": 3.36,
"grad_norm": 0.5703292489051819,
"learning_rate": 8.402325608911526e-05,
"loss": 0.30467259883880615,
"step": 840
},
{
"epoch": 3.4,
"grad_norm": 0.9323157072067261,
"learning_rate": 8.350837898457143e-05,
"loss": 0.3117033004760742,
"step": 850
},
{
"epoch": 3.44,
"grad_norm": 0.628546953201294,
"learning_rate": 8.298696935515132e-05,
"loss": 0.34261503219604494,
"step": 860
},
{
"epoch": 3.48,
"grad_norm": 0.5379561185836792,
"learning_rate": 8.245912885069531e-05,
"loss": 0.3159458637237549,
"step": 870
},
{
"epoch": 3.52,
"grad_norm": 0.6575730443000793,
"learning_rate": 8.192496037475562e-05,
"loss": 0.2982481002807617,
"step": 880
},
{
"epoch": 3.56,
"grad_norm": 0.5830497145652771,
"learning_rate": 8.138456806453503e-05,
"loss": 0.3232215404510498,
"step": 890
},
{
"epoch": 3.6,
"grad_norm": 0.5474710464477539,
"learning_rate": 8.083805727058513e-05,
"loss": 0.3305091381072998,
"step": 900
},
{
"epoch": 3.6,
"eval_loss": 0.44760578870773315,
"eval_runtime": 19.5159,
"eval_samples_per_second": 20.496,
"eval_steps_per_second": 3.433,
"step": 900
},
{
"epoch": 3.64,
"grad_norm": 0.5096336007118225,
"learning_rate": 8.028553453626808e-05,
"loss": 0.35752732753753663,
"step": 910
},
{
"epoch": 3.68,
"grad_norm": 0.5023341774940491,
"learning_rate": 7.972710757698567e-05,
"loss": 0.3292932271957397,
"step": 920
},
{
"epoch": 3.7199999999999998,
"grad_norm": 0.5277951955795288,
"learning_rate": 7.916288525918007e-05,
"loss": 0.28986682891845705,
"step": 930
},
{
"epoch": 3.76,
"grad_norm": 0.600412905216217,
"learning_rate": 7.859297757911013e-05,
"loss": 0.3027395725250244,
"step": 940
},
{
"epoch": 3.8,
"grad_norm": 0.6396210193634033,
"learning_rate": 7.801749564140724e-05,
"loss": 0.3238774061203003,
"step": 950
},
{
"epoch": 3.84,
"grad_norm": 0.628635585308075,
"learning_rate": 7.743655163741543e-05,
"loss": 0.34537086486816404,
"step": 960
},
{
"epoch": 3.88,
"grad_norm": 0.49822649359703064,
"learning_rate": 7.685025882331936e-05,
"loss": 0.3292637825012207,
"step": 970
},
{
"epoch": 3.92,
"grad_norm": 0.5356727242469788,
"learning_rate": 7.62587314980648e-05,
"loss": 0.32722015380859376,
"step": 980
},
{
"epoch": 3.96,
"grad_norm": 0.6211317777633667,
"learning_rate": 7.566208498107585e-05,
"loss": 0.29880056381225584,
"step": 990
},
{
"epoch": 4.0,
"grad_norm": 0.5336779356002808,
"learning_rate": 7.506043558977321e-05,
"loss": 0.2978524684906006,
"step": 1000
},
{
"epoch": 4.0,
"eval_loss": 0.44613513350486755,
"eval_runtime": 19.2382,
"eval_samples_per_second": 20.792,
"eval_steps_per_second": 3.483,
"step": 1000
},
{
"epoch": 4.04,
"grad_norm": 0.6681120991706848,
"learning_rate": 7.445390061689782e-05,
"loss": 0.27530927658081056,
"step": 1010
},
{
"epoch": 4.08,
"grad_norm": 0.6299528479576111,
"learning_rate": 7.38425983076444e-05,
"loss": 0.2517704486846924,
"step": 1020
},
{
"epoch": 4.12,
"grad_norm": 0.5211061239242554,
"learning_rate": 7.32266478366094e-05,
"loss": 0.28200175762176516,
"step": 1030
},
{
"epoch": 4.16,
"grad_norm": 0.5778363347053528,
"learning_rate": 7.260616928455754e-05,
"loss": 0.2569046258926392,
"step": 1040
},
{
"epoch": 4.2,
"grad_norm": 0.6715266108512878,
"learning_rate": 7.1981283615012e-05,
"loss": 0.2665576696395874,
"step": 1050
},
{
"epoch": 4.24,
"grad_norm": 0.6580007672309875,
"learning_rate": 7.135211265067216e-05,
"loss": 0.2635650634765625,
"step": 1060
},
{
"epoch": 4.28,
"grad_norm": 0.6889304518699646,
"learning_rate": 7.071877904966423e-05,
"loss": 0.26842334270477297,
"step": 1070
},
{
"epoch": 4.32,
"grad_norm": 0.5896309018135071,
"learning_rate": 7.00814062816285e-05,
"loss": 0.2633937358856201,
"step": 1080
},
{
"epoch": 4.36,
"grad_norm": 0.6062363386154175,
"learning_rate": 6.944011860364905e-05,
"loss": 0.2895397186279297,
"step": 1090
},
{
"epoch": 4.4,
"grad_norm": 0.6124110817909241,
"learning_rate": 6.879504103602935e-05,
"loss": 0.27405414581298826,
"step": 1100
},
{
"epoch": 4.4,
"eval_loss": 0.46795058250427246,
"eval_runtime": 17.2143,
"eval_samples_per_second": 23.237,
"eval_steps_per_second": 3.892,
"step": 1100
},
{
"epoch": 4.44,
"grad_norm": 0.8100364208221436,
"learning_rate": 6.814629933791931e-05,
"loss": 0.2581511974334717,
"step": 1110
},
{
"epoch": 4.48,
"grad_norm": 0.6187950372695923,
"learning_rate": 6.749401998279846e-05,
"loss": 0.2689012050628662,
"step": 1120
},
{
"epoch": 4.52,
"grad_norm": 0.6595885157585144,
"learning_rate": 6.683833013381941e-05,
"loss": 0.27230424880981446,
"step": 1130
},
{
"epoch": 4.5600000000000005,
"grad_norm": 0.6320788860321045,
"learning_rate": 6.617935761901748e-05,
"loss": 0.2903036594390869,
"step": 1140
},
{
"epoch": 4.6,
"grad_norm": 0.6367589831352234,
"learning_rate": 6.551723090639007e-05,
"loss": 0.2551115989685059,
"step": 1150
},
{
"epoch": 4.64,
"grad_norm": 0.5754795670509338,
"learning_rate": 6.485207907885175e-05,
"loss": 0.2783109188079834,
"step": 1160
},
{
"epoch": 4.68,
"grad_norm": 0.6343188881874084,
"learning_rate": 6.418403180906922e-05,
"loss": 0.29131503105163575,
"step": 1170
},
{
"epoch": 4.72,
"grad_norm": 0.6726956963539124,
"learning_rate": 6.351321933418139e-05,
"loss": 0.2730400085449219,
"step": 1180
},
{
"epoch": 4.76,
"grad_norm": 0.5498913526535034,
"learning_rate": 6.283977243040939e-05,
"loss": 0.2572148323059082,
"step": 1190
},
{
"epoch": 4.8,
"grad_norm": 0.6083167195320129,
"learning_rate": 6.216382238756146e-05,
"loss": 0.27444655895233155,
"step": 1200
},
{
"epoch": 4.8,
"eval_loss": 0.466619610786438,
"eval_runtime": 19.9505,
"eval_samples_per_second": 20.05,
"eval_steps_per_second": 3.358,
"step": 1200
},
{
"epoch": 4.84,
"grad_norm": 0.5861450433731079,
"learning_rate": 6.148550098343778e-05,
"loss": 0.27054529190063475,
"step": 1210
},
{
"epoch": 4.88,
"grad_norm": 0.7090939879417419,
"learning_rate": 6.080494045814011e-05,
"loss": 0.26785056591033934,
"step": 1220
},
{
"epoch": 4.92,
"grad_norm": 0.5825073719024658,
"learning_rate": 6.0122273488291304e-05,
"loss": 0.26335647106170657,
"step": 1230
},
{
"epoch": 4.96,
"grad_norm": 0.5506169199943542,
"learning_rate": 5.943763316116977e-05,
"loss": 0.2614041090011597,
"step": 1240
},
{
"epoch": 5.0,
"grad_norm": 0.6169804930686951,
"learning_rate": 5.875115294876381e-05,
"loss": 0.24768717288970948,
"step": 1250
},
{
"epoch": 5.04,
"grad_norm": 0.8200834393501282,
"learning_rate": 5.806296668175104e-05,
"loss": 0.21707432270050048,
"step": 1260
},
{
"epoch": 5.08,
"grad_norm": 1.5680038928985596,
"learning_rate": 5.737320852340775e-05,
"loss": 0.2139519214630127,
"step": 1270
},
{
"epoch": 5.12,
"grad_norm": 0.6845637559890747,
"learning_rate": 5.668201294345363e-05,
"loss": 0.20998594760894776,
"step": 1280
},
{
"epoch": 5.16,
"grad_norm": 0.8293268084526062,
"learning_rate": 5.598951469183649e-05,
"loss": 0.23306002616882324,
"step": 1290
},
{
"epoch": 5.2,
"grad_norm": 0.7228839993476868,
"learning_rate": 5.52958487724626e-05,
"loss": 0.2262401580810547,
"step": 1300
},
{
"epoch": 5.2,
"eval_loss": 0.49972543120384216,
"eval_runtime": 18.926,
"eval_samples_per_second": 21.135,
"eval_steps_per_second": 3.54,
"step": 1300
},
{
"epoch": 5.24,
"grad_norm": 0.6243706345558167,
"learning_rate": 5.4601150416877367e-05,
"loss": 0.21100988388061523,
"step": 1310
},
{
"epoch": 5.28,
"grad_norm": 1.0553343296051025,
"learning_rate": 5.390555505790168e-05,
"loss": 0.23542592525482178,
"step": 1320
},
{
"epoch": 5.32,
"grad_norm": 0.6127402186393738,
"learning_rate": 5.3209198303229027e-05,
"loss": 0.2095633029937744,
"step": 1330
},
{
"epoch": 5.36,
"grad_norm": 0.7463288903236389,
"learning_rate": 5.2512215908988484e-05,
"loss": 0.21693904399871827,
"step": 1340
},
{
"epoch": 5.4,
"grad_norm": 0.8020226955413818,
"learning_rate": 5.1814743753278795e-05,
"loss": 0.2076347827911377,
"step": 1350
},
{
"epoch": 5.44,
"grad_norm": 0.6652446389198303,
"learning_rate": 5.111691780967869e-05,
"loss": 0.22539749145507812,
"step": 1360
},
{
"epoch": 5.48,
"grad_norm": 0.6378898620605469,
"learning_rate": 5.041887412073854e-05,
"loss": 0.2077547550201416,
"step": 1370
},
{
"epoch": 5.52,
"grad_norm": 0.7381134033203125,
"learning_rate": 4.97207487714586e-05,
"loss": 0.21558783054351807,
"step": 1380
},
{
"epoch": 5.5600000000000005,
"grad_norm": 0.6613102555274963,
"learning_rate": 4.9022677862758945e-05,
"loss": 0.21069679260253907,
"step": 1390
},
{
"epoch": 5.6,
"grad_norm": 0.7527480721473694,
"learning_rate": 4.832479748494643e-05,
"loss": 0.21843309402465821,
"step": 1400
},
{
"epoch": 5.6,
"eval_loss": 0.49576279520988464,
"eval_runtime": 18.3368,
"eval_samples_per_second": 21.814,
"eval_steps_per_second": 3.654,
"step": 1400
},
{
"epoch": 5.64,
"grad_norm": 0.5983570218086243,
"learning_rate": 4.7627243691183453e-05,
"loss": 0.22310276031494142,
"step": 1410
},
{
"epoch": 5.68,
"grad_norm": 0.6202098727226257,
"learning_rate": 4.693015247096423e-05,
"loss": 0.22056117057800292,
"step": 1420
},
{
"epoch": 5.72,
"grad_norm": 0.7730934023857117,
"learning_rate": 4.623365972360337e-05,
"loss": 0.2241537094116211,
"step": 1430
},
{
"epoch": 5.76,
"grad_norm": 0.6262892484664917,
"learning_rate": 4.553790123174197e-05,
"loss": 0.21514451503753662,
"step": 1440
},
{
"epoch": 5.8,
"grad_norm": 0.646507203578949,
"learning_rate": 4.484301263487665e-05,
"loss": 0.21031346321105956,
"step": 1450
},
{
"epoch": 5.84,
"grad_norm": 0.8227706551551819,
"learning_rate": 4.414912940291613e-05,
"loss": 0.2312474489212036,
"step": 1460
},
{
"epoch": 5.88,
"grad_norm": 0.6932390332221985,
"learning_rate": 4.345638680977139e-05,
"loss": 0.22380952835083007,
"step": 1470
},
{
"epoch": 5.92,
"grad_norm": 0.7352316379547119,
"learning_rate": 4.276491990698355e-05,
"loss": 0.22706894874572753,
"step": 1480
},
{
"epoch": 5.96,
"grad_norm": 0.6953718066215515,
"learning_rate": 4.2074863497395377e-05,
"loss": 0.2103546142578125,
"step": 1490
},
{
"epoch": 6.0,
"grad_norm": 0.661618709564209,
"learning_rate": 4.1386352108871174e-05,
"loss": 0.2276217222213745,
"step": 1500
},
{
"epoch": 6.0,
"eval_loss": 0.4966464042663574,
"eval_runtime": 17.2948,
"eval_samples_per_second": 23.128,
"eval_steps_per_second": 3.874,
"step": 1500
},
{
"epoch": 6.04,
"grad_norm": 0.8837434649467468,
"learning_rate": 4.069951996807034e-05,
"loss": 0.16540236473083497,
"step": 1510
},
{
"epoch": 6.08,
"grad_norm": 1.3857215642929077,
"learning_rate": 4.001450097427966e-05,
"loss": 0.1638352394104004,
"step": 1520
},
{
"epoch": 6.12,
"grad_norm": 0.8306711912155151,
"learning_rate": 3.9331428673309204e-05,
"loss": 0.1719011664390564,
"step": 1530
},
{
"epoch": 6.16,
"grad_norm": 0.8509021997451782,
"learning_rate": 3.865043623145751e-05,
"loss": 0.1651092290878296,
"step": 1540
},
{
"epoch": 6.2,
"grad_norm": 0.7507994174957275,
"learning_rate": 3.797165640955041e-05,
"loss": 0.1746900796890259,
"step": 1550
},
{
"epoch": 6.24,
"grad_norm": 0.740626335144043,
"learning_rate": 3.729522153705916e-05,
"loss": 0.16637682914733887,
"step": 1560
},
{
"epoch": 6.28,
"grad_norm": 0.6479809880256653,
"learning_rate": 3.662126348630237e-05,
"loss": 0.1709848165512085,
"step": 1570
},
{
"epoch": 6.32,
"grad_norm": 0.6932395100593567,
"learning_rate": 3.594991364673745e-05,
"loss": 0.18107957839965821,
"step": 1580
},
{
"epoch": 6.36,
"grad_norm": 0.8027141690254211,
"learning_rate": 3.528130289934583e-05,
"loss": 0.16225044727325438,
"step": 1590
},
{
"epoch": 6.4,
"grad_norm": 0.5781376957893372,
"learning_rate": 3.461556159111748e-05,
"loss": 0.17544152736663818,
"step": 1600
},
{
"epoch": 6.4,
"eval_loss": 0.5342507362365723,
"eval_runtime": 19.471,
"eval_samples_per_second": 20.543,
"eval_steps_per_second": 3.441,
"step": 1600
},
{
"epoch": 6.44,
"grad_norm": 0.7642867565155029,
"learning_rate": 3.3952819509639534e-05,
"loss": 0.17091144323349,
"step": 1610
},
{
"epoch": 6.48,
"grad_norm": 0.7651257514953613,
"learning_rate": 3.329320585779393e-05,
"loss": 0.17765278816223146,
"step": 1620
},
{
"epoch": 6.52,
"grad_norm": 0.6956056356430054,
"learning_rate": 3.263684922856905e-05,
"loss": 0.16475566625595092,
"step": 1630
},
{
"epoch": 6.5600000000000005,
"grad_norm": 0.7344402074813843,
"learning_rate": 3.1983877579990274e-05,
"loss": 0.172060227394104,
"step": 1640
},
{
"epoch": 6.6,
"grad_norm": 0.7196578979492188,
"learning_rate": 3.1334418210174263e-05,
"loss": 0.16673840284347535,
"step": 1650
},
{
"epoch": 6.64,
"grad_norm": 0.7540257573127747,
"learning_rate": 3.0688597732512e-05,
"loss": 0.17414634227752684,
"step": 1660
},
{
"epoch": 6.68,
"grad_norm": 0.5103999972343445,
"learning_rate": 3.0046542050985237e-05,
"loss": 0.1620783567428589,
"step": 1670
},
{
"epoch": 6.72,
"grad_norm": 0.8846920132637024,
"learning_rate": 2.940837633562127e-05,
"loss": 0.17428462505340575,
"step": 1680
},
{
"epoch": 6.76,
"grad_norm": 0.8017328381538391,
"learning_rate": 2.877422499809072e-05,
"loss": 0.19050977230072022,
"step": 1690
},
{
"epoch": 6.8,
"grad_norm": 0.8515416383743286,
"learning_rate": 2.8144211667453368e-05,
"loss": 0.16926174163818358,
"step": 1700
},
{
"epoch": 6.8,
"eval_loss": 0.5441356301307678,
"eval_runtime": 17.5836,
"eval_samples_per_second": 22.749,
"eval_steps_per_second": 3.81,
"step": 1700
},
{
"epoch": 6.84,
"grad_norm": 0.7547643184661865,
"learning_rate": 2.75184591660563e-05,
"loss": 0.1793771743774414,
"step": 1710
},
{
"epoch": 6.88,
"grad_norm": 0.7164461016654968,
"learning_rate": 2.6897089485589583e-05,
"loss": 0.1647491931915283,
"step": 1720
},
{
"epoch": 6.92,
"grad_norm": 1.1592035293579102,
"learning_rate": 2.6280223763303546e-05,
"loss": 0.17397019863128663,
"step": 1730
},
{
"epoch": 6.96,
"grad_norm": 0.9889470934867859,
"learning_rate": 2.5667982258393014e-05,
"loss": 0.17107686996459961,
"step": 1740
},
{
"epoch": 7.0,
"grad_norm": 0.7448652982711792,
"learning_rate": 2.506048432855247e-05,
"loss": 0.1730511426925659,
"step": 1750
},
{
"epoch": 7.04,
"grad_norm": 0.6695497632026672,
"learning_rate": 2.4457848406707013e-05,
"loss": 0.13950222730636597,
"step": 1760
},
{
"epoch": 7.08,
"grad_norm": 0.7200675010681152,
"learning_rate": 2.3860191977923672e-05,
"loss": 0.1326605796813965,
"step": 1770
},
{
"epoch": 7.12,
"grad_norm": 0.6615055799484253,
"learning_rate": 2.326763155650744e-05,
"loss": 0.1265331983566284,
"step": 1780
},
{
"epoch": 7.16,
"grad_norm": 0.8998573422431946,
"learning_rate": 2.2680282663286552e-05,
"loss": 0.12731509208679198,
"step": 1790
},
{
"epoch": 7.2,
"grad_norm": 0.808588981628418,
"learning_rate": 2.209825980309151e-05,
"loss": 0.13114826679229735,
"step": 1800
},
{
"epoch": 7.2,
"eval_loss": 0.5847110748291016,
"eval_runtime": 18.9921,
"eval_samples_per_second": 21.061,
"eval_steps_per_second": 3.528,
"step": 1800
},
{
"epoch": 7.24,
"grad_norm": 0.951817512512207,
"learning_rate": 2.152167644243213e-05,
"loss": 0.12906957864761354,
"step": 1810
},
{
"epoch": 7.28,
"grad_norm": 0.8695458173751831,
"learning_rate": 2.095064498737701e-05,
"loss": 0.133590030670166,
"step": 1820
},
{
"epoch": 7.32,
"grad_norm": 0.7357354760169983,
"learning_rate": 2.0385276761639765e-05,
"loss": 0.13653848171234131,
"step": 1830
},
{
"epoch": 7.36,
"grad_norm": 0.7873698472976685,
"learning_rate": 1.9825681984876172e-05,
"loss": 0.12472724914550781,
"step": 1840
},
{
"epoch": 7.4,
"grad_norm": 0.873921811580658,
"learning_rate": 1.9271969751196776e-05,
"loss": 0.13255125284194946,
"step": 1850
},
{
"epoch": 7.44,
"grad_norm": 0.7591536045074463,
"learning_rate": 1.8724248007898647e-05,
"loss": 0.13693161010742189,
"step": 1860
},
{
"epoch": 7.48,
"grad_norm": 1.0509488582611084,
"learning_rate": 1.8182623534420907e-05,
"loss": 0.13425672054290771,
"step": 1870
},
{
"epoch": 7.52,
"grad_norm": 0.8472399711608887,
"learning_rate": 1.76472019215278e-05,
"loss": 0.13668575286865234,
"step": 1880
},
{
"epoch": 7.5600000000000005,
"grad_norm": 0.911901593208313,
"learning_rate": 1.7118087550723633e-05,
"loss": 0.1317702889442444,
"step": 1890
},
{
"epoch": 7.6,
"grad_norm": 0.9731144309043884,
"learning_rate": 1.659538357390341e-05,
"loss": 0.14458621740341188,
"step": 1900
},
{
"epoch": 7.6,
"eval_loss": 0.5830516219139099,
"eval_runtime": 18.7747,
"eval_samples_per_second": 21.305,
"eval_steps_per_second": 3.569,
"step": 1900
},
{
"epoch": 7.64,
"grad_norm": 0.5515460968017578,
"learning_rate": 1.60791918932431e-05,
"loss": 0.13126691579818725,
"step": 1910
},
{
"epoch": 7.68,
"grad_norm": 0.7286776304244995,
"learning_rate": 1.556961314133359e-05,
"loss": 0.12600460052490234,
"step": 1920
},
{
"epoch": 7.72,
"grad_norm": 0.95229572057724,
"learning_rate": 1.5066746661562253e-05,
"loss": 0.12453792095184327,
"step": 1930
},
{
"epoch": 7.76,
"grad_norm": 0.7712796330451965,
"learning_rate": 1.4570690488745687e-05,
"loss": 0.14839541912078857,
"step": 1940
},
{
"epoch": 7.8,
"grad_norm": 0.8011840581893921,
"learning_rate": 1.4081541330017705e-05,
"loss": 0.1321096420288086,
"step": 1950
},
{
"epoch": 7.84,
"grad_norm": 0.936607301235199,
"learning_rate": 1.3599394545975951e-05,
"loss": 0.1317069411277771,
"step": 1960
},
{
"epoch": 7.88,
"grad_norm": 0.9034994840621948,
"learning_rate": 1.312434413209131e-05,
"loss": 0.13362932205200195,
"step": 1970
},
{
"epoch": 7.92,
"grad_norm": 0.9586318731307983,
"learning_rate": 1.2656482700383237e-05,
"loss": 0.12677763700485228,
"step": 1980
},
{
"epoch": 7.96,
"grad_norm": 0.9358674883842468,
"learning_rate": 1.219590146136485e-05,
"loss": 0.1382434129714966,
"step": 1990
},
{
"epoch": 8.0,
"grad_norm": 0.8410677313804626,
"learning_rate": 1.1742690206261292e-05,
"loss": 0.12519369125366211,
"step": 2000
},
{
"epoch": 8.0,
"eval_loss": 0.5840195417404175,
"eval_runtime": 18.625,
"eval_samples_per_second": 21.477,
"eval_steps_per_second": 3.597,
"step": 2000
},
{
"epoch": 8.04,
"grad_norm": 0.6319883465766907,
"learning_rate": 1.129693728950474e-05,
"loss": 0.10409053564071655,
"step": 2010
},
{
"epoch": 8.08,
"grad_norm": 0.7751646041870117,
"learning_rate": 1.0858729611509516e-05,
"loss": 0.10310100317001343,
"step": 2020
},
{
"epoch": 8.12,
"grad_norm": 0.9277542233467102,
"learning_rate": 1.0428152601730718e-05,
"loss": 0.09960774183273316,
"step": 2030
},
{
"epoch": 8.16,
"grad_norm": 0.8381429314613342,
"learning_rate": 1.0005290202009531e-05,
"loss": 0.09982571601867676,
"step": 2040
},
{
"epoch": 8.2,
"grad_norm": 0.7726228833198547,
"learning_rate": 9.590224850208646e-06,
"loss": 0.11322143077850341,
"step": 2050
},
{
"epoch": 8.24,
"grad_norm": 0.7724836468696594,
"learning_rate": 9.183037464140804e-06,
"loss": 0.10006082057952881,
"step": 2060
},
{
"epoch": 8.28,
"grad_norm": 1.0587371587753296,
"learning_rate": 8.783807425793721e-06,
"loss": 0.11560235023498536,
"step": 2070
},
{
"epoch": 8.32,
"grad_norm": 0.8337858319282532,
"learning_rate": 8.392612565854375e-06,
"loss": 0.10931503772735596,
"step": 2080
},
{
"epoch": 8.36,
"grad_norm": 0.805338978767395,
"learning_rate": 8.009529148535855e-06,
"loss": 0.10900030136108399,
"step": 2090
},
{
"epoch": 8.4,
"grad_norm": 0.7612441182136536,
"learning_rate": 7.63463185670939e-06,
"loss": 0.1069128155708313,
"step": 2100
},
{
"epoch": 8.4,
"eval_loss": 0.6247864961624146,
"eval_runtime": 18.281,
"eval_samples_per_second": 21.881,
"eval_steps_per_second": 3.665,
"step": 2100
},
{
"epoch": 8.44,
"grad_norm": 0.8081948757171631,
"learning_rate": 7.267993777344856e-06,
"loss": 0.09856721758842468,
"step": 2110
},
{
"epoch": 8.48,
"grad_norm": 0.7861329913139343,
"learning_rate": 6.909686387262254e-06,
"loss": 0.10609345436096192,
"step": 2120
},
{
"epoch": 8.52,
"grad_norm": 0.7145861387252808,
"learning_rate": 6.559779539197231e-06,
"loss": 0.105103600025177,
"step": 2130
},
{
"epoch": 8.56,
"grad_norm": 0.7359808683395386,
"learning_rate": 6.21834144818314e-06,
"loss": 0.10853493213653564,
"step": 2140
},
{
"epoch": 8.6,
"grad_norm": 0.8519245982170105,
"learning_rate": 5.885438678252342e-06,
"loss": 0.11464111804962158,
"step": 2150
},
{
"epoch": 8.64,
"grad_norm": 0.8307661414146423,
"learning_rate": 5.5611361294594325e-06,
"loss": 0.10765299797058106,
"step": 2160
},
{
"epoch": 8.68,
"grad_norm": 0.8340169787406921,
"learning_rate": 5.245497025228874e-06,
"loss": 0.10699164867401123,
"step": 2170
},
{
"epoch": 8.72,
"grad_norm": 0.7895165085792542,
"learning_rate": 4.938582900029437e-06,
"loss": 0.10728691816329956,
"step": 2180
},
{
"epoch": 8.76,
"grad_norm": 0.7967789769172668,
"learning_rate": 4.640453587377957e-06,
"loss": 0.11177785396575927,
"step": 2190
},
{
"epoch": 8.8,
"grad_norm": 0.8613453507423401,
"learning_rate": 4.351167208174639e-06,
"loss": 0.11041848659515381,
"step": 2200
},
{
"epoch": 8.8,
"eval_loss": 0.6235533356666565,
"eval_runtime": 19.0901,
"eval_samples_per_second": 20.953,
"eval_steps_per_second": 3.51,
"step": 2200
},
{
"epoch": 8.84,
"grad_norm": 0.6587359309196472,
"learning_rate": 4.0707801593723e-06,
"loss": 0.1085782766342163,
"step": 2210
},
{
"epoch": 8.88,
"grad_norm": 0.7126621603965759,
"learning_rate": 3.799347102981665e-06,
"loss": 0.11138873100280762,
"step": 2220
},
{
"epoch": 8.92,
"grad_norm": 0.7560760974884033,
"learning_rate": 3.536920955414885e-06,
"loss": 0.10770895481109619,
"step": 2230
},
{
"epoch": 8.96,
"grad_norm": 0.95421302318573,
"learning_rate": 3.2835528771693992e-06,
"loss": 0.11167995929718018,
"step": 2240
},
{
"epoch": 9.0,
"grad_norm": 0.9774760007858276,
"learning_rate": 3.039292262854088e-06,
"loss": 0.11738998889923095,
"step": 2250
},
{
"epoch": 9.04,
"grad_norm": 0.7680178880691528,
"learning_rate": 2.804186731559677e-06,
"loss": 0.10072145462036133,
"step": 2260
},
{
"epoch": 9.08,
"grad_norm": 0.8222008943557739,
"learning_rate": 2.5782821175753422e-06,
"loss": 0.09228388667106628,
"step": 2270
},
{
"epoch": 9.12,
"grad_norm": 0.8610215783119202,
"learning_rate": 2.361622461453178e-06,
"loss": 0.09626876711845397,
"step": 2280
},
{
"epoch": 9.16,
"grad_norm": 0.7807718515396118,
"learning_rate": 2.154250001422431e-06,
"loss": 0.0960278868675232,
"step": 2290
},
{
"epoch": 9.2,
"grad_norm": 0.8036084175109863,
"learning_rate": 1.956205165155078e-06,
"loss": 0.0941778838634491,
"step": 2300
},
{
"epoch": 9.2,
"eval_loss": 0.6419874429702759,
"eval_runtime": 19.9334,
"eval_samples_per_second": 20.067,
"eval_steps_per_second": 3.361,
"step": 2300
},
{
"epoch": 9.24,
"grad_norm": 0.7480472326278687,
"learning_rate": 1.7675265618843362e-06,
"loss": 0.09725146293640137,
"step": 2310
},
{
"epoch": 9.28,
"grad_norm": 0.8559448719024658,
"learning_rate": 1.5882509748777808e-06,
"loss": 0.09353782534599304,
"step": 2320
},
{
"epoch": 9.32,
"grad_norm": 0.6416171193122864,
"learning_rate": 1.4184133542663014e-06,
"loss": 0.09848537445068359,
"step": 2330
},
{
"epoch": 9.36,
"grad_norm": 0.7388947606086731,
"learning_rate": 1.258046810230562e-06,
"loss": 0.10164464712142944,
"step": 2340
},
{
"epoch": 9.4,
"grad_norm": 0.8187626600265503,
"learning_rate": 1.1071826065460588e-06,
"loss": 0.0934177041053772,
"step": 2350
},
{
"epoch": 9.44,
"grad_norm": 0.865635871887207,
"learning_rate": 9.65850154488218e-07,
"loss": 0.1012031078338623,
"step": 2360
},
{
"epoch": 9.48,
"grad_norm": 0.8829763531684875,
"learning_rate": 8.340770070986214e-07,
"loss": 0.09371918439865112,
"step": 2370
},
{
"epoch": 9.52,
"grad_norm": 0.7734853625297546,
"learning_rate": 7.11888853813436e-07,
"loss": 0.09450345039367676,
"step": 2380
},
{
"epoch": 9.56,
"grad_norm": 0.7692961096763611,
"learning_rate": 5.993095154552431e-07,
"loss": 0.09499152898788452,
"step": 2390
},
{
"epoch": 9.6,
"grad_norm": 1.1678398847579956,
"learning_rate": 4.963609395891299e-07,
"loss": 0.10716021060943604,
"step": 2400
},
{
"epoch": 9.6,
"eval_loss": 0.6402375102043152,
"eval_runtime": 18.9858,
"eval_samples_per_second": 21.068,
"eval_steps_per_second": 3.529,
"step": 2400
},
{
"epoch": 9.64,
"grad_norm": 0.7258604764938354,
"learning_rate": 4.030631962439302e-07,
"loss": 0.09596163630485535,
"step": 2410
},
{
"epoch": 9.68,
"grad_norm": 0.8662357330322266,
"learning_rate": 3.1943447399958027e-07,
"loss": 0.09645589590072631,
"step": 2420
},
{
"epoch": 9.72,
"grad_norm": 0.8258174061775208,
"learning_rate": 2.4549107644117885e-07,
"loss": 0.09415926933288574,
"step": 2430
},
{
"epoch": 9.76,
"grad_norm": 0.911540150642395,
"learning_rate": 1.8124741898058462e-07,
"loss": 0.10026730298995971,
"step": 2440
},
{
"epoch": 9.8,
"grad_norm": 0.8336577415466309,
"learning_rate": 1.267160260461253e-07,
"loss": 0.09711679220199584,
"step": 2450
},
{
"epoch": 9.84,
"grad_norm": 0.7324675917625427,
"learning_rate": 8.190752864088436e-08,
"loss": 0.09345818758010864,
"step": 2460
},
{
"epoch": 9.88,
"grad_norm": 0.9261553287506104,
"learning_rate": 4.683066227023081e-08,
"loss": 0.102751624584198,
"step": 2470
},
{
"epoch": 9.92,
"grad_norm": 0.9403973817825317,
"learning_rate": 2.1492265238748366e-08,
"loss": 0.0988599717617035,
"step": 2480
},
{
"epoch": 9.96,
"grad_norm": 0.7062044739723206,
"learning_rate": 5.897277317157279e-09,
"loss": 0.09828301668167114,
"step": 2490
},
{
"epoch": 10.0,
"grad_norm": 0.7819132804870605,
"learning_rate": 4.873877924582715e-11,
"loss": 0.0937616467475891,
"step": 2500
},
{
"epoch": 10.0,
"eval_loss": 0.6409608721733093,
"eval_runtime": 17.8761,
"eval_samples_per_second": 22.376,
"eval_steps_per_second": 3.748,
"step": 2500
},
{
"epoch": 10.0,
"step": 2500,
"total_flos": 3.634151342457697e+19,
"train_loss": 0.2690703985452652,
"train_runtime": 10014.7733,
"train_samples_per_second": 5.991,
"train_steps_per_second": 0.25
}
],
"logging_steps": 10,
"max_steps": 2500,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.634151342457697e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}