{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.993342210386152,
"eval_steps": 500,
"global_step": 1125,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002663115845539281,
"grad_norm": 59.669442519158444,
"learning_rate": 4.424778761061947e-07,
"loss": 11.0815,
"step": 1
},
{
"epoch": 0.005326231691078562,
"grad_norm": 59.77300379138749,
"learning_rate": 8.849557522123894e-07,
"loss": 11.0703,
"step": 2
},
{
"epoch": 0.007989347536617843,
"grad_norm": 59.37811338851668,
"learning_rate": 1.3274336283185841e-06,
"loss": 11.1149,
"step": 3
},
{
"epoch": 0.010652463382157125,
"grad_norm": 59.714257927262075,
"learning_rate": 1.7699115044247788e-06,
"loss": 11.1,
"step": 4
},
{
"epoch": 0.013315579227696404,
"grad_norm": 62.19325541849273,
"learning_rate": 2.2123893805309734e-06,
"loss": 10.9008,
"step": 5
},
{
"epoch": 0.015978695073235686,
"grad_norm": 64.3469313247898,
"learning_rate": 2.6548672566371683e-06,
"loss": 10.7897,
"step": 6
},
{
"epoch": 0.018641810918774968,
"grad_norm": 64.70693307946331,
"learning_rate": 3.097345132743363e-06,
"loss": 10.6244,
"step": 7
},
{
"epoch": 0.02130492676431425,
"grad_norm": 100.07904925734698,
"learning_rate": 3.5398230088495575e-06,
"loss": 9.3505,
"step": 8
},
{
"epoch": 0.023968042609853527,
"grad_norm": 121.42213770896274,
"learning_rate": 3.982300884955752e-06,
"loss": 8.5961,
"step": 9
},
{
"epoch": 0.02663115845539281,
"grad_norm": 64.96997432704501,
"learning_rate": 4.424778761061947e-06,
"loss": 3.5386,
"step": 10
},
{
"epoch": 0.02929427430093209,
"grad_norm": 53.5067123571589,
"learning_rate": 4.867256637168142e-06,
"loss": 3.1169,
"step": 11
},
{
"epoch": 0.03195739014647137,
"grad_norm": 34.28454533456946,
"learning_rate": 5.3097345132743365e-06,
"loss": 2.3171,
"step": 12
},
{
"epoch": 0.03462050599201065,
"grad_norm": 28.02284592011359,
"learning_rate": 5.752212389380531e-06,
"loss": 2.1704,
"step": 13
},
{
"epoch": 0.037283621837549935,
"grad_norm": 6.230233716943746,
"learning_rate": 6.194690265486726e-06,
"loss": 1.3702,
"step": 14
},
{
"epoch": 0.03994673768308921,
"grad_norm": 4.8265444090252325,
"learning_rate": 6.6371681415929215e-06,
"loss": 1.2994,
"step": 15
},
{
"epoch": 0.0426098535286285,
"grad_norm": 3.4989649353882544,
"learning_rate": 7.079646017699115e-06,
"loss": 1.1939,
"step": 16
},
{
"epoch": 0.045272969374167776,
"grad_norm": 2.548022240081304,
"learning_rate": 7.52212389380531e-06,
"loss": 1.1113,
"step": 17
},
{
"epoch": 0.047936085219707054,
"grad_norm": 1.7785073197319812,
"learning_rate": 7.964601769911505e-06,
"loss": 1.0099,
"step": 18
},
{
"epoch": 0.05059920106524634,
"grad_norm": 52.43472197468591,
"learning_rate": 8.407079646017701e-06,
"loss": 1.0002,
"step": 19
},
{
"epoch": 0.05326231691078562,
"grad_norm": 18.71256882921437,
"learning_rate": 8.849557522123894e-06,
"loss": 0.9335,
"step": 20
},
{
"epoch": 0.0559254327563249,
"grad_norm": 1.6748381666125123,
"learning_rate": 9.29203539823009e-06,
"loss": 0.8897,
"step": 21
},
{
"epoch": 0.05858854860186418,
"grad_norm": 1.2119772296620004,
"learning_rate": 9.734513274336284e-06,
"loss": 0.8728,
"step": 22
},
{
"epoch": 0.06125166444740346,
"grad_norm": 0.9292233025769583,
"learning_rate": 1.0176991150442479e-05,
"loss": 0.8443,
"step": 23
},
{
"epoch": 0.06391478029294274,
"grad_norm": 0.8058222924733704,
"learning_rate": 1.0619469026548673e-05,
"loss": 0.8065,
"step": 24
},
{
"epoch": 0.06657789613848203,
"grad_norm": 0.7676888976773729,
"learning_rate": 1.1061946902654869e-05,
"loss": 0.744,
"step": 25
},
{
"epoch": 0.0692410119840213,
"grad_norm": 1.1442962246712427,
"learning_rate": 1.1504424778761062e-05,
"loss": 0.7962,
"step": 26
},
{
"epoch": 0.07190412782956059,
"grad_norm": 0.8086732801653846,
"learning_rate": 1.1946902654867258e-05,
"loss": 0.7546,
"step": 27
},
{
"epoch": 0.07456724367509987,
"grad_norm": 0.6032687314644429,
"learning_rate": 1.2389380530973452e-05,
"loss": 0.6961,
"step": 28
},
{
"epoch": 0.07723035952063914,
"grad_norm": 0.8050008569135423,
"learning_rate": 1.2831858407079647e-05,
"loss": 0.7181,
"step": 29
},
{
"epoch": 0.07989347536617843,
"grad_norm": 0.7760170053857292,
"learning_rate": 1.3274336283185843e-05,
"loss": 0.7011,
"step": 30
},
{
"epoch": 0.08255659121171771,
"grad_norm": 0.6911853454916363,
"learning_rate": 1.3716814159292036e-05,
"loss": 0.6767,
"step": 31
},
{
"epoch": 0.085219707057257,
"grad_norm": 0.5690990372888421,
"learning_rate": 1.415929203539823e-05,
"loss": 0.6657,
"step": 32
},
{
"epoch": 0.08788282290279627,
"grad_norm": 0.46539236587043925,
"learning_rate": 1.4601769911504426e-05,
"loss": 0.6585,
"step": 33
},
{
"epoch": 0.09054593874833555,
"grad_norm": 0.6011651474231043,
"learning_rate": 1.504424778761062e-05,
"loss": 0.6571,
"step": 34
},
{
"epoch": 0.09320905459387484,
"grad_norm": 0.6055438783984222,
"learning_rate": 1.5486725663716813e-05,
"loss": 0.6307,
"step": 35
},
{
"epoch": 0.09587217043941411,
"grad_norm": 0.4930140407791457,
"learning_rate": 1.592920353982301e-05,
"loss": 0.638,
"step": 36
},
{
"epoch": 0.0985352862849534,
"grad_norm": 0.38727032176053555,
"learning_rate": 1.6371681415929206e-05,
"loss": 0.6189,
"step": 37
},
{
"epoch": 0.10119840213049268,
"grad_norm": 0.46992360907642716,
"learning_rate": 1.6814159292035402e-05,
"loss": 0.6242,
"step": 38
},
{
"epoch": 0.10386151797603196,
"grad_norm": 0.5002104790615647,
"learning_rate": 1.7256637168141594e-05,
"loss": 0.6087,
"step": 39
},
{
"epoch": 0.10652463382157124,
"grad_norm": 0.4378982855259104,
"learning_rate": 1.7699115044247787e-05,
"loss": 0.6112,
"step": 40
},
{
"epoch": 0.10918774966711052,
"grad_norm": 0.343549106950523,
"learning_rate": 1.8141592920353983e-05,
"loss": 0.6251,
"step": 41
},
{
"epoch": 0.1118508655126498,
"grad_norm": 0.43140422077824325,
"learning_rate": 1.858407079646018e-05,
"loss": 0.625,
"step": 42
},
{
"epoch": 0.11451398135818908,
"grad_norm": 0.44945895418028914,
"learning_rate": 1.9026548672566372e-05,
"loss": 0.576,
"step": 43
},
{
"epoch": 0.11717709720372836,
"grad_norm": 0.33640715838659224,
"learning_rate": 1.946902654867257e-05,
"loss": 0.602,
"step": 44
},
{
"epoch": 0.11984021304926765,
"grad_norm": 0.3602083165810118,
"learning_rate": 1.991150442477876e-05,
"loss": 0.5707,
"step": 45
},
{
"epoch": 0.12250332889480692,
"grad_norm": 1.7341245223857158,
"learning_rate": 2.0353982300884957e-05,
"loss": 0.5662,
"step": 46
},
{
"epoch": 0.12516644474034622,
"grad_norm": 0.42320706053839496,
"learning_rate": 2.079646017699115e-05,
"loss": 0.5718,
"step": 47
},
{
"epoch": 0.1278295605858855,
"grad_norm": 0.34356067841011745,
"learning_rate": 2.1238938053097346e-05,
"loss": 0.5652,
"step": 48
},
{
"epoch": 0.13049267643142476,
"grad_norm": 0.37607875054105366,
"learning_rate": 2.1681415929203542e-05,
"loss": 0.6079,
"step": 49
},
{
"epoch": 0.13315579227696406,
"grad_norm": 0.355877489349339,
"learning_rate": 2.2123893805309738e-05,
"loss": 0.5414,
"step": 50
},
{
"epoch": 0.13581890812250333,
"grad_norm": 0.3531413648567738,
"learning_rate": 2.2566371681415928e-05,
"loss": 0.5383,
"step": 51
},
{
"epoch": 0.1384820239680426,
"grad_norm": 0.3900867327584249,
"learning_rate": 2.3008849557522124e-05,
"loss": 0.5607,
"step": 52
},
{
"epoch": 0.1411451398135819,
"grad_norm": 0.29096561379999103,
"learning_rate": 2.345132743362832e-05,
"loss": 0.5428,
"step": 53
},
{
"epoch": 0.14380825565912117,
"grad_norm": 0.34882597172967983,
"learning_rate": 2.3893805309734516e-05,
"loss": 0.5597,
"step": 54
},
{
"epoch": 0.14647137150466044,
"grad_norm": 0.31745047102841745,
"learning_rate": 2.433628318584071e-05,
"loss": 0.5427,
"step": 55
},
{
"epoch": 0.14913448735019974,
"grad_norm": 0.3429464925874952,
"learning_rate": 2.4778761061946905e-05,
"loss": 0.5418,
"step": 56
},
{
"epoch": 0.151797603195739,
"grad_norm": 0.28154789184935636,
"learning_rate": 2.5221238938053098e-05,
"loss": 0.5701,
"step": 57
},
{
"epoch": 0.15446071904127828,
"grad_norm": 0.3141148216942468,
"learning_rate": 2.5663716814159294e-05,
"loss": 0.5279,
"step": 58
},
{
"epoch": 0.15712383488681758,
"grad_norm": 0.3077683025338142,
"learning_rate": 2.610619469026549e-05,
"loss": 0.5443,
"step": 59
},
{
"epoch": 0.15978695073235685,
"grad_norm": 0.35329472069062134,
"learning_rate": 2.6548672566371686e-05,
"loss": 0.5657,
"step": 60
},
{
"epoch": 0.16245006657789615,
"grad_norm": 0.30082869981695665,
"learning_rate": 2.6991150442477875e-05,
"loss": 0.5386,
"step": 61
},
{
"epoch": 0.16511318242343542,
"grad_norm": 0.3705381333041911,
"learning_rate": 2.743362831858407e-05,
"loss": 0.5417,
"step": 62
},
{
"epoch": 0.1677762982689747,
"grad_norm": 0.3424625742113855,
"learning_rate": 2.7876106194690264e-05,
"loss": 0.5334,
"step": 63
},
{
"epoch": 0.170439414114514,
"grad_norm": 0.2904098798351202,
"learning_rate": 2.831858407079646e-05,
"loss": 0.5424,
"step": 64
},
{
"epoch": 0.17310252996005326,
"grad_norm": 0.32851572085926894,
"learning_rate": 2.8761061946902656e-05,
"loss": 0.5231,
"step": 65
},
{
"epoch": 0.17576564580559254,
"grad_norm": 0.29034784648982725,
"learning_rate": 2.9203539823008852e-05,
"loss": 0.5394,
"step": 66
},
{
"epoch": 0.17842876165113183,
"grad_norm": 0.33213549417249844,
"learning_rate": 2.964601769911505e-05,
"loss": 0.54,
"step": 67
},
{
"epoch": 0.1810918774966711,
"grad_norm": 0.2751631826164567,
"learning_rate": 3.008849557522124e-05,
"loss": 0.5254,
"step": 68
},
{
"epoch": 0.18375499334221038,
"grad_norm": 0.3037009657021324,
"learning_rate": 3.0530973451327434e-05,
"loss": 0.5216,
"step": 69
},
{
"epoch": 0.18641810918774968,
"grad_norm": 0.30105360826964594,
"learning_rate": 3.097345132743363e-05,
"loss": 0.5111,
"step": 70
},
{
"epoch": 0.18908122503328895,
"grad_norm": 0.3202863693523833,
"learning_rate": 3.1415929203539826e-05,
"loss": 0.537,
"step": 71
},
{
"epoch": 0.19174434087882822,
"grad_norm": 0.3294366280935238,
"learning_rate": 3.185840707964602e-05,
"loss": 0.5215,
"step": 72
},
{
"epoch": 0.19440745672436752,
"grad_norm": 0.32228297514585236,
"learning_rate": 3.230088495575221e-05,
"loss": 0.536,
"step": 73
},
{
"epoch": 0.1970705725699068,
"grad_norm": 0.31224977631197853,
"learning_rate": 3.274336283185841e-05,
"loss": 0.5133,
"step": 74
},
{
"epoch": 0.19973368841544606,
"grad_norm": 0.34249789697496347,
"learning_rate": 3.3185840707964604e-05,
"loss": 0.5187,
"step": 75
},
{
"epoch": 0.20239680426098536,
"grad_norm": 0.3014674455677291,
"learning_rate": 3.3628318584070804e-05,
"loss": 0.5173,
"step": 76
},
{
"epoch": 0.20505992010652463,
"grad_norm": 0.31181209074311145,
"learning_rate": 3.407079646017699e-05,
"loss": 0.4938,
"step": 77
},
{
"epoch": 0.20772303595206393,
"grad_norm": 0.3421599429123891,
"learning_rate": 3.451327433628319e-05,
"loss": 0.5178,
"step": 78
},
{
"epoch": 0.2103861517976032,
"grad_norm": 0.32144698779599035,
"learning_rate": 3.495575221238938e-05,
"loss": 0.529,
"step": 79
},
{
"epoch": 0.21304926764314247,
"grad_norm": 0.30829102288383803,
"learning_rate": 3.5398230088495574e-05,
"loss": 0.5045,
"step": 80
},
{
"epoch": 0.21571238348868177,
"grad_norm": 0.3320673147021741,
"learning_rate": 3.5840707964601774e-05,
"loss": 0.5193,
"step": 81
},
{
"epoch": 0.21837549933422104,
"grad_norm": 0.3257493459194373,
"learning_rate": 3.628318584070797e-05,
"loss": 0.5161,
"step": 82
},
{
"epoch": 0.2210386151797603,
"grad_norm": 0.3451069209364067,
"learning_rate": 3.672566371681416e-05,
"loss": 0.4902,
"step": 83
},
{
"epoch": 0.2237017310252996,
"grad_norm": 0.38062902785170477,
"learning_rate": 3.716814159292036e-05,
"loss": 0.5106,
"step": 84
},
{
"epoch": 0.22636484687083888,
"grad_norm": 0.3437845837066077,
"learning_rate": 3.7610619469026545e-05,
"loss": 0.5072,
"step": 85
},
{
"epoch": 0.22902796271637815,
"grad_norm": 0.4369801740657791,
"learning_rate": 3.8053097345132744e-05,
"loss": 0.5016,
"step": 86
},
{
"epoch": 0.23169107856191745,
"grad_norm": 0.39323367167161793,
"learning_rate": 3.849557522123894e-05,
"loss": 0.5126,
"step": 87
},
{
"epoch": 0.23435419440745672,
"grad_norm": 0.3804923058106557,
"learning_rate": 3.893805309734514e-05,
"loss": 0.5169,
"step": 88
},
{
"epoch": 0.237017310252996,
"grad_norm": 0.3991475997522414,
"learning_rate": 3.938053097345133e-05,
"loss": 0.5206,
"step": 89
},
{
"epoch": 0.2396804260985353,
"grad_norm": 0.3345983998430803,
"learning_rate": 3.982300884955752e-05,
"loss": 0.5126,
"step": 90
},
{
"epoch": 0.24234354194407456,
"grad_norm": 0.37605023011424904,
"learning_rate": 4.026548672566372e-05,
"loss": 0.517,
"step": 91
},
{
"epoch": 0.24500665778961384,
"grad_norm": 0.30015095297467786,
"learning_rate": 4.0707964601769914e-05,
"loss": 0.5146,
"step": 92
},
{
"epoch": 0.24766977363515313,
"grad_norm": 0.37615535541775885,
"learning_rate": 4.115044247787611e-05,
"loss": 0.4897,
"step": 93
},
{
"epoch": 0.25033288948069243,
"grad_norm": 0.32506469165922075,
"learning_rate": 4.15929203539823e-05,
"loss": 0.5033,
"step": 94
},
{
"epoch": 0.2529960053262317,
"grad_norm": 0.3955130401533768,
"learning_rate": 4.20353982300885e-05,
"loss": 0.517,
"step": 95
},
{
"epoch": 0.255659121171771,
"grad_norm": 0.38256193351931217,
"learning_rate": 4.247787610619469e-05,
"loss": 0.4903,
"step": 96
},
{
"epoch": 0.2583222370173103,
"grad_norm": 0.3757931359073768,
"learning_rate": 4.2920353982300885e-05,
"loss": 0.4881,
"step": 97
},
{
"epoch": 0.2609853528628495,
"grad_norm": 0.4073525724085135,
"learning_rate": 4.3362831858407084e-05,
"loss": 0.4981,
"step": 98
},
{
"epoch": 0.2636484687083888,
"grad_norm": 0.42226304140119747,
"learning_rate": 4.380530973451328e-05,
"loss": 0.4777,
"step": 99
},
{
"epoch": 0.2663115845539281,
"grad_norm": 0.47546631243940135,
"learning_rate": 4.4247787610619477e-05,
"loss": 0.5012,
"step": 100
},
{
"epoch": 0.26897470039946736,
"grad_norm": 0.38067024978966585,
"learning_rate": 4.469026548672566e-05,
"loss": 0.5038,
"step": 101
},
{
"epoch": 0.27163781624500666,
"grad_norm": 0.3549335612107799,
"learning_rate": 4.5132743362831855e-05,
"loss": 0.5046,
"step": 102
},
{
"epoch": 0.27430093209054596,
"grad_norm": 0.4081532806299182,
"learning_rate": 4.5575221238938055e-05,
"loss": 0.4816,
"step": 103
},
{
"epoch": 0.2769640479360852,
"grad_norm": 0.35702973975911423,
"learning_rate": 4.601769911504425e-05,
"loss": 0.4969,
"step": 104
},
{
"epoch": 0.2796271637816245,
"grad_norm": 0.3750952303695297,
"learning_rate": 4.646017699115045e-05,
"loss": 0.5129,
"step": 105
},
{
"epoch": 0.2822902796271638,
"grad_norm": 0.3713537523929101,
"learning_rate": 4.690265486725664e-05,
"loss": 0.4871,
"step": 106
},
{
"epoch": 0.28495339547270304,
"grad_norm": 0.47534354342607993,
"learning_rate": 4.734513274336283e-05,
"loss": 0.4971,
"step": 107
},
{
"epoch": 0.28761651131824234,
"grad_norm": 0.41826478296211245,
"learning_rate": 4.778761061946903e-05,
"loss": 0.4943,
"step": 108
},
{
"epoch": 0.29027962716378164,
"grad_norm": 0.39759514237849775,
"learning_rate": 4.823008849557522e-05,
"loss": 0.5014,
"step": 109
},
{
"epoch": 0.2929427430093209,
"grad_norm": 0.4548008624547614,
"learning_rate": 4.867256637168142e-05,
"loss": 0.5067,
"step": 110
},
{
"epoch": 0.2956058588548602,
"grad_norm": 0.4618812739465874,
"learning_rate": 4.911504424778761e-05,
"loss": 0.487,
"step": 111
},
{
"epoch": 0.2982689747003995,
"grad_norm": 0.31165613667101594,
"learning_rate": 4.955752212389381e-05,
"loss": 0.4908,
"step": 112
},
{
"epoch": 0.3009320905459387,
"grad_norm": 0.45735168765249185,
"learning_rate": 5e-05,
"loss": 0.4924,
"step": 113
},
{
"epoch": 0.303595206391478,
"grad_norm": 0.4659242945372524,
"learning_rate": 4.9950592885375493e-05,
"loss": 0.49,
"step": 114
},
{
"epoch": 0.3062583222370173,
"grad_norm": 0.3422222311667708,
"learning_rate": 4.990118577075099e-05,
"loss": 0.4902,
"step": 115
},
{
"epoch": 0.30892143808255657,
"grad_norm": 0.5702864889691999,
"learning_rate": 4.985177865612648e-05,
"loss": 0.4712,
"step": 116
},
{
"epoch": 0.31158455392809586,
"grad_norm": 0.31000398399919754,
"learning_rate": 4.980237154150198e-05,
"loss": 0.4729,
"step": 117
},
{
"epoch": 0.31424766977363516,
"grad_norm": 0.5329093367544124,
"learning_rate": 4.975296442687747e-05,
"loss": 0.4979,
"step": 118
},
{
"epoch": 0.3169107856191744,
"grad_norm": 0.41581595613618844,
"learning_rate": 4.970355731225297e-05,
"loss": 0.4979,
"step": 119
},
{
"epoch": 0.3195739014647137,
"grad_norm": 0.5898871183617019,
"learning_rate": 4.965415019762846e-05,
"loss": 0.4841,
"step": 120
},
{
"epoch": 0.322237017310253,
"grad_norm": 0.5277745967026336,
"learning_rate": 4.960474308300396e-05,
"loss": 0.494,
"step": 121
},
{
"epoch": 0.3249001331557923,
"grad_norm": 0.6707049603761084,
"learning_rate": 4.955533596837945e-05,
"loss": 0.4816,
"step": 122
},
{
"epoch": 0.32756324900133155,
"grad_norm": 0.39379278723705347,
"learning_rate": 4.950592885375494e-05,
"loss": 0.4708,
"step": 123
},
{
"epoch": 0.33022636484687085,
"grad_norm": 0.5682660745624962,
"learning_rate": 4.945652173913044e-05,
"loss": 0.4844,
"step": 124
},
{
"epoch": 0.33288948069241014,
"grad_norm": 0.4164160620027728,
"learning_rate": 4.940711462450593e-05,
"loss": 0.4577,
"step": 125
},
{
"epoch": 0.3355525965379494,
"grad_norm": 0.5359420179155978,
"learning_rate": 4.9357707509881426e-05,
"loss": 0.4723,
"step": 126
},
{
"epoch": 0.3382157123834887,
"grad_norm": 0.5026386563312899,
"learning_rate": 4.930830039525692e-05,
"loss": 0.4706,
"step": 127
},
{
"epoch": 0.340878828229028,
"grad_norm": 0.5189502106027113,
"learning_rate": 4.9258893280632415e-05,
"loss": 0.4814,
"step": 128
},
{
"epoch": 0.34354194407456723,
"grad_norm": 0.46462849504368775,
"learning_rate": 4.9209486166007906e-05,
"loss": 0.4735,
"step": 129
},
{
"epoch": 0.34620505992010653,
"grad_norm": 0.5495458064144569,
"learning_rate": 4.9160079051383404e-05,
"loss": 0.4964,
"step": 130
},
{
"epoch": 0.3488681757656458,
"grad_norm": 0.4136354389486864,
"learning_rate": 4.9110671936758895e-05,
"loss": 0.4937,
"step": 131
},
{
"epoch": 0.35153129161118507,
"grad_norm": 0.49819742888588847,
"learning_rate": 4.906126482213439e-05,
"loss": 0.4929,
"step": 132
},
{
"epoch": 0.35419440745672437,
"grad_norm": 0.5211986557669676,
"learning_rate": 4.901185770750988e-05,
"loss": 0.4722,
"step": 133
},
{
"epoch": 0.35685752330226367,
"grad_norm": 0.3743611868649684,
"learning_rate": 4.896245059288538e-05,
"loss": 0.4852,
"step": 134
},
{
"epoch": 0.3595206391478029,
"grad_norm": 0.47244102498767254,
"learning_rate": 4.891304347826087e-05,
"loss": 0.4846,
"step": 135
},
{
"epoch": 0.3621837549933422,
"grad_norm": 0.39536123377896054,
"learning_rate": 4.886363636363637e-05,
"loss": 0.4812,
"step": 136
},
{
"epoch": 0.3648468708388815,
"grad_norm": 0.39389579963168014,
"learning_rate": 4.881422924901186e-05,
"loss": 0.4814,
"step": 137
},
{
"epoch": 0.36750998668442075,
"grad_norm": 0.5517767967854046,
"learning_rate": 4.876482213438736e-05,
"loss": 0.4605,
"step": 138
},
{
"epoch": 0.37017310252996005,
"grad_norm": 0.3371092349408584,
"learning_rate": 4.871541501976285e-05,
"loss": 0.4919,
"step": 139
},
{
"epoch": 0.37283621837549935,
"grad_norm": 0.5454997328166629,
"learning_rate": 4.866600790513835e-05,
"loss": 0.478,
"step": 140
},
{
"epoch": 0.3754993342210386,
"grad_norm": 0.38191662974594565,
"learning_rate": 4.861660079051384e-05,
"loss": 0.4675,
"step": 141
},
{
"epoch": 0.3781624500665779,
"grad_norm": 0.44622867680541506,
"learning_rate": 4.8567193675889336e-05,
"loss": 0.4767,
"step": 142
},
{
"epoch": 0.3808255659121172,
"grad_norm": 0.40615171610446554,
"learning_rate": 4.851778656126482e-05,
"loss": 0.4796,
"step": 143
},
{
"epoch": 0.38348868175765644,
"grad_norm": 0.4067512139515564,
"learning_rate": 4.846837944664032e-05,
"loss": 0.4921,
"step": 144
},
{
"epoch": 0.38615179760319573,
"grad_norm": 0.3764557796844728,
"learning_rate": 4.841897233201581e-05,
"loss": 0.4859,
"step": 145
},
{
"epoch": 0.38881491344873503,
"grad_norm": 0.4154794205261891,
"learning_rate": 4.836956521739131e-05,
"loss": 0.4673,
"step": 146
},
{
"epoch": 0.3914780292942743,
"grad_norm": 0.4269745611686079,
"learning_rate": 4.83201581027668e-05,
"loss": 0.4551,
"step": 147
},
{
"epoch": 0.3941411451398136,
"grad_norm": 0.38377387438781274,
"learning_rate": 4.8270750988142296e-05,
"loss": 0.487,
"step": 148
},
{
"epoch": 0.3968042609853529,
"grad_norm": 0.5603533831020405,
"learning_rate": 4.822134387351779e-05,
"loss": 0.4849,
"step": 149
},
{
"epoch": 0.3994673768308921,
"grad_norm": 0.3973953941114295,
"learning_rate": 4.8171936758893284e-05,
"loss": 0.4776,
"step": 150
},
{
"epoch": 0.4021304926764314,
"grad_norm": 0.4956339650363368,
"learning_rate": 4.8122529644268775e-05,
"loss": 0.4588,
"step": 151
},
{
"epoch": 0.4047936085219707,
"grad_norm": 0.38460346615021695,
"learning_rate": 4.807312252964427e-05,
"loss": 0.4737,
"step": 152
},
{
"epoch": 0.40745672436750996,
"grad_norm": 0.5226991882164052,
"learning_rate": 4.8023715415019764e-05,
"loss": 0.4827,
"step": 153
},
{
"epoch": 0.41011984021304926,
"grad_norm": 0.3418933085513387,
"learning_rate": 4.797430830039526e-05,
"loss": 0.4594,
"step": 154
},
{
"epoch": 0.41278295605858856,
"grad_norm": 0.41779277140490917,
"learning_rate": 4.792490118577075e-05,
"loss": 0.4738,
"step": 155
},
{
"epoch": 0.41544607190412786,
"grad_norm": 0.40524225841023903,
"learning_rate": 4.787549407114625e-05,
"loss": 0.4725,
"step": 156
},
{
"epoch": 0.4181091877496671,
"grad_norm": 0.37804713363928255,
"learning_rate": 4.782608695652174e-05,
"loss": 0.476,
"step": 157
},
{
"epoch": 0.4207723035952064,
"grad_norm": 0.32987544007452513,
"learning_rate": 4.777667984189724e-05,
"loss": 0.4606,
"step": 158
},
{
"epoch": 0.4234354194407457,
"grad_norm": 0.32638522089295396,
"learning_rate": 4.772727272727273e-05,
"loss": 0.4796,
"step": 159
},
{
"epoch": 0.42609853528628494,
"grad_norm": 0.3653611962183669,
"learning_rate": 4.767786561264823e-05,
"loss": 0.4703,
"step": 160
},
{
"epoch": 0.42876165113182424,
"grad_norm": 0.39387144328442575,
"learning_rate": 4.762845849802372e-05,
"loss": 0.4821,
"step": 161
},
{
"epoch": 0.43142476697736354,
"grad_norm": 0.473795283228247,
"learning_rate": 4.757905138339921e-05,
"loss": 0.4638,
"step": 162
},
{
"epoch": 0.4340878828229028,
"grad_norm": 0.33040966306125785,
"learning_rate": 4.75296442687747e-05,
"loss": 0.4734,
"step": 163
},
{
"epoch": 0.4367509986684421,
"grad_norm": 0.42723446550700767,
"learning_rate": 4.74802371541502e-05,
"loss": 0.4809,
"step": 164
},
{
"epoch": 0.4394141145139814,
"grad_norm": 0.3675475725903659,
"learning_rate": 4.743083003952569e-05,
"loss": 0.4586,
"step": 165
},
{
"epoch": 0.4420772303595206,
"grad_norm": 0.4219979464151687,
"learning_rate": 4.738142292490119e-05,
"loss": 0.4678,
"step": 166
},
{
"epoch": 0.4447403462050599,
"grad_norm": 0.3857740050906692,
"learning_rate": 4.733201581027668e-05,
"loss": 0.4633,
"step": 167
},
{
"epoch": 0.4474034620505992,
"grad_norm": 0.365686963876862,
"learning_rate": 4.7282608695652177e-05,
"loss": 0.4712,
"step": 168
},
{
"epoch": 0.45006657789613846,
"grad_norm": 0.43242439287350204,
"learning_rate": 4.723320158102767e-05,
"loss": 0.4751,
"step": 169
},
{
"epoch": 0.45272969374167776,
"grad_norm": 0.3908982963736634,
"learning_rate": 4.7183794466403165e-05,
"loss": 0.4723,
"step": 170
},
{
"epoch": 0.45539280958721706,
"grad_norm": 0.4693769425526856,
"learning_rate": 4.7134387351778656e-05,
"loss": 0.4511,
"step": 171
},
{
"epoch": 0.4580559254327563,
"grad_norm": 0.3437754359793867,
"learning_rate": 4.7084980237154154e-05,
"loss": 0.4634,
"step": 172
},
{
"epoch": 0.4607190412782956,
"grad_norm": 0.5270401669346302,
"learning_rate": 4.7035573122529645e-05,
"loss": 0.4621,
"step": 173
},
{
"epoch": 0.4633821571238349,
"grad_norm": 0.4696714456346351,
"learning_rate": 4.698616600790514e-05,
"loss": 0.4544,
"step": 174
},
{
"epoch": 0.46604527296937415,
"grad_norm": 0.5068508932227126,
"learning_rate": 4.6936758893280634e-05,
"loss": 0.4506,
"step": 175
},
{
"epoch": 0.46870838881491345,
"grad_norm": 0.503240500645686,
"learning_rate": 4.688735177865613e-05,
"loss": 0.4653,
"step": 176
},
{
"epoch": 0.47137150466045274,
"grad_norm": 0.4373004531246149,
"learning_rate": 4.683794466403162e-05,
"loss": 0.4711,
"step": 177
},
{
"epoch": 0.474034620505992,
"grad_norm": 0.3777218592654747,
"learning_rate": 4.678853754940712e-05,
"loss": 0.466,
"step": 178
},
{
"epoch": 0.4766977363515313,
"grad_norm": 0.5064461910000716,
"learning_rate": 4.673913043478261e-05,
"loss": 0.4516,
"step": 179
},
{
"epoch": 0.4793608521970706,
"grad_norm": 0.37515242222191797,
"learning_rate": 4.668972332015811e-05,
"loss": 0.4708,
"step": 180
},
{
"epoch": 0.48202396804260983,
"grad_norm": 0.44905049367290634,
"learning_rate": 4.66403162055336e-05,
"loss": 0.4462,
"step": 181
},
{
"epoch": 0.48468708388814913,
"grad_norm": 0.37911463481430624,
"learning_rate": 4.659090909090909e-05,
"loss": 0.4451,
"step": 182
},
{
"epoch": 0.4873501997336884,
"grad_norm": 0.3830462171805543,
"learning_rate": 4.654150197628458e-05,
"loss": 0.4682,
"step": 183
},
{
"epoch": 0.49001331557922767,
"grad_norm": 0.41200778908045926,
"learning_rate": 4.649209486166008e-05,
"loss": 0.4497,
"step": 184
},
{
"epoch": 0.49267643142476697,
"grad_norm": 0.4315187398326425,
"learning_rate": 4.644268774703557e-05,
"loss": 0.4752,
"step": 185
},
{
"epoch": 0.49533954727030627,
"grad_norm": 0.4519541174810682,
"learning_rate": 4.639328063241107e-05,
"loss": 0.4764,
"step": 186
},
{
"epoch": 0.4980026631158455,
"grad_norm": 0.4089102997614078,
"learning_rate": 4.634387351778656e-05,
"loss": 0.4663,
"step": 187
},
{
"epoch": 0.5006657789613849,
"grad_norm": 0.352791614063271,
"learning_rate": 4.629446640316206e-05,
"loss": 0.4671,
"step": 188
},
{
"epoch": 0.5033288948069241,
"grad_norm": 0.3866144187741864,
"learning_rate": 4.624505928853755e-05,
"loss": 0.4746,
"step": 189
},
{
"epoch": 0.5059920106524634,
"grad_norm": 0.4028526989391047,
"learning_rate": 4.6195652173913046e-05,
"loss": 0.4811,
"step": 190
},
{
"epoch": 0.5086551264980027,
"grad_norm": 0.4580432915919317,
"learning_rate": 4.614624505928854e-05,
"loss": 0.4678,
"step": 191
},
{
"epoch": 0.511318242343542,
"grad_norm": 0.47798645545842755,
"learning_rate": 4.6096837944664035e-05,
"loss": 0.4514,
"step": 192
},
{
"epoch": 0.5139813581890812,
"grad_norm": 0.40636636658954495,
"learning_rate": 4.6047430830039526e-05,
"loss": 0.4356,
"step": 193
},
{
"epoch": 0.5166444740346205,
"grad_norm": 0.4206946394322433,
"learning_rate": 4.5998023715415024e-05,
"loss": 0.4637,
"step": 194
},
{
"epoch": 0.5193075898801598,
"grad_norm": 0.4977083130622833,
"learning_rate": 4.5948616600790515e-05,
"loss": 0.4525,
"step": 195
},
{
"epoch": 0.521970705725699,
"grad_norm": 0.3826090231131446,
"learning_rate": 4.589920948616601e-05,
"loss": 0.4647,
"step": 196
},
{
"epoch": 0.5246338215712384,
"grad_norm": 0.443905698975846,
"learning_rate": 4.5849802371541504e-05,
"loss": 0.466,
"step": 197
},
{
"epoch": 0.5272969374167776,
"grad_norm": 0.34058976392880835,
"learning_rate": 4.5800395256917e-05,
"loss": 0.4462,
"step": 198
},
{
"epoch": 0.5299600532623169,
"grad_norm": 0.3708303032984336,
"learning_rate": 4.575098814229249e-05,
"loss": 0.4638,
"step": 199
},
{
"epoch": 0.5326231691078562,
"grad_norm": 0.4046635861089521,
"learning_rate": 4.570158102766799e-05,
"loss": 0.4702,
"step": 200
},
{
"epoch": 0.5352862849533955,
"grad_norm": 0.390485621135718,
"learning_rate": 4.565217391304348e-05,
"loss": 0.467,
"step": 201
},
{
"epoch": 0.5379494007989347,
"grad_norm": 0.36389394329456204,
"learning_rate": 4.560276679841897e-05,
"loss": 0.4676,
"step": 202
},
{
"epoch": 0.5406125166444741,
"grad_norm": 0.36415110756708385,
"learning_rate": 4.555335968379447e-05,
"loss": 0.4508,
"step": 203
},
{
"epoch": 0.5432756324900133,
"grad_norm": 0.5185630368770853,
"learning_rate": 4.550395256916996e-05,
"loss": 0.4835,
"step": 204
},
{
"epoch": 0.5459387483355526,
"grad_norm": 0.3004205195451817,
"learning_rate": 4.545454545454546e-05,
"loss": 0.4655,
"step": 205
},
{
"epoch": 0.5486018641810919,
"grad_norm": 0.40992528241944887,
"learning_rate": 4.540513833992095e-05,
"loss": 0.4516,
"step": 206
},
{
"epoch": 0.5512649800266312,
"grad_norm": 0.3462175317121373,
"learning_rate": 4.535573122529644e-05,
"loss": 0.4471,
"step": 207
},
{
"epoch": 0.5539280958721704,
"grad_norm": 0.4220985656684442,
"learning_rate": 4.530632411067194e-05,
"loss": 0.4483,
"step": 208
},
{
"epoch": 0.5565912117177098,
"grad_norm": 0.2992081906139443,
"learning_rate": 4.525691699604743e-05,
"loss": 0.4659,
"step": 209
},
{
"epoch": 0.559254327563249,
"grad_norm": 0.34958390386904065,
"learning_rate": 4.520750988142293e-05,
"loss": 0.4594,
"step": 210
},
{
"epoch": 0.5619174434087882,
"grad_norm": 0.36711080919022626,
"learning_rate": 4.515810276679842e-05,
"loss": 0.4329,
"step": 211
},
{
"epoch": 0.5645805592543276,
"grad_norm": 0.32211416124144243,
"learning_rate": 4.5108695652173916e-05,
"loss": 0.4487,
"step": 212
},
{
"epoch": 0.5672436750998668,
"grad_norm": 0.38626649006957514,
"learning_rate": 4.505928853754941e-05,
"loss": 0.4544,
"step": 213
},
{
"epoch": 0.5699067909454061,
"grad_norm": 0.4022394284778984,
"learning_rate": 4.5009881422924905e-05,
"loss": 0.4505,
"step": 214
},
{
"epoch": 0.5725699067909454,
"grad_norm": 0.3174185878452103,
"learning_rate": 4.4960474308300396e-05,
"loss": 0.4652,
"step": 215
},
{
"epoch": 0.5752330226364847,
"grad_norm": 0.3872997977647099,
"learning_rate": 4.4911067193675893e-05,
"loss": 0.4771,
"step": 216
},
{
"epoch": 0.5778961384820239,
"grad_norm": 0.2832157450180407,
"learning_rate": 4.4861660079051384e-05,
"loss": 0.4535,
"step": 217
},
{
"epoch": 0.5805592543275633,
"grad_norm": 0.3394496956003534,
"learning_rate": 4.481225296442688e-05,
"loss": 0.4401,
"step": 218
},
{
"epoch": 0.5832223701731025,
"grad_norm": 0.29084562762850125,
"learning_rate": 4.476284584980237e-05,
"loss": 0.445,
"step": 219
},
{
"epoch": 0.5858854860186418,
"grad_norm": 0.30783953367051076,
"learning_rate": 4.471343873517787e-05,
"loss": 0.437,
"step": 220
},
{
"epoch": 0.5885486018641811,
"grad_norm": 0.3183591003829617,
"learning_rate": 4.466403162055336e-05,
"loss": 0.4549,
"step": 221
},
{
"epoch": 0.5912117177097204,
"grad_norm": 0.30102542208170724,
"learning_rate": 4.461462450592885e-05,
"loss": 0.4455,
"step": 222
},
{
"epoch": 0.5938748335552596,
"grad_norm": 0.36209246659651434,
"learning_rate": 4.456521739130435e-05,
"loss": 0.4401,
"step": 223
},
{
"epoch": 0.596537949400799,
"grad_norm": 0.3264752372953629,
"learning_rate": 4.451581027667984e-05,
"loss": 0.4379,
"step": 224
},
{
"epoch": 0.5992010652463382,
"grad_norm": 0.38508783562543825,
"learning_rate": 4.446640316205534e-05,
"loss": 0.4617,
"step": 225
},
{
"epoch": 0.6018641810918774,
"grad_norm": 0.3397449828204806,
"learning_rate": 4.441699604743083e-05,
"loss": 0.4516,
"step": 226
},
{
"epoch": 0.6045272969374168,
"grad_norm": 0.3587152523608094,
"learning_rate": 4.436758893280633e-05,
"loss": 0.4627,
"step": 227
},
{
"epoch": 0.607190412782956,
"grad_norm": 0.3533298903513862,
"learning_rate": 4.431818181818182e-05,
"loss": 0.4539,
"step": 228
},
{
"epoch": 0.6098535286284953,
"grad_norm": 0.4031621223527615,
"learning_rate": 4.426877470355732e-05,
"loss": 0.4475,
"step": 229
},
{
"epoch": 0.6125166444740346,
"grad_norm": 0.31598897434214096,
"learning_rate": 4.421936758893281e-05,
"loss": 0.4594,
"step": 230
},
{
"epoch": 0.6151797603195739,
"grad_norm": 0.39490506767356415,
"learning_rate": 4.4169960474308306e-05,
"loss": 0.4481,
"step": 231
},
{
"epoch": 0.6178428761651131,
"grad_norm": 0.34551286464789904,
"learning_rate": 4.41205533596838e-05,
"loss": 0.4417,
"step": 232
},
{
"epoch": 0.6205059920106525,
"grad_norm": 0.3471665108105545,
"learning_rate": 4.4071146245059295e-05,
"loss": 0.444,
"step": 233
},
{
"epoch": 0.6231691078561917,
"grad_norm": 0.3236727871934815,
"learning_rate": 4.4021739130434786e-05,
"loss": 0.4465,
"step": 234
},
{
"epoch": 0.625832223701731,
"grad_norm": 0.3951638876292987,
"learning_rate": 4.397233201581028e-05,
"loss": 0.4476,
"step": 235
},
{
"epoch": 0.6284953395472703,
"grad_norm": 0.3186324774552031,
"learning_rate": 4.3922924901185774e-05,
"loss": 0.4359,
"step": 236
},
{
"epoch": 0.6311584553928096,
"grad_norm": 0.3446758582788272,
"learning_rate": 4.387351778656127e-05,
"loss": 0.4425,
"step": 237
},
{
"epoch": 0.6338215712383488,
"grad_norm": 0.3712178318421026,
"learning_rate": 4.382411067193676e-05,
"loss": 0.4479,
"step": 238
},
{
"epoch": 0.6364846870838882,
"grad_norm": 0.2869593917948936,
"learning_rate": 4.377470355731226e-05,
"loss": 0.4487,
"step": 239
},
{
"epoch": 0.6391478029294274,
"grad_norm": 0.35621809137402505,
"learning_rate": 4.3725296442687745e-05,
"loss": 0.459,
"step": 240
},
{
"epoch": 0.6418109187749668,
"grad_norm": 0.3219598029099912,
"learning_rate": 4.367588932806324e-05,
"loss": 0.4486,
"step": 241
},
{
"epoch": 0.644474034620506,
"grad_norm": 0.345671883817814,
"learning_rate": 4.3626482213438734e-05,
"loss": 0.4494,
"step": 242
},
{
"epoch": 0.6471371504660453,
"grad_norm": 0.3326228424406132,
"learning_rate": 4.357707509881423e-05,
"loss": 0.467,
"step": 243
},
{
"epoch": 0.6498002663115846,
"grad_norm": 0.42093399894851624,
"learning_rate": 4.352766798418972e-05,
"loss": 0.4361,
"step": 244
},
{
"epoch": 0.6524633821571239,
"grad_norm": 0.4162222276319394,
"learning_rate": 4.347826086956522e-05,
"loss": 0.4606,
"step": 245
},
{
"epoch": 0.6551264980026631,
"grad_norm": 0.36750359997980137,
"learning_rate": 4.342885375494071e-05,
"loss": 0.4429,
"step": 246
},
{
"epoch": 0.6577896138482024,
"grad_norm": 0.5483612794064252,
"learning_rate": 4.337944664031621e-05,
"loss": 0.4533,
"step": 247
},
{
"epoch": 0.6604527296937417,
"grad_norm": 0.3506444877775761,
"learning_rate": 4.33300395256917e-05,
"loss": 0.4469,
"step": 248
},
{
"epoch": 0.6631158455392809,
"grad_norm": 0.49614493451666597,
"learning_rate": 4.32806324110672e-05,
"loss": 0.4511,
"step": 249
},
{
"epoch": 0.6657789613848203,
"grad_norm": 0.38209500350480796,
"learning_rate": 4.323122529644269e-05,
"loss": 0.4556,
"step": 250
},
{
"epoch": 0.6684420772303595,
"grad_norm": 0.3909575859613948,
"learning_rate": 4.318181818181819e-05,
"loss": 0.4573,
"step": 251
},
{
"epoch": 0.6711051930758988,
"grad_norm": 0.41081105341671875,
"learning_rate": 4.313241106719368e-05,
"loss": 0.4319,
"step": 252
},
{
"epoch": 0.6737683089214381,
"grad_norm": 0.3263282193938601,
"learning_rate": 4.3083003952569175e-05,
"loss": 0.4477,
"step": 253
},
{
"epoch": 0.6764314247669774,
"grad_norm": 0.30906206450856727,
"learning_rate": 4.3033596837944666e-05,
"loss": 0.449,
"step": 254
},
{
"epoch": 0.6790945406125166,
"grad_norm": 0.4519613203178409,
"learning_rate": 4.2984189723320164e-05,
"loss": 0.4411,
"step": 255
},
{
"epoch": 0.681757656458056,
"grad_norm": 0.4018486844337667,
"learning_rate": 4.2934782608695655e-05,
"loss": 0.4402,
"step": 256
},
{
"epoch": 0.6844207723035952,
"grad_norm": 0.41908409625079107,
"learning_rate": 4.288537549407115e-05,
"loss": 0.4531,
"step": 257
},
{
"epoch": 0.6870838881491345,
"grad_norm": 0.34694110159483726,
"learning_rate": 4.2835968379446644e-05,
"loss": 0.4533,
"step": 258
},
{
"epoch": 0.6897470039946738,
"grad_norm": 0.4051995527756752,
"learning_rate": 4.2786561264822135e-05,
"loss": 0.4533,
"step": 259
},
{
"epoch": 0.6924101198402131,
"grad_norm": 0.3557731708549695,
"learning_rate": 4.2737154150197626e-05,
"loss": 0.4665,
"step": 260
},
{
"epoch": 0.6950732356857523,
"grad_norm": 0.387832077012766,
"learning_rate": 4.2687747035573124e-05,
"loss": 0.4407,
"step": 261
},
{
"epoch": 0.6977363515312917,
"grad_norm": 0.38082367574409703,
"learning_rate": 4.2638339920948615e-05,
"loss": 0.453,
"step": 262
},
{
"epoch": 0.7003994673768309,
"grad_norm": 0.33683683724829466,
"learning_rate": 4.258893280632411e-05,
"loss": 0.4635,
"step": 263
},
{
"epoch": 0.7030625832223701,
"grad_norm": 0.4169335496839881,
"learning_rate": 4.2539525691699603e-05,
"loss": 0.4563,
"step": 264
},
{
"epoch": 0.7057256990679095,
"grad_norm": 0.3214835965167982,
"learning_rate": 4.24901185770751e-05,
"loss": 0.4542,
"step": 265
},
{
"epoch": 0.7083888149134487,
"grad_norm": 0.3530582715253166,
"learning_rate": 4.244071146245059e-05,
"loss": 0.4331,
"step": 266
},
{
"epoch": 0.711051930758988,
"grad_norm": 0.36340494740289614,
"learning_rate": 4.239130434782609e-05,
"loss": 0.4394,
"step": 267
},
{
"epoch": 0.7137150466045273,
"grad_norm": 0.3874861034018051,
"learning_rate": 4.234189723320158e-05,
"loss": 0.4297,
"step": 268
},
{
"epoch": 0.7163781624500666,
"grad_norm": 0.387734289004501,
"learning_rate": 4.229249011857708e-05,
"loss": 0.4518,
"step": 269
},
{
"epoch": 0.7190412782956058,
"grad_norm": 0.3011771126496286,
"learning_rate": 4.224308300395257e-05,
"loss": 0.4369,
"step": 270
},
{
"epoch": 0.7217043941411452,
"grad_norm": 0.41746724783245387,
"learning_rate": 4.219367588932807e-05,
"loss": 0.4509,
"step": 271
},
{
"epoch": 0.7243675099866844,
"grad_norm": 0.3395798145391856,
"learning_rate": 4.214426877470356e-05,
"loss": 0.4643,
"step": 272
},
{
"epoch": 0.7270306258322237,
"grad_norm": 0.4118033460496559,
"learning_rate": 4.2094861660079056e-05,
"loss": 0.4238,
"step": 273
},
{
"epoch": 0.729693741677763,
"grad_norm": 0.2988995865914867,
"learning_rate": 4.204545454545455e-05,
"loss": 0.4414,
"step": 274
},
{
"epoch": 0.7323568575233023,
"grad_norm": 0.4755302873686915,
"learning_rate": 4.1996047430830045e-05,
"loss": 0.4408,
"step": 275
},
{
"epoch": 0.7350199733688415,
"grad_norm": 0.3321861192448237,
"learning_rate": 4.1946640316205536e-05,
"loss": 0.4471,
"step": 276
},
{
"epoch": 0.7376830892143809,
"grad_norm": 0.45541818319145366,
"learning_rate": 4.1897233201581034e-05,
"loss": 0.4473,
"step": 277
},
{
"epoch": 0.7403462050599201,
"grad_norm": 0.37099566890533026,
"learning_rate": 4.1847826086956525e-05,
"loss": 0.4495,
"step": 278
},
{
"epoch": 0.7430093209054593,
"grad_norm": 0.4035270770785246,
"learning_rate": 4.1798418972332016e-05,
"loss": 0.4513,
"step": 279
},
{
"epoch": 0.7456724367509987,
"grad_norm": 0.3441312582159767,
"learning_rate": 4.174901185770751e-05,
"loss": 0.4358,
"step": 280
},
{
"epoch": 0.748335552596538,
"grad_norm": 0.44606462407083225,
"learning_rate": 4.1699604743083005e-05,
"loss": 0.4441,
"step": 281
},
{
"epoch": 0.7509986684420772,
"grad_norm": 0.41551217890891706,
"learning_rate": 4.1650197628458496e-05,
"loss": 0.4389,
"step": 282
},
{
"epoch": 0.7536617842876165,
"grad_norm": 0.3972988958201408,
"learning_rate": 4.160079051383399e-05,
"loss": 0.4375,
"step": 283
},
{
"epoch": 0.7563249001331558,
"grad_norm": 0.47085225893645843,
"learning_rate": 4.1551383399209484e-05,
"loss": 0.4567,
"step": 284
},
{
"epoch": 0.758988015978695,
"grad_norm": 0.34543261673414827,
"learning_rate": 4.150197628458498e-05,
"loss": 0.4459,
"step": 285
},
{
"epoch": 0.7616511318242344,
"grad_norm": 0.43195994812681116,
"learning_rate": 4.145256916996047e-05,
"loss": 0.4589,
"step": 286
},
{
"epoch": 0.7643142476697736,
"grad_norm": 0.3459436864735825,
"learning_rate": 4.140316205533597e-05,
"loss": 0.4599,
"step": 287
},
{
"epoch": 0.7669773635153129,
"grad_norm": 0.36207300529867464,
"learning_rate": 4.135375494071146e-05,
"loss": 0.4303,
"step": 288
},
{
"epoch": 0.7696404793608522,
"grad_norm": 0.41345784501066335,
"learning_rate": 4.130434782608696e-05,
"loss": 0.4271,
"step": 289
},
{
"epoch": 0.7723035952063915,
"grad_norm": 0.3159838632384483,
"learning_rate": 4.125494071146245e-05,
"loss": 0.4559,
"step": 290
},
{
"epoch": 0.7749667110519307,
"grad_norm": 0.3812699162571922,
"learning_rate": 4.120553359683795e-05,
"loss": 0.4342,
"step": 291
},
{
"epoch": 0.7776298268974701,
"grad_norm": 0.37911131885498967,
"learning_rate": 4.115612648221344e-05,
"loss": 0.4362,
"step": 292
},
{
"epoch": 0.7802929427430093,
"grad_norm": 0.29763254355588903,
"learning_rate": 4.110671936758894e-05,
"loss": 0.438,
"step": 293
},
{
"epoch": 0.7829560585885486,
"grad_norm": 0.42619217859831243,
"learning_rate": 4.105731225296443e-05,
"loss": 0.4359,
"step": 294
},
{
"epoch": 0.7856191744340879,
"grad_norm": 0.3300550679665931,
"learning_rate": 4.1007905138339926e-05,
"loss": 0.43,
"step": 295
},
{
"epoch": 0.7882822902796272,
"grad_norm": 0.36668560763021596,
"learning_rate": 4.095849802371542e-05,
"loss": 0.4307,
"step": 296
},
{
"epoch": 0.7909454061251664,
"grad_norm": 0.4285864023060217,
"learning_rate": 4.0909090909090915e-05,
"loss": 0.44,
"step": 297
},
{
"epoch": 0.7936085219707057,
"grad_norm": 0.40308733058892654,
"learning_rate": 4.0859683794466406e-05,
"loss": 0.4438,
"step": 298
},
{
"epoch": 0.796271637816245,
"grad_norm": 0.48251508562888784,
"learning_rate": 4.08102766798419e-05,
"loss": 0.465,
"step": 299
},
{
"epoch": 0.7989347536617842,
"grad_norm": 0.3630289677972406,
"learning_rate": 4.076086956521739e-05,
"loss": 0.4472,
"step": 300
},
{
"epoch": 0.8015978695073236,
"grad_norm": 0.39496674097555107,
"learning_rate": 4.0711462450592886e-05,
"loss": 0.4391,
"step": 301
},
{
"epoch": 0.8042609853528628,
"grad_norm": 0.3844393845604204,
"learning_rate": 4.0662055335968377e-05,
"loss": 0.4594,
"step": 302
},
{
"epoch": 0.8069241011984021,
"grad_norm": 0.41185922961873794,
"learning_rate": 4.0612648221343874e-05,
"loss": 0.4302,
"step": 303
},
{
"epoch": 0.8095872170439414,
"grad_norm": 0.3856385433600225,
"learning_rate": 4.0563241106719365e-05,
"loss": 0.4436,
"step": 304
},
{
"epoch": 0.8122503328894807,
"grad_norm": 0.38840299488987834,
"learning_rate": 4.051383399209486e-05,
"loss": 0.4536,
"step": 305
},
{
"epoch": 0.8149134487350199,
"grad_norm": 0.3814150713404761,
"learning_rate": 4.0464426877470354e-05,
"loss": 0.4478,
"step": 306
},
{
"epoch": 0.8175765645805593,
"grad_norm": 0.3688695146114231,
"learning_rate": 4.041501976284585e-05,
"loss": 0.4371,
"step": 307
},
{
"epoch": 0.8202396804260985,
"grad_norm": 0.4525942844580142,
"learning_rate": 4.036561264822134e-05,
"loss": 0.4291,
"step": 308
},
{
"epoch": 0.8229027962716379,
"grad_norm": 0.4052871924274271,
"learning_rate": 4.031620553359684e-05,
"loss": 0.4441,
"step": 309
},
{
"epoch": 0.8255659121171771,
"grad_norm": 0.39806513754399514,
"learning_rate": 4.026679841897233e-05,
"loss": 0.4411,
"step": 310
},
{
"epoch": 0.8282290279627164,
"grad_norm": 0.3805049053303521,
"learning_rate": 4.021739130434783e-05,
"loss": 0.4366,
"step": 311
},
{
"epoch": 0.8308921438082557,
"grad_norm": 0.4001908389883243,
"learning_rate": 4.016798418972332e-05,
"loss": 0.4481,
"step": 312
},
{
"epoch": 0.833555259653795,
"grad_norm": 0.3685478975261263,
"learning_rate": 4.011857707509882e-05,
"loss": 0.4444,
"step": 313
},
{
"epoch": 0.8362183754993342,
"grad_norm": 0.3338436350006864,
"learning_rate": 4.006916996047431e-05,
"loss": 0.4479,
"step": 314
},
{
"epoch": 0.8388814913448736,
"grad_norm": 0.41429245260714803,
"learning_rate": 4.001976284584981e-05,
"loss": 0.449,
"step": 315
},
{
"epoch": 0.8415446071904128,
"grad_norm": 0.4423411865525233,
"learning_rate": 3.99703557312253e-05,
"loss": 0.4659,
"step": 316
},
{
"epoch": 0.844207723035952,
"grad_norm": 0.2957853011048819,
"learning_rate": 3.9920948616600796e-05,
"loss": 0.4251,
"step": 317
},
{
"epoch": 0.8468708388814914,
"grad_norm": 0.4030160825498704,
"learning_rate": 3.987154150197629e-05,
"loss": 0.4371,
"step": 318
},
{
"epoch": 0.8495339547270306,
"grad_norm": 0.3580572215645172,
"learning_rate": 3.982213438735178e-05,
"loss": 0.4227,
"step": 319
},
{
"epoch": 0.8521970705725699,
"grad_norm": 0.39710125591854223,
"learning_rate": 3.9772727272727275e-05,
"loss": 0.4293,
"step": 320
},
{
"epoch": 0.8548601864181092,
"grad_norm": 0.4051765562646604,
"learning_rate": 3.9723320158102766e-05,
"loss": 0.4334,
"step": 321
},
{
"epoch": 0.8575233022636485,
"grad_norm": 0.41675278060825943,
"learning_rate": 3.9673913043478264e-05,
"loss": 0.4386,
"step": 322
},
{
"epoch": 0.8601864181091877,
"grad_norm": 0.4375405045592726,
"learning_rate": 3.9624505928853755e-05,
"loss": 0.4533,
"step": 323
},
{
"epoch": 0.8628495339547271,
"grad_norm": 0.4043621563504148,
"learning_rate": 3.957509881422925e-05,
"loss": 0.4497,
"step": 324
},
{
"epoch": 0.8655126498002663,
"grad_norm": 0.37983530045601516,
"learning_rate": 3.9525691699604744e-05,
"loss": 0.4392,
"step": 325
},
{
"epoch": 0.8681757656458056,
"grad_norm": 0.4289732652538706,
"learning_rate": 3.947628458498024e-05,
"loss": 0.4401,
"step": 326
},
{
"epoch": 0.8708388814913449,
"grad_norm": 0.34033600614743714,
"learning_rate": 3.942687747035573e-05,
"loss": 0.453,
"step": 327
},
{
"epoch": 0.8735019973368842,
"grad_norm": 0.399300367168935,
"learning_rate": 3.937747035573123e-05,
"loss": 0.433,
"step": 328
},
{
"epoch": 0.8761651131824234,
"grad_norm": 0.36717092389818584,
"learning_rate": 3.932806324110672e-05,
"loss": 0.4523,
"step": 329
},
{
"epoch": 0.8788282290279628,
"grad_norm": 0.43669770511305556,
"learning_rate": 3.927865612648222e-05,
"loss": 0.437,
"step": 330
},
{
"epoch": 0.881491344873502,
"grad_norm": 0.3631294987791108,
"learning_rate": 3.922924901185771e-05,
"loss": 0.4335,
"step": 331
},
{
"epoch": 0.8841544607190412,
"grad_norm": 0.45116504976872973,
"learning_rate": 3.917984189723321e-05,
"loss": 0.4562,
"step": 332
},
{
"epoch": 0.8868175765645806,
"grad_norm": 0.3163566159546663,
"learning_rate": 3.91304347826087e-05,
"loss": 0.4286,
"step": 333
},
{
"epoch": 0.8894806924101198,
"grad_norm": 0.49699702016497876,
"learning_rate": 3.90810276679842e-05,
"loss": 0.4214,
"step": 334
},
{
"epoch": 0.8921438082556591,
"grad_norm": 0.4164898463983148,
"learning_rate": 3.903162055335969e-05,
"loss": 0.4354,
"step": 335
},
{
"epoch": 0.8948069241011984,
"grad_norm": 0.39631778611383006,
"learning_rate": 3.8982213438735186e-05,
"loss": 0.4389,
"step": 336
},
{
"epoch": 0.8974700399467377,
"grad_norm": 0.4545892509897146,
"learning_rate": 3.893280632411067e-05,
"loss": 0.4312,
"step": 337
},
{
"epoch": 0.9001331557922769,
"grad_norm": 0.41988367228289636,
"learning_rate": 3.888339920948617e-05,
"loss": 0.4433,
"step": 338
},
{
"epoch": 0.9027962716378163,
"grad_norm": 0.3123307577517813,
"learning_rate": 3.883399209486166e-05,
"loss": 0.4272,
"step": 339
},
{
"epoch": 0.9054593874833555,
"grad_norm": 0.31692127951353677,
"learning_rate": 3.8784584980237156e-05,
"loss": 0.4292,
"step": 340
},
{
"epoch": 0.9081225033288948,
"grad_norm": 0.33613245505768613,
"learning_rate": 3.873517786561265e-05,
"loss": 0.4249,
"step": 341
},
{
"epoch": 0.9107856191744341,
"grad_norm": 0.30559768683570065,
"learning_rate": 3.8685770750988145e-05,
"loss": 0.4398,
"step": 342
},
{
"epoch": 0.9134487350199734,
"grad_norm": 0.3939981911193064,
"learning_rate": 3.8636363636363636e-05,
"loss": 0.4335,
"step": 343
},
{
"epoch": 0.9161118508655126,
"grad_norm": 0.33858345690029085,
"learning_rate": 3.8586956521739134e-05,
"loss": 0.4451,
"step": 344
},
{
"epoch": 0.918774966711052,
"grad_norm": 0.3422872934004404,
"learning_rate": 3.8537549407114625e-05,
"loss": 0.4353,
"step": 345
},
{
"epoch": 0.9214380825565912,
"grad_norm": 0.3280283881293896,
"learning_rate": 3.848814229249012e-05,
"loss": 0.4336,
"step": 346
},
{
"epoch": 0.9241011984021305,
"grad_norm": 0.3212166344001671,
"learning_rate": 3.8438735177865614e-05,
"loss": 0.4436,
"step": 347
},
{
"epoch": 0.9267643142476698,
"grad_norm": 0.29779879718680563,
"learning_rate": 3.838932806324111e-05,
"loss": 0.4224,
"step": 348
},
{
"epoch": 0.929427430093209,
"grad_norm": 0.32257209602500175,
"learning_rate": 3.83399209486166e-05,
"loss": 0.4324,
"step": 349
},
{
"epoch": 0.9320905459387483,
"grad_norm": 0.3283760169277036,
"learning_rate": 3.82905138339921e-05,
"loss": 0.4312,
"step": 350
},
{
"epoch": 0.9347536617842876,
"grad_norm": 0.29560048048387905,
"learning_rate": 3.824110671936759e-05,
"loss": 0.438,
"step": 351
},
{
"epoch": 0.9374167776298269,
"grad_norm": 0.31047996971013586,
"learning_rate": 3.819169960474309e-05,
"loss": 0.436,
"step": 352
},
{
"epoch": 0.9400798934753661,
"grad_norm": 0.3203340478559344,
"learning_rate": 3.814229249011858e-05,
"loss": 0.4178,
"step": 353
},
{
"epoch": 0.9427430093209055,
"grad_norm": 0.3000799797652741,
"learning_rate": 3.809288537549408e-05,
"loss": 0.4283,
"step": 354
},
{
"epoch": 0.9454061251664447,
"grad_norm": 0.31625082964426837,
"learning_rate": 3.804347826086957e-05,
"loss": 0.4355,
"step": 355
},
{
"epoch": 0.948069241011984,
"grad_norm": 0.38688019968777704,
"learning_rate": 3.7994071146245066e-05,
"loss": 0.4561,
"step": 356
},
{
"epoch": 0.9507323568575233,
"grad_norm": 0.309916135809927,
"learning_rate": 3.794466403162055e-05,
"loss": 0.4323,
"step": 357
},
{
"epoch": 0.9533954727030626,
"grad_norm": 0.4119303884073823,
"learning_rate": 3.789525691699605e-05,
"loss": 0.4346,
"step": 358
},
{
"epoch": 0.9560585885486018,
"grad_norm": 0.36057463061333933,
"learning_rate": 3.784584980237154e-05,
"loss": 0.4521,
"step": 359
},
{
"epoch": 0.9587217043941412,
"grad_norm": 0.3385683676369823,
"learning_rate": 3.779644268774704e-05,
"loss": 0.4186,
"step": 360
},
{
"epoch": 0.9613848202396804,
"grad_norm": 0.40056553056875543,
"learning_rate": 3.774703557312253e-05,
"loss": 0.4577,
"step": 361
},
{
"epoch": 0.9640479360852197,
"grad_norm": 0.3362167210172609,
"learning_rate": 3.7697628458498026e-05,
"loss": 0.4232,
"step": 362
},
{
"epoch": 0.966711051930759,
"grad_norm": 0.39765353196088127,
"learning_rate": 3.764822134387352e-05,
"loss": 0.4441,
"step": 363
},
{
"epoch": 0.9693741677762983,
"grad_norm": 0.34508268417865146,
"learning_rate": 3.7598814229249015e-05,
"loss": 0.4339,
"step": 364
},
{
"epoch": 0.9720372836218375,
"grad_norm": 0.346158165413465,
"learning_rate": 3.7549407114624506e-05,
"loss": 0.4314,
"step": 365
},
{
"epoch": 0.9747003994673769,
"grad_norm": 0.38758138562436,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.4479,
"step": 366
},
{
"epoch": 0.9773635153129161,
"grad_norm": 0.3616955496837348,
"learning_rate": 3.7450592885375494e-05,
"loss": 0.4295,
"step": 367
},
{
"epoch": 0.9800266311584553,
"grad_norm": 0.36330419598482033,
"learning_rate": 3.740118577075099e-05,
"loss": 0.431,
"step": 368
},
{
"epoch": 0.9826897470039947,
"grad_norm": 0.38220931731215757,
"learning_rate": 3.735177865612648e-05,
"loss": 0.4411,
"step": 369
},
{
"epoch": 0.9853528628495339,
"grad_norm": 0.32482883893874537,
"learning_rate": 3.730237154150198e-05,
"loss": 0.4352,
"step": 370
},
{
"epoch": 0.9880159786950732,
"grad_norm": 0.3797976983855516,
"learning_rate": 3.725296442687747e-05,
"loss": 0.4273,
"step": 371
},
{
"epoch": 0.9906790945406125,
"grad_norm": 0.3333203576267911,
"learning_rate": 3.720355731225297e-05,
"loss": 0.4353,
"step": 372
},
{
"epoch": 0.9933422103861518,
"grad_norm": 0.3565932063789887,
"learning_rate": 3.715415019762846e-05,
"loss": 0.4312,
"step": 373
},
{
"epoch": 0.996005326231691,
"grad_norm": 0.35499721260713074,
"learning_rate": 3.710474308300396e-05,
"loss": 0.4328,
"step": 374
},
{
"epoch": 0.9986684420772304,
"grad_norm": 0.34312841144350587,
"learning_rate": 3.705533596837945e-05,
"loss": 0.4238,
"step": 375
},
{
"epoch": 1.0,
"grad_norm": 0.34312841144350587,
"learning_rate": 3.700592885375494e-05,
"loss": 0.4292,
"step": 376
},
{
"epoch": 1.0026631158455392,
"grad_norm": 0.523484923884555,
"learning_rate": 3.695652173913043e-05,
"loss": 0.3827,
"step": 377
},
{
"epoch": 1.0053262316910785,
"grad_norm": 0.44981178204276556,
"learning_rate": 3.690711462450593e-05,
"loss": 0.3497,
"step": 378
},
{
"epoch": 1.007989347536618,
"grad_norm": 0.30585009680415987,
"learning_rate": 3.685770750988142e-05,
"loss": 0.3667,
"step": 379
},
{
"epoch": 1.0106524633821572,
"grad_norm": 0.3734972975740805,
"learning_rate": 3.680830039525692e-05,
"loss": 0.365,
"step": 380
},
{
"epoch": 1.0133155792276964,
"grad_norm": 0.32549667969227175,
"learning_rate": 3.675889328063241e-05,
"loss": 0.3756,
"step": 381
},
{
"epoch": 1.0159786950732357,
"grad_norm": 0.4493130971817616,
"learning_rate": 3.670948616600791e-05,
"loss": 0.358,
"step": 382
},
{
"epoch": 1.018641810918775,
"grad_norm": 0.40705895511048784,
"learning_rate": 3.66600790513834e-05,
"loss": 0.3711,
"step": 383
},
{
"epoch": 1.0213049267643142,
"grad_norm": 0.3979472669944709,
"learning_rate": 3.6610671936758896e-05,
"loss": 0.3613,
"step": 384
},
{
"epoch": 1.0239680426098536,
"grad_norm": 0.44247177084982264,
"learning_rate": 3.656126482213439e-05,
"loss": 0.3461,
"step": 385
},
{
"epoch": 1.0266311584553929,
"grad_norm": 0.3643767210189153,
"learning_rate": 3.6511857707509884e-05,
"loss": 0.3682,
"step": 386
},
{
"epoch": 1.0292942743009321,
"grad_norm": 0.3710522218627508,
"learning_rate": 3.6462450592885375e-05,
"loss": 0.3616,
"step": 387
},
{
"epoch": 1.0319573901464714,
"grad_norm": 0.39199235847196745,
"learning_rate": 3.641304347826087e-05,
"loss": 0.3373,
"step": 388
},
{
"epoch": 1.0346205059920106,
"grad_norm": 0.3716307271666748,
"learning_rate": 3.6363636363636364e-05,
"loss": 0.3783,
"step": 389
},
{
"epoch": 1.0372836218375499,
"grad_norm": 0.39593613574016095,
"learning_rate": 3.631422924901186e-05,
"loss": 0.3605,
"step": 390
},
{
"epoch": 1.0399467376830893,
"grad_norm": 0.3741049180680241,
"learning_rate": 3.626482213438735e-05,
"loss": 0.3643,
"step": 391
},
{
"epoch": 1.0426098535286286,
"grad_norm": 0.39560887666458844,
"learning_rate": 3.621541501976285e-05,
"loss": 0.3873,
"step": 392
},
{
"epoch": 1.0452729693741678,
"grad_norm": 0.4542194912059658,
"learning_rate": 3.616600790513834e-05,
"loss": 0.3517,
"step": 393
},
{
"epoch": 1.047936085219707,
"grad_norm": 0.3376853296582342,
"learning_rate": 3.611660079051384e-05,
"loss": 0.3746,
"step": 394
},
{
"epoch": 1.0505992010652463,
"grad_norm": 0.38846148578122447,
"learning_rate": 3.606719367588933e-05,
"loss": 0.3389,
"step": 395
},
{
"epoch": 1.0532623169107855,
"grad_norm": 0.32360005393691865,
"learning_rate": 3.601778656126482e-05,
"loss": 0.3663,
"step": 396
},
{
"epoch": 1.055925432756325,
"grad_norm": 0.326112805381814,
"learning_rate": 3.596837944664031e-05,
"loss": 0.3581,
"step": 397
},
{
"epoch": 1.0585885486018642,
"grad_norm": 0.28926622056464246,
"learning_rate": 3.591897233201581e-05,
"loss": 0.358,
"step": 398
},
{
"epoch": 1.0612516644474035,
"grad_norm": 0.3055465293423247,
"learning_rate": 3.58695652173913e-05,
"loss": 0.3617,
"step": 399
},
{
"epoch": 1.0639147802929427,
"grad_norm": 0.33022021713183336,
"learning_rate": 3.58201581027668e-05,
"loss": 0.353,
"step": 400
},
{
"epoch": 1.066577896138482,
"grad_norm": 0.29024468585164404,
"learning_rate": 3.577075098814229e-05,
"loss": 0.355,
"step": 401
},
{
"epoch": 1.0692410119840212,
"grad_norm": 0.2733040275941461,
"learning_rate": 3.572134387351779e-05,
"loss": 0.3574,
"step": 402
},
{
"epoch": 1.0719041278295607,
"grad_norm": 0.3226214256196561,
"learning_rate": 3.567193675889328e-05,
"loss": 0.3528,
"step": 403
},
{
"epoch": 1.0745672436751,
"grad_norm": 0.31534151465175414,
"learning_rate": 3.5622529644268777e-05,
"loss": 0.3539,
"step": 404
},
{
"epoch": 1.0772303595206392,
"grad_norm": 0.2751061424659443,
"learning_rate": 3.557312252964427e-05,
"loss": 0.3667,
"step": 405
},
{
"epoch": 1.0798934753661784,
"grad_norm": 0.3612676719250419,
"learning_rate": 3.5523715415019765e-05,
"loss": 0.3541,
"step": 406
},
{
"epoch": 1.0825565912117177,
"grad_norm": 0.3011759295136269,
"learning_rate": 3.5474308300395256e-05,
"loss": 0.3606,
"step": 407
},
{
"epoch": 1.085219707057257,
"grad_norm": 0.3978993850172965,
"learning_rate": 3.5424901185770754e-05,
"loss": 0.3626,
"step": 408
},
{
"epoch": 1.0878828229027964,
"grad_norm": 0.2872210237523896,
"learning_rate": 3.5375494071146245e-05,
"loss": 0.3889,
"step": 409
},
{
"epoch": 1.0905459387483356,
"grad_norm": 0.443073058318771,
"learning_rate": 3.532608695652174e-05,
"loss": 0.3535,
"step": 410
},
{
"epoch": 1.0932090545938749,
"grad_norm": 0.33127012106810017,
"learning_rate": 3.5276679841897234e-05,
"loss": 0.3459,
"step": 411
},
{
"epoch": 1.095872170439414,
"grad_norm": 0.2919448905657829,
"learning_rate": 3.522727272727273e-05,
"loss": 0.365,
"step": 412
},
{
"epoch": 1.0985352862849533,
"grad_norm": 0.33466018716475304,
"learning_rate": 3.517786561264822e-05,
"loss": 0.3625,
"step": 413
},
{
"epoch": 1.1011984021304926,
"grad_norm": 0.3413607594653121,
"learning_rate": 3.512845849802372e-05,
"loss": 0.3724,
"step": 414
},
{
"epoch": 1.103861517976032,
"grad_norm": 0.35737975021729407,
"learning_rate": 3.507905138339921e-05,
"loss": 0.3774,
"step": 415
},
{
"epoch": 1.1065246338215713,
"grad_norm": 0.34162270993471044,
"learning_rate": 3.50296442687747e-05,
"loss": 0.3686,
"step": 416
},
{
"epoch": 1.1091877496671105,
"grad_norm": 0.35133143811370443,
"learning_rate": 3.49802371541502e-05,
"loss": 0.3699,
"step": 417
},
{
"epoch": 1.1118508655126498,
"grad_norm": 0.3579722853716089,
"learning_rate": 3.493083003952569e-05,
"loss": 0.3505,
"step": 418
},
{
"epoch": 1.114513981358189,
"grad_norm": 0.2618428057689255,
"learning_rate": 3.488142292490119e-05,
"loss": 0.3463,
"step": 419
},
{
"epoch": 1.1171770972037283,
"grad_norm": 0.35732356240927676,
"learning_rate": 3.483201581027668e-05,
"loss": 0.3473,
"step": 420
},
{
"epoch": 1.1198402130492677,
"grad_norm": 0.34101793627943705,
"learning_rate": 3.478260869565218e-05,
"loss": 0.3738,
"step": 421
},
{
"epoch": 1.122503328894807,
"grad_norm": 0.3005835100136546,
"learning_rate": 3.473320158102767e-05,
"loss": 0.3748,
"step": 422
},
{
"epoch": 1.1251664447403462,
"grad_norm": 0.3512554307406862,
"learning_rate": 3.4683794466403166e-05,
"loss": 0.3578,
"step": 423
},
{
"epoch": 1.1278295605858855,
"grad_norm": 0.3037958675770476,
"learning_rate": 3.463438735177866e-05,
"loss": 0.3812,
"step": 424
},
{
"epoch": 1.1304926764314247,
"grad_norm": 0.33131881019625853,
"learning_rate": 3.4584980237154155e-05,
"loss": 0.3475,
"step": 425
},
{
"epoch": 1.133155792276964,
"grad_norm": 0.2887902456682679,
"learning_rate": 3.4535573122529646e-05,
"loss": 0.3658,
"step": 426
},
{
"epoch": 1.1358189081225034,
"grad_norm": 0.3429001374635811,
"learning_rate": 3.4486166007905144e-05,
"loss": 0.37,
"step": 427
},
{
"epoch": 1.1384820239680427,
"grad_norm": 0.32345869994940707,
"learning_rate": 3.4436758893280635e-05,
"loss": 0.3325,
"step": 428
},
{
"epoch": 1.141145139813582,
"grad_norm": 0.3183193536956743,
"learning_rate": 3.438735177865613e-05,
"loss": 0.3597,
"step": 429
},
{
"epoch": 1.1438082556591211,
"grad_norm": 0.3300209265208329,
"learning_rate": 3.4337944664031624e-05,
"loss": 0.3718,
"step": 430
},
{
"epoch": 1.1464713715046604,
"grad_norm": 0.31339838507600637,
"learning_rate": 3.428853754940712e-05,
"loss": 0.3505,
"step": 431
},
{
"epoch": 1.1491344873501999,
"grad_norm": 0.30103241701187505,
"learning_rate": 3.423913043478261e-05,
"loss": 0.3515,
"step": 432
},
{
"epoch": 1.151797603195739,
"grad_norm": 0.33142077936580827,
"learning_rate": 3.418972332015811e-05,
"loss": 0.3454,
"step": 433
},
{
"epoch": 1.1544607190412783,
"grad_norm": 0.26672583595142774,
"learning_rate": 3.41403162055336e-05,
"loss": 0.3557,
"step": 434
},
{
"epoch": 1.1571238348868176,
"grad_norm": 0.29810972252935447,
"learning_rate": 3.409090909090909e-05,
"loss": 0.3627,
"step": 435
},
{
"epoch": 1.1597869507323568,
"grad_norm": 0.4004613882147666,
"learning_rate": 3.404150197628458e-05,
"loss": 0.3596,
"step": 436
},
{
"epoch": 1.162450066577896,
"grad_norm": 0.3230914038022782,
"learning_rate": 3.399209486166008e-05,
"loss": 0.3494,
"step": 437
},
{
"epoch": 1.1651131824234353,
"grad_norm": 0.26213767359417905,
"learning_rate": 3.394268774703557e-05,
"loss": 0.3686,
"step": 438
},
{
"epoch": 1.1677762982689748,
"grad_norm": 0.4095014774133373,
"learning_rate": 3.389328063241107e-05,
"loss": 0.3688,
"step": 439
},
{
"epoch": 1.170439414114514,
"grad_norm": 0.266377270998587,
"learning_rate": 3.384387351778656e-05,
"loss": 0.3648,
"step": 440
},
{
"epoch": 1.1731025299600533,
"grad_norm": 0.32985529288585497,
"learning_rate": 3.379446640316206e-05,
"loss": 0.3703,
"step": 441
},
{
"epoch": 1.1757656458055925,
"grad_norm": 0.3629424885940422,
"learning_rate": 3.374505928853755e-05,
"loss": 0.3502,
"step": 442
},
{
"epoch": 1.1784287616511318,
"grad_norm": 0.29079091604622403,
"learning_rate": 3.369565217391305e-05,
"loss": 0.3696,
"step": 443
},
{
"epoch": 1.1810918774966712,
"grad_norm": 0.36019836895937174,
"learning_rate": 3.364624505928854e-05,
"loss": 0.3507,
"step": 444
},
{
"epoch": 1.1837549933422105,
"grad_norm": 0.3710021105040673,
"learning_rate": 3.3596837944664036e-05,
"loss": 0.3458,
"step": 445
},
{
"epoch": 1.1864181091877497,
"grad_norm": 0.2814671230360335,
"learning_rate": 3.354743083003953e-05,
"loss": 0.3625,
"step": 446
},
{
"epoch": 1.189081225033289,
"grad_norm": 0.39752143956114194,
"learning_rate": 3.3498023715415025e-05,
"loss": 0.3372,
"step": 447
},
{
"epoch": 1.1917443408788282,
"grad_norm": 0.3447518628047081,
"learning_rate": 3.3448616600790516e-05,
"loss": 0.352,
"step": 448
},
{
"epoch": 1.1944074567243674,
"grad_norm": 0.23476338435026442,
"learning_rate": 3.3399209486166014e-05,
"loss": 0.3433,
"step": 449
},
{
"epoch": 1.1970705725699067,
"grad_norm": 0.41285793244761565,
"learning_rate": 3.3349802371541505e-05,
"loss": 0.3507,
"step": 450
},
{
"epoch": 1.1997336884154461,
"grad_norm": 0.2756526642604148,
"learning_rate": 3.3300395256917e-05,
"loss": 0.3679,
"step": 451
},
{
"epoch": 1.2023968042609854,
"grad_norm": 0.35361646973541144,
"learning_rate": 3.325098814229249e-05,
"loss": 0.3771,
"step": 452
},
{
"epoch": 1.2050599201065246,
"grad_norm": 0.3011012199917682,
"learning_rate": 3.320158102766799e-05,
"loss": 0.3501,
"step": 453
},
{
"epoch": 1.2077230359520639,
"grad_norm": 0.2753809532139054,
"learning_rate": 3.3152173913043475e-05,
"loss": 0.3751,
"step": 454
},
{
"epoch": 1.2103861517976031,
"grad_norm": 0.345446601586865,
"learning_rate": 3.310276679841897e-05,
"loss": 0.3675,
"step": 455
},
{
"epoch": 1.2130492676431426,
"grad_norm": 0.3105483046559569,
"learning_rate": 3.3053359683794464e-05,
"loss": 0.3473,
"step": 456
},
{
"epoch": 1.2157123834886818,
"grad_norm": 0.31097501000340777,
"learning_rate": 3.300395256916996e-05,
"loss": 0.3685,
"step": 457
},
{
"epoch": 1.218375499334221,
"grad_norm": 0.35861972517870744,
"learning_rate": 3.295454545454545e-05,
"loss": 0.3493,
"step": 458
},
{
"epoch": 1.2210386151797603,
"grad_norm": 0.2497414905559577,
"learning_rate": 3.290513833992095e-05,
"loss": 0.3596,
"step": 459
},
{
"epoch": 1.2237017310252996,
"grad_norm": 0.3260671903675003,
"learning_rate": 3.285573122529644e-05,
"loss": 0.3584,
"step": 460
},
{
"epoch": 1.2263648468708388,
"grad_norm": 0.303125715747872,
"learning_rate": 3.280632411067194e-05,
"loss": 0.3468,
"step": 461
},
{
"epoch": 1.229027962716378,
"grad_norm": 0.2894307336548194,
"learning_rate": 3.275691699604743e-05,
"loss": 0.3589,
"step": 462
},
{
"epoch": 1.2316910785619175,
"grad_norm": 0.3081296705994847,
"learning_rate": 3.270750988142293e-05,
"loss": 0.3586,
"step": 463
},
{
"epoch": 1.2343541944074568,
"grad_norm": 0.2926327290593828,
"learning_rate": 3.265810276679842e-05,
"loss": 0.3594,
"step": 464
},
{
"epoch": 1.237017310252996,
"grad_norm": 0.3050352656827861,
"learning_rate": 3.260869565217392e-05,
"loss": 0.3794,
"step": 465
},
{
"epoch": 1.2396804260985352,
"grad_norm": 0.34421850278839233,
"learning_rate": 3.255928853754941e-05,
"loss": 0.3448,
"step": 466
},
{
"epoch": 1.2423435419440745,
"grad_norm": 0.3178141996560178,
"learning_rate": 3.2509881422924906e-05,
"loss": 0.3596,
"step": 467
},
{
"epoch": 1.245006657789614,
"grad_norm": 0.36055320312739547,
"learning_rate": 3.24604743083004e-05,
"loss": 0.3374,
"step": 468
},
{
"epoch": 1.2476697736351532,
"grad_norm": 0.2584894490878346,
"learning_rate": 3.2411067193675894e-05,
"loss": 0.3381,
"step": 469
},
{
"epoch": 1.2503328894806924,
"grad_norm": 0.3556442871963007,
"learning_rate": 3.2361660079051385e-05,
"loss": 0.3757,
"step": 470
},
{
"epoch": 1.2529960053262317,
"grad_norm": 0.2936471278443274,
"learning_rate": 3.231225296442688e-05,
"loss": 0.3612,
"step": 471
},
{
"epoch": 1.255659121171771,
"grad_norm": 0.34920820452723006,
"learning_rate": 3.2262845849802374e-05,
"loss": 0.3571,
"step": 472
},
{
"epoch": 1.2583222370173104,
"grad_norm": 0.27353129045046504,
"learning_rate": 3.221343873517787e-05,
"loss": 0.366,
"step": 473
},
{
"epoch": 1.2609853528628494,
"grad_norm": 0.3336825600119343,
"learning_rate": 3.2164031620553356e-05,
"loss": 0.3682,
"step": 474
},
{
"epoch": 1.2636484687083889,
"grad_norm": 0.28422664920281926,
"learning_rate": 3.2114624505928854e-05,
"loss": 0.3574,
"step": 475
},
{
"epoch": 1.2663115845539281,
"grad_norm": 0.27995772097533356,
"learning_rate": 3.2065217391304345e-05,
"loss": 0.3577,
"step": 476
},
{
"epoch": 1.2689747003994674,
"grad_norm": 0.3073145651684054,
"learning_rate": 3.201581027667984e-05,
"loss": 0.356,
"step": 477
},
{
"epoch": 1.2716378162450066,
"grad_norm": 0.2926799912079748,
"learning_rate": 3.1966403162055334e-05,
"loss": 0.3398,
"step": 478
},
{
"epoch": 1.2743009320905458,
"grad_norm": 0.2638946062975387,
"learning_rate": 3.191699604743083e-05,
"loss": 0.3742,
"step": 479
},
{
"epoch": 1.2769640479360853,
"grad_norm": 0.3188095670364053,
"learning_rate": 3.186758893280632e-05,
"loss": 0.3564,
"step": 480
},
{
"epoch": 1.2796271637816246,
"grad_norm": 0.2620162833825017,
"learning_rate": 3.181818181818182e-05,
"loss": 0.36,
"step": 481
},
{
"epoch": 1.2822902796271638,
"grad_norm": 0.34823059030048475,
"learning_rate": 3.176877470355731e-05,
"loss": 0.3595,
"step": 482
},
{
"epoch": 1.284953395472703,
"grad_norm": 0.31553137736166625,
"learning_rate": 3.171936758893281e-05,
"loss": 0.3599,
"step": 483
},
{
"epoch": 1.2876165113182423,
"grad_norm": 0.2955708469323441,
"learning_rate": 3.16699604743083e-05,
"loss": 0.3402,
"step": 484
},
{
"epoch": 1.2902796271637818,
"grad_norm": 0.3913482669169413,
"learning_rate": 3.16205533596838e-05,
"loss": 0.3758,
"step": 485
},
{
"epoch": 1.2929427430093208,
"grad_norm": 0.35700628657251265,
"learning_rate": 3.157114624505929e-05,
"loss": 0.3581,
"step": 486
},
{
"epoch": 1.2956058588548602,
"grad_norm": 0.3014863988052369,
"learning_rate": 3.152173913043479e-05,
"loss": 0.3554,
"step": 487
},
{
"epoch": 1.2982689747003995,
"grad_norm": 0.3644987716917946,
"learning_rate": 3.147233201581028e-05,
"loss": 0.3562,
"step": 488
},
{
"epoch": 1.3009320905459387,
"grad_norm": 0.30956500239595414,
"learning_rate": 3.1422924901185775e-05,
"loss": 0.3454,
"step": 489
},
{
"epoch": 1.303595206391478,
"grad_norm": 0.4175232794253573,
"learning_rate": 3.1373517786561266e-05,
"loss": 0.3641,
"step": 490
},
{
"epoch": 1.3062583222370172,
"grad_norm": 0.28246226404029123,
"learning_rate": 3.1324110671936764e-05,
"loss": 0.3601,
"step": 491
},
{
"epoch": 1.3089214380825567,
"grad_norm": 0.3755376891190061,
"learning_rate": 3.1274703557312255e-05,
"loss": 0.3774,
"step": 492
},
{
"epoch": 1.311584553928096,
"grad_norm": 0.27298674883257873,
"learning_rate": 3.1225296442687746e-05,
"loss": 0.3627,
"step": 493
},
{
"epoch": 1.3142476697736352,
"grad_norm": 0.3706229801540267,
"learning_rate": 3.117588932806324e-05,
"loss": 0.3735,
"step": 494
},
{
"epoch": 1.3169107856191744,
"grad_norm": 0.28143910738942546,
"learning_rate": 3.1126482213438735e-05,
"loss": 0.3725,
"step": 495
},
{
"epoch": 1.3195739014647137,
"grad_norm": 0.3349025665393724,
"learning_rate": 3.1077075098814226e-05,
"loss": 0.3659,
"step": 496
},
{
"epoch": 1.3222370173102531,
"grad_norm": 0.29588987329109573,
"learning_rate": 3.1027667984189724e-05,
"loss": 0.3749,
"step": 497
},
{
"epoch": 1.3249001331557924,
"grad_norm": 0.27901948593654424,
"learning_rate": 3.0978260869565215e-05,
"loss": 0.3555,
"step": 498
},
{
"epoch": 1.3275632490013316,
"grad_norm": 0.3180943674654497,
"learning_rate": 3.092885375494071e-05,
"loss": 0.3399,
"step": 499
},
{
"epoch": 1.3302263648468708,
"grad_norm": 0.3257820898386027,
"learning_rate": 3.0879446640316203e-05,
"loss": 0.3592,
"step": 500
},
{
"epoch": 1.33288948069241,
"grad_norm": 0.29341640703427146,
"learning_rate": 3.08300395256917e-05,
"loss": 0.3602,
"step": 501
},
{
"epoch": 1.3355525965379493,
"grad_norm": 0.2975810782284494,
"learning_rate": 3.078063241106719e-05,
"loss": 0.3392,
"step": 502
},
{
"epoch": 1.3382157123834886,
"grad_norm": 0.26682712897635374,
"learning_rate": 3.073122529644269e-05,
"loss": 0.3539,
"step": 503
},
{
"epoch": 1.340878828229028,
"grad_norm": 0.29028707302441564,
"learning_rate": 3.068181818181818e-05,
"loss": 0.3511,
"step": 504
},
{
"epoch": 1.3435419440745673,
"grad_norm": 0.32760242848226895,
"learning_rate": 3.063241106719368e-05,
"loss": 0.3804,
"step": 505
},
{
"epoch": 1.3462050599201065,
"grad_norm": 0.3092786220233137,
"learning_rate": 3.058300395256917e-05,
"loss": 0.3699,
"step": 506
},
{
"epoch": 1.3488681757656458,
"grad_norm": 0.3020724813833627,
"learning_rate": 3.053359683794467e-05,
"loss": 0.3676,
"step": 507
},
{
"epoch": 1.351531291611185,
"grad_norm": 0.2824033966398368,
"learning_rate": 3.0484189723320162e-05,
"loss": 0.3729,
"step": 508
},
{
"epoch": 1.3541944074567245,
"grad_norm": 0.3618887388165828,
"learning_rate": 3.0434782608695656e-05,
"loss": 0.3554,
"step": 509
},
{
"epoch": 1.3568575233022637,
"grad_norm": 0.28130180514019887,
"learning_rate": 3.038537549407115e-05,
"loss": 0.3553,
"step": 510
},
{
"epoch": 1.359520639147803,
"grad_norm": 0.2893653104001468,
"learning_rate": 3.0335968379446645e-05,
"loss": 0.3782,
"step": 511
},
{
"epoch": 1.3621837549933422,
"grad_norm": 0.3469803538239057,
"learning_rate": 3.0286561264822133e-05,
"loss": 0.3464,
"step": 512
},
{
"epoch": 1.3648468708388815,
"grad_norm": 0.2732418490440155,
"learning_rate": 3.0237154150197627e-05,
"loss": 0.3616,
"step": 513
},
{
"epoch": 1.3675099866844207,
"grad_norm": 0.28562062527552706,
"learning_rate": 3.018774703557312e-05,
"loss": 0.3535,
"step": 514
},
{
"epoch": 1.37017310252996,
"grad_norm": 0.2658369004792245,
"learning_rate": 3.0138339920948616e-05,
"loss": 0.3725,
"step": 515
},
{
"epoch": 1.3728362183754994,
"grad_norm": 0.29358847654377684,
"learning_rate": 3.008893280632411e-05,
"loss": 0.3496,
"step": 516
},
{
"epoch": 1.3754993342210386,
"grad_norm": 0.27539943140564604,
"learning_rate": 3.0039525691699605e-05,
"loss": 0.369,
"step": 517
},
{
"epoch": 1.378162450066578,
"grad_norm": 0.300263236071914,
"learning_rate": 2.99901185770751e-05,
"loss": 0.3585,
"step": 518
},
{
"epoch": 1.3808255659121171,
"grad_norm": 0.31613231965587374,
"learning_rate": 2.9940711462450593e-05,
"loss": 0.3777,
"step": 519
},
{
"epoch": 1.3834886817576564,
"grad_norm": 0.2770700909868314,
"learning_rate": 2.9891304347826088e-05,
"loss": 0.3561,
"step": 520
},
{
"epoch": 1.3861517976031958,
"grad_norm": 0.3050401099786546,
"learning_rate": 2.9841897233201582e-05,
"loss": 0.3563,
"step": 521
},
{
"epoch": 1.388814913448735,
"grad_norm": 0.2533844111874208,
"learning_rate": 2.9792490118577076e-05,
"loss": 0.3469,
"step": 522
},
{
"epoch": 1.3914780292942743,
"grad_norm": 0.2695972396120006,
"learning_rate": 2.974308300395257e-05,
"loss": 0.3621,
"step": 523
},
{
"epoch": 1.3941411451398136,
"grad_norm": 0.28186697645815617,
"learning_rate": 2.9693675889328065e-05,
"loss": 0.3559,
"step": 524
},
{
"epoch": 1.3968042609853528,
"grad_norm": 0.26628352738719235,
"learning_rate": 2.964426877470356e-05,
"loss": 0.3646,
"step": 525
},
{
"epoch": 1.399467376830892,
"grad_norm": 0.2833122304678988,
"learning_rate": 2.9594861660079054e-05,
"loss": 0.3552,
"step": 526
},
{
"epoch": 1.4021304926764313,
"grad_norm": 0.26716813523678146,
"learning_rate": 2.954545454545455e-05,
"loss": 0.3345,
"step": 527
},
{
"epoch": 1.4047936085219708,
"grad_norm": 0.2754005215378796,
"learning_rate": 2.9496047430830043e-05,
"loss": 0.3531,
"step": 528
},
{
"epoch": 1.40745672436751,
"grad_norm": 0.3036387674463336,
"learning_rate": 2.9446640316205537e-05,
"loss": 0.3394,
"step": 529
},
{
"epoch": 1.4101198402130493,
"grad_norm": 0.28788105676480225,
"learning_rate": 2.939723320158103e-05,
"loss": 0.342,
"step": 530
},
{
"epoch": 1.4127829560585885,
"grad_norm": 0.28191999375557225,
"learning_rate": 2.9347826086956526e-05,
"loss": 0.3488,
"step": 531
},
{
"epoch": 1.4154460719041277,
"grad_norm": 0.2973599610924886,
"learning_rate": 2.9298418972332014e-05,
"loss": 0.369,
"step": 532
},
{
"epoch": 1.4181091877496672,
"grad_norm": 0.29639597168777376,
"learning_rate": 2.9249011857707508e-05,
"loss": 0.3696,
"step": 533
},
{
"epoch": 1.4207723035952065,
"grad_norm": 0.2943864772067253,
"learning_rate": 2.9199604743083002e-05,
"loss": 0.3708,
"step": 534
},
{
"epoch": 1.4234354194407457,
"grad_norm": 0.3275031870349291,
"learning_rate": 2.9150197628458497e-05,
"loss": 0.359,
"step": 535
},
{
"epoch": 1.426098535286285,
"grad_norm": 0.288973368099439,
"learning_rate": 2.910079051383399e-05,
"loss": 0.3534,
"step": 536
},
{
"epoch": 1.4287616511318242,
"grad_norm": 0.3066522465043432,
"learning_rate": 2.9051383399209485e-05,
"loss": 0.3568,
"step": 537
},
{
"epoch": 1.4314247669773636,
"grad_norm": 0.3056985012074139,
"learning_rate": 2.900197628458498e-05,
"loss": 0.3457,
"step": 538
},
{
"epoch": 1.4340878828229027,
"grad_norm": 0.2793941010759859,
"learning_rate": 2.8952569169960474e-05,
"loss": 0.3559,
"step": 539
},
{
"epoch": 1.4367509986684421,
"grad_norm": 0.2535278252678889,
"learning_rate": 2.890316205533597e-05,
"loss": 0.3528,
"step": 540
},
{
"epoch": 1.4394141145139814,
"grad_norm": 0.2842251418338047,
"learning_rate": 2.8853754940711463e-05,
"loss": 0.3522,
"step": 541
},
{
"epoch": 1.4420772303595206,
"grad_norm": 0.2778073412674222,
"learning_rate": 2.8804347826086957e-05,
"loss": 0.3603,
"step": 542
},
{
"epoch": 1.4447403462050599,
"grad_norm": 0.2554361454610928,
"learning_rate": 2.8754940711462452e-05,
"loss": 0.3635,
"step": 543
},
{
"epoch": 1.447403462050599,
"grad_norm": 0.3049003958057493,
"learning_rate": 2.8705533596837946e-05,
"loss": 0.3602,
"step": 544
},
{
"epoch": 1.4500665778961386,
"grad_norm": 0.2675057851041106,
"learning_rate": 2.865612648221344e-05,
"loss": 0.3612,
"step": 545
},
{
"epoch": 1.4527296937416778,
"grad_norm": 0.24887490119807607,
"learning_rate": 2.8606719367588935e-05,
"loss": 0.3654,
"step": 546
},
{
"epoch": 1.455392809587217,
"grad_norm": 0.3195728958038635,
"learning_rate": 2.855731225296443e-05,
"loss": 0.3513,
"step": 547
},
{
"epoch": 1.4580559254327563,
"grad_norm": 0.2546987092178984,
"learning_rate": 2.8507905138339924e-05,
"loss": 0.3398,
"step": 548
},
{
"epoch": 1.4607190412782955,
"grad_norm": 0.29773690473267483,
"learning_rate": 2.8458498023715418e-05,
"loss": 0.3694,
"step": 549
},
{
"epoch": 1.463382157123835,
"grad_norm": 0.29315481833169116,
"learning_rate": 2.8409090909090912e-05,
"loss": 0.3426,
"step": 550
},
{
"epoch": 1.466045272969374,
"grad_norm": 0.3296358712762741,
"learning_rate": 2.8359683794466403e-05,
"loss": 0.3761,
"step": 551
},
{
"epoch": 1.4687083888149135,
"grad_norm": 0.2989240945630588,
"learning_rate": 2.8310276679841894e-05,
"loss": 0.3574,
"step": 552
},
{
"epoch": 1.4713715046604527,
"grad_norm": 0.2933347023687216,
"learning_rate": 2.826086956521739e-05,
"loss": 0.3615,
"step": 553
},
{
"epoch": 1.474034620505992,
"grad_norm": 0.31885875118020457,
"learning_rate": 2.8211462450592883e-05,
"loss": 0.3645,
"step": 554
},
{
"epoch": 1.4766977363515312,
"grad_norm": 0.2777657172797497,
"learning_rate": 2.8162055335968378e-05,
"loss": 0.3531,
"step": 555
},
{
"epoch": 1.4793608521970705,
"grad_norm": 0.3318676753935055,
"learning_rate": 2.8112648221343872e-05,
"loss": 0.3668,
"step": 556
},
{
"epoch": 1.48202396804261,
"grad_norm": 0.3316376422278272,
"learning_rate": 2.8063241106719366e-05,
"loss": 0.348,
"step": 557
},
{
"epoch": 1.4846870838881492,
"grad_norm": 0.34334200086282374,
"learning_rate": 2.801383399209486e-05,
"loss": 0.3684,
"step": 558
},
{
"epoch": 1.4873501997336884,
"grad_norm": 0.2998752672686297,
"learning_rate": 2.7964426877470355e-05,
"loss": 0.343,
"step": 559
},
{
"epoch": 1.4900133155792277,
"grad_norm": 0.323718625297975,
"learning_rate": 2.791501976284585e-05,
"loss": 0.3435,
"step": 560
},
{
"epoch": 1.492676431424767,
"grad_norm": 0.3042077739086944,
"learning_rate": 2.7865612648221344e-05,
"loss": 0.357,
"step": 561
},
{
"epoch": 1.4953395472703064,
"grad_norm": 0.3132911982849499,
"learning_rate": 2.7816205533596838e-05,
"loss": 0.3481,
"step": 562
},
{
"epoch": 1.4980026631158454,
"grad_norm": 0.25389583970465485,
"learning_rate": 2.7766798418972333e-05,
"loss": 0.3567,
"step": 563
},
{
"epoch": 1.5006657789613849,
"grad_norm": 0.263337393271962,
"learning_rate": 2.7717391304347827e-05,
"loss": 0.3431,
"step": 564
},
{
"epoch": 1.503328894806924,
"grad_norm": 0.2712654205175259,
"learning_rate": 2.766798418972332e-05,
"loss": 0.3582,
"step": 565
},
{
"epoch": 1.5059920106524634,
"grad_norm": 0.2612896047069462,
"learning_rate": 2.7618577075098816e-05,
"loss": 0.3445,
"step": 566
},
{
"epoch": 1.5086551264980028,
"grad_norm": 0.27219615901029837,
"learning_rate": 2.756916996047431e-05,
"loss": 0.3652,
"step": 567
},
{
"epoch": 1.5113182423435418,
"grad_norm": 0.24840155978956244,
"learning_rate": 2.7519762845849805e-05,
"loss": 0.3421,
"step": 568
},
{
"epoch": 1.5139813581890813,
"grad_norm": 0.24176135920761713,
"learning_rate": 2.74703557312253e-05,
"loss": 0.3512,
"step": 569
},
{
"epoch": 1.5166444740346205,
"grad_norm": 0.2647051981979065,
"learning_rate": 2.7420948616600793e-05,
"loss": 0.3499,
"step": 570
},
{
"epoch": 1.5193075898801598,
"grad_norm": 0.27211007538489024,
"learning_rate": 2.7371541501976284e-05,
"loss": 0.3462,
"step": 571
},
{
"epoch": 1.521970705725699,
"grad_norm": 0.2507493740105373,
"learning_rate": 2.732213438735178e-05,
"loss": 0.3434,
"step": 572
},
{
"epoch": 1.5246338215712383,
"grad_norm": 0.2693556555763232,
"learning_rate": 2.7272727272727273e-05,
"loss": 0.3615,
"step": 573
},
{
"epoch": 1.5272969374167777,
"grad_norm": 0.274645850715254,
"learning_rate": 2.7223320158102767e-05,
"loss": 0.3445,
"step": 574
},
{
"epoch": 1.5299600532623168,
"grad_norm": 0.24351837189102682,
"learning_rate": 2.7173913043478262e-05,
"loss": 0.3686,
"step": 575
},
{
"epoch": 1.5326231691078562,
"grad_norm": 0.27710340393878174,
"learning_rate": 2.7124505928853756e-05,
"loss": 0.3547,
"step": 576
},
{
"epoch": 1.5352862849533955,
"grad_norm": 0.2806488747523977,
"learning_rate": 2.707509881422925e-05,
"loss": 0.3672,
"step": 577
},
{
"epoch": 1.5379494007989347,
"grad_norm": 0.32294972985992815,
"learning_rate": 2.7025691699604745e-05,
"loss": 0.3527,
"step": 578
},
{
"epoch": 1.5406125166444742,
"grad_norm": 0.24771959309258884,
"learning_rate": 2.697628458498024e-05,
"loss": 0.3626,
"step": 579
},
{
"epoch": 1.5432756324900132,
"grad_norm": 0.31974111618484613,
"learning_rate": 2.6926877470355734e-05,
"loss": 0.3553,
"step": 580
},
{
"epoch": 1.5459387483355527,
"grad_norm": 0.28071413168163195,
"learning_rate": 2.6877470355731228e-05,
"loss": 0.3676,
"step": 581
},
{
"epoch": 1.548601864181092,
"grad_norm": 0.2584928716043461,
"learning_rate": 2.6828063241106723e-05,
"loss": 0.3427,
"step": 582
},
{
"epoch": 1.5512649800266312,
"grad_norm": 0.2648608207536266,
"learning_rate": 2.6778656126482217e-05,
"loss": 0.3377,
"step": 583
},
{
"epoch": 1.5539280958721704,
"grad_norm": 0.2671119266891378,
"learning_rate": 2.672924901185771e-05,
"loss": 0.3559,
"step": 584
},
{
"epoch": 1.5565912117177096,
"grad_norm": 0.2840788018392293,
"learning_rate": 2.6679841897233206e-05,
"loss": 0.355,
"step": 585
},
{
"epoch": 1.559254327563249,
"grad_norm": 0.29216560920303836,
"learning_rate": 2.66304347826087e-05,
"loss": 0.3625,
"step": 586
},
{
"epoch": 1.5619174434087881,
"grad_norm": 0.2782406231477868,
"learning_rate": 2.6581027667984194e-05,
"loss": 0.3544,
"step": 587
},
{
"epoch": 1.5645805592543276,
"grad_norm": 0.27482653297611137,
"learning_rate": 2.653162055335969e-05,
"loss": 0.3505,
"step": 588
},
{
"epoch": 1.5672436750998668,
"grad_norm": 0.2737639812672786,
"learning_rate": 2.6482213438735183e-05,
"loss": 0.3339,
"step": 589
},
{
"epoch": 1.569906790945406,
"grad_norm": 0.30172379604459587,
"learning_rate": 2.643280632411067e-05,
"loss": 0.3574,
"step": 590
},
{
"epoch": 1.5725699067909455,
"grad_norm": 0.30937296239336515,
"learning_rate": 2.6383399209486165e-05,
"loss": 0.3552,
"step": 591
},
{
"epoch": 1.5752330226364846,
"grad_norm": 0.30263893603202113,
"learning_rate": 2.633399209486166e-05,
"loss": 0.3806,
"step": 592
},
{
"epoch": 1.577896138482024,
"grad_norm": 0.36351951882340405,
"learning_rate": 2.6284584980237154e-05,
"loss": 0.3483,
"step": 593
},
{
"epoch": 1.5805592543275633,
"grad_norm": 0.27596120256597706,
"learning_rate": 2.623517786561265e-05,
"loss": 0.3785,
"step": 594
},
{
"epoch": 1.5832223701731025,
"grad_norm": 0.30086295136857,
"learning_rate": 2.6185770750988143e-05,
"loss": 0.3536,
"step": 595
},
{
"epoch": 1.5858854860186418,
"grad_norm": 0.3786534775512319,
"learning_rate": 2.6136363636363637e-05,
"loss": 0.3577,
"step": 596
},
{
"epoch": 1.588548601864181,
"grad_norm": 0.294153803281236,
"learning_rate": 2.608695652173913e-05,
"loss": 0.3603,
"step": 597
},
{
"epoch": 1.5912117177097205,
"grad_norm": 0.316506621080003,
"learning_rate": 2.6037549407114626e-05,
"loss": 0.3763,
"step": 598
},
{
"epoch": 1.5938748335552595,
"grad_norm": 0.31539133712695033,
"learning_rate": 2.598814229249012e-05,
"loss": 0.3373,
"step": 599
},
{
"epoch": 1.596537949400799,
"grad_norm": 0.29787884422276756,
"learning_rate": 2.5938735177865615e-05,
"loss": 0.3461,
"step": 600
},
{
"epoch": 1.5992010652463382,
"grad_norm": 0.2794574362382508,
"learning_rate": 2.588932806324111e-05,
"loss": 0.3607,
"step": 601
},
{
"epoch": 1.6018641810918774,
"grad_norm": 0.28198668683252337,
"learning_rate": 2.5839920948616603e-05,
"loss": 0.3698,
"step": 602
},
{
"epoch": 1.604527296937417,
"grad_norm": 0.2767707782956735,
"learning_rate": 2.5790513833992098e-05,
"loss": 0.3358,
"step": 603
},
{
"epoch": 1.607190412782956,
"grad_norm": 0.26770289783678053,
"learning_rate": 2.5741106719367592e-05,
"loss": 0.3376,
"step": 604
},
{
"epoch": 1.6098535286284954,
"grad_norm": 0.3244106061056206,
"learning_rate": 2.5691699604743087e-05,
"loss": 0.3515,
"step": 605
},
{
"epoch": 1.6125166444740346,
"grad_norm": 0.29260066196150414,
"learning_rate": 2.564229249011858e-05,
"loss": 0.3712,
"step": 606
},
{
"epoch": 1.6151797603195739,
"grad_norm": 0.39595763824507085,
"learning_rate": 2.5592885375494075e-05,
"loss": 0.3402,
"step": 607
},
{
"epoch": 1.6178428761651131,
"grad_norm": 0.2911698047056363,
"learning_rate": 2.554347826086957e-05,
"loss": 0.3579,
"step": 608
},
{
"epoch": 1.6205059920106524,
"grad_norm": 0.30667505894069086,
"learning_rate": 2.5494071146245064e-05,
"loss": 0.3488,
"step": 609
},
{
"epoch": 1.6231691078561918,
"grad_norm": 0.3377626596928706,
"learning_rate": 2.5444664031620552e-05,
"loss": 0.3455,
"step": 610
},
{
"epoch": 1.6258322237017309,
"grad_norm": 0.3019507720671119,
"learning_rate": 2.5395256916996046e-05,
"loss": 0.352,
"step": 611
},
{
"epoch": 1.6284953395472703,
"grad_norm": 0.2835949922829532,
"learning_rate": 2.534584980237154e-05,
"loss": 0.3602,
"step": 612
},
{
"epoch": 1.6311584553928096,
"grad_norm": 0.32444980944074003,
"learning_rate": 2.5296442687747035e-05,
"loss": 0.3626,
"step": 613
},
{
"epoch": 1.6338215712383488,
"grad_norm": 0.30852262333031255,
"learning_rate": 2.524703557312253e-05,
"loss": 0.3415,
"step": 614
},
{
"epoch": 1.6364846870838883,
"grad_norm": 0.2769395153617194,
"learning_rate": 2.5197628458498024e-05,
"loss": 0.36,
"step": 615
},
{
"epoch": 1.6391478029294273,
"grad_norm": 0.3225695333542542,
"learning_rate": 2.5148221343873518e-05,
"loss": 0.3504,
"step": 616
},
{
"epoch": 1.6418109187749668,
"grad_norm": 0.26000908179747434,
"learning_rate": 2.5098814229249012e-05,
"loss": 0.3511,
"step": 617
},
{
"epoch": 1.644474034620506,
"grad_norm": 0.2558998742720099,
"learning_rate": 2.5049407114624507e-05,
"loss": 0.3551,
"step": 618
},
{
"epoch": 1.6471371504660453,
"grad_norm": 0.2810631366750719,
"learning_rate": 2.5e-05,
"loss": 0.359,
"step": 619
},
{
"epoch": 1.6498002663115847,
"grad_norm": 0.2764036943026752,
"learning_rate": 2.4950592885375496e-05,
"loss": 0.3552,
"step": 620
},
{
"epoch": 1.6524633821571237,
"grad_norm": 0.29157627798525887,
"learning_rate": 2.490118577075099e-05,
"loss": 0.3477,
"step": 621
},
{
"epoch": 1.6551264980026632,
"grad_norm": 0.30005399168360375,
"learning_rate": 2.4851778656126484e-05,
"loss": 0.3635,
"step": 622
},
{
"epoch": 1.6577896138482024,
"grad_norm": 0.28682265413573244,
"learning_rate": 2.480237154150198e-05,
"loss": 0.3472,
"step": 623
},
{
"epoch": 1.6604527296937417,
"grad_norm": 0.30810891527099654,
"learning_rate": 2.475296442687747e-05,
"loss": 0.3453,
"step": 624
},
{
"epoch": 1.663115845539281,
"grad_norm": 0.2894658697891752,
"learning_rate": 2.4703557312252964e-05,
"loss": 0.348,
"step": 625
},
{
"epoch": 1.6657789613848202,
"grad_norm": 0.26056026406293753,
"learning_rate": 2.465415019762846e-05,
"loss": 0.3422,
"step": 626
},
{
"epoch": 1.6684420772303596,
"grad_norm": 0.27955802745377495,
"learning_rate": 2.4604743083003953e-05,
"loss": 0.351,
"step": 627
},
{
"epoch": 1.6711051930758987,
"grad_norm": 0.2589447838000819,
"learning_rate": 2.4555335968379447e-05,
"loss": 0.3606,
"step": 628
},
{
"epoch": 1.6737683089214381,
"grad_norm": 0.2726720946381243,
"learning_rate": 2.450592885375494e-05,
"loss": 0.3553,
"step": 629
},
{
"epoch": 1.6764314247669774,
"grad_norm": 0.29585982981776077,
"learning_rate": 2.4456521739130436e-05,
"loss": 0.3429,
"step": 630
},
{
"epoch": 1.6790945406125166,
"grad_norm": 0.25866785993085295,
"learning_rate": 2.440711462450593e-05,
"loss": 0.3464,
"step": 631
},
{
"epoch": 1.681757656458056,
"grad_norm": 0.26186173743371105,
"learning_rate": 2.4357707509881425e-05,
"loss": 0.3624,
"step": 632
},
{
"epoch": 1.684420772303595,
"grad_norm": 0.27529386090536323,
"learning_rate": 2.430830039525692e-05,
"loss": 0.3464,
"step": 633
},
{
"epoch": 1.6870838881491346,
"grad_norm": 0.24305368943964414,
"learning_rate": 2.425889328063241e-05,
"loss": 0.3542,
"step": 634
},
{
"epoch": 1.6897470039946738,
"grad_norm": 0.263035963649886,
"learning_rate": 2.4209486166007905e-05,
"loss": 0.3638,
"step": 635
},
{
"epoch": 1.692410119840213,
"grad_norm": 0.2737080512587832,
"learning_rate": 2.41600790513834e-05,
"loss": 0.3368,
"step": 636
},
{
"epoch": 1.6950732356857523,
"grad_norm": 0.33404220986339256,
"learning_rate": 2.4110671936758893e-05,
"loss": 0.3724,
"step": 637
},
{
"epoch": 1.6977363515312915,
"grad_norm": 0.2897416261690682,
"learning_rate": 2.4061264822134388e-05,
"loss": 0.3593,
"step": 638
},
{
"epoch": 1.700399467376831,
"grad_norm": 0.3041816217006561,
"learning_rate": 2.4011857707509882e-05,
"loss": 0.3513,
"step": 639
},
{
"epoch": 1.70306258322237,
"grad_norm": 0.2677006117678147,
"learning_rate": 2.3962450592885376e-05,
"loss": 0.3594,
"step": 640
},
{
"epoch": 1.7057256990679095,
"grad_norm": 0.2783081801536929,
"learning_rate": 2.391304347826087e-05,
"loss": 0.3497,
"step": 641
},
{
"epoch": 1.7083888149134487,
"grad_norm": 0.2949970037820572,
"learning_rate": 2.3863636363636365e-05,
"loss": 0.3527,
"step": 642
},
{
"epoch": 1.711051930758988,
"grad_norm": 0.29435826287206446,
"learning_rate": 2.381422924901186e-05,
"loss": 0.3476,
"step": 643
},
{
"epoch": 1.7137150466045274,
"grad_norm": 0.22820704347237256,
"learning_rate": 2.376482213438735e-05,
"loss": 0.3563,
"step": 644
},
{
"epoch": 1.7163781624500665,
"grad_norm": 0.2662369562790593,
"learning_rate": 2.3715415019762845e-05,
"loss": 0.3564,
"step": 645
},
{
"epoch": 1.719041278295606,
"grad_norm": 0.2660848595820705,
"learning_rate": 2.366600790513834e-05,
"loss": 0.3507,
"step": 646
},
{
"epoch": 1.7217043941411452,
"grad_norm": 0.2736362440179924,
"learning_rate": 2.3616600790513834e-05,
"loss": 0.3583,
"step": 647
},
{
"epoch": 1.7243675099866844,
"grad_norm": 0.2877841104207108,
"learning_rate": 2.3567193675889328e-05,
"loss": 0.3543,
"step": 648
},
{
"epoch": 1.7270306258322237,
"grad_norm": 0.26935615929008033,
"learning_rate": 2.3517786561264823e-05,
"loss": 0.3437,
"step": 649
},
{
"epoch": 1.729693741677763,
"grad_norm": 0.2578776022705283,
"learning_rate": 2.3468379446640317e-05,
"loss": 0.3665,
"step": 650
},
{
"epoch": 1.7323568575233024,
"grad_norm": 0.28540169794092723,
"learning_rate": 2.341897233201581e-05,
"loss": 0.3427,
"step": 651
},
{
"epoch": 1.7350199733688414,
"grad_norm": 0.302406678764912,
"learning_rate": 2.3369565217391306e-05,
"loss": 0.3493,
"step": 652
},
{
"epoch": 1.7376830892143809,
"grad_norm": 0.2613558705976954,
"learning_rate": 2.33201581027668e-05,
"loss": 0.3384,
"step": 653
},
{
"epoch": 1.74034620505992,
"grad_norm": 0.31445958338443253,
"learning_rate": 2.327075098814229e-05,
"loss": 0.3563,
"step": 654
},
{
"epoch": 1.7430093209054593,
"grad_norm": 0.26295035895535324,
"learning_rate": 2.3221343873517785e-05,
"loss": 0.3523,
"step": 655
},
{
"epoch": 1.7456724367509988,
"grad_norm": 0.26455791446031185,
"learning_rate": 2.317193675889328e-05,
"loss": 0.347,
"step": 656
},
{
"epoch": 1.7483355525965378,
"grad_norm": 0.267920904226216,
"learning_rate": 2.3122529644268774e-05,
"loss": 0.3757,
"step": 657
},
{
"epoch": 1.7509986684420773,
"grad_norm": 0.29766057642277893,
"learning_rate": 2.307312252964427e-05,
"loss": 0.3388,
"step": 658
},
{
"epoch": 1.7536617842876165,
"grad_norm": 0.2614333124037635,
"learning_rate": 2.3023715415019763e-05,
"loss": 0.3448,
"step": 659
},
{
"epoch": 1.7563249001331558,
"grad_norm": 0.2460873862604595,
"learning_rate": 2.2974308300395257e-05,
"loss": 0.3701,
"step": 660
},
{
"epoch": 1.758988015978695,
"grad_norm": 0.32415471595000084,
"learning_rate": 2.2924901185770752e-05,
"loss": 0.3502,
"step": 661
},
{
"epoch": 1.7616511318242343,
"grad_norm": 0.28861202445680917,
"learning_rate": 2.2875494071146246e-05,
"loss": 0.3419,
"step": 662
},
{
"epoch": 1.7643142476697737,
"grad_norm": 0.33178480237112284,
"learning_rate": 2.282608695652174e-05,
"loss": 0.364,
"step": 663
},
{
"epoch": 1.7669773635153128,
"grad_norm": 0.28362428197182826,
"learning_rate": 2.2776679841897235e-05,
"loss": 0.3447,
"step": 664
},
{
"epoch": 1.7696404793608522,
"grad_norm": 0.2593493932357841,
"learning_rate": 2.272727272727273e-05,
"loss": 0.3566,
"step": 665
},
{
"epoch": 1.7723035952063915,
"grad_norm": 0.32399886004151673,
"learning_rate": 2.267786561264822e-05,
"loss": 0.352,
"step": 666
},
{
"epoch": 1.7749667110519307,
"grad_norm": 0.2898594306022826,
"learning_rate": 2.2628458498023715e-05,
"loss": 0.3552,
"step": 667
},
{
"epoch": 1.7776298268974702,
"grad_norm": 0.30141440115798507,
"learning_rate": 2.257905138339921e-05,
"loss": 0.3394,
"step": 668
},
{
"epoch": 1.7802929427430092,
"grad_norm": 0.2748566768296462,
"learning_rate": 2.2529644268774703e-05,
"loss": 0.3639,
"step": 669
},
{
"epoch": 1.7829560585885487,
"grad_norm": 0.2597063738725183,
"learning_rate": 2.2480237154150198e-05,
"loss": 0.3523,
"step": 670
},
{
"epoch": 1.785619174434088,
"grad_norm": 0.27428899527158185,
"learning_rate": 2.2430830039525692e-05,
"loss": 0.3576,
"step": 671
},
{
"epoch": 1.7882822902796272,
"grad_norm": 0.27821642567843663,
"learning_rate": 2.2381422924901187e-05,
"loss": 0.3431,
"step": 672
},
{
"epoch": 1.7909454061251664,
"grad_norm": 0.3009289717068197,
"learning_rate": 2.233201581027668e-05,
"loss": 0.3506,
"step": 673
},
{
"epoch": 1.7936085219707056,
"grad_norm": 0.27901500754869907,
"learning_rate": 2.2282608695652175e-05,
"loss": 0.3413,
"step": 674
},
{
"epoch": 1.796271637816245,
"grad_norm": 0.26359419972730574,
"learning_rate": 2.223320158102767e-05,
"loss": 0.3574,
"step": 675
},
{
"epoch": 1.7989347536617841,
"grad_norm": 0.301875250326235,
"learning_rate": 2.2183794466403164e-05,
"loss": 0.3586,
"step": 676
},
{
"epoch": 1.8015978695073236,
"grad_norm": 0.293396805853932,
"learning_rate": 2.213438735177866e-05,
"loss": 0.3631,
"step": 677
},
{
"epoch": 1.8042609853528628,
"grad_norm": 0.2627077951859255,
"learning_rate": 2.2084980237154153e-05,
"loss": 0.3421,
"step": 678
},
{
"epoch": 1.806924101198402,
"grad_norm": 0.2910041424241653,
"learning_rate": 2.2035573122529647e-05,
"loss": 0.3508,
"step": 679
},
{
"epoch": 1.8095872170439415,
"grad_norm": 0.2700422024120216,
"learning_rate": 2.198616600790514e-05,
"loss": 0.3656,
"step": 680
},
{
"epoch": 1.8122503328894806,
"grad_norm": 0.261122870241434,
"learning_rate": 2.1936758893280636e-05,
"loss": 0.3727,
"step": 681
},
{
"epoch": 1.81491344873502,
"grad_norm": 0.2759182990026985,
"learning_rate": 2.188735177865613e-05,
"loss": 0.3429,
"step": 682
},
{
"epoch": 1.8175765645805593,
"grad_norm": 0.25688731642570295,
"learning_rate": 2.183794466403162e-05,
"loss": 0.3638,
"step": 683
},
{
"epoch": 1.8202396804260985,
"grad_norm": 0.2583299882188377,
"learning_rate": 2.1788537549407116e-05,
"loss": 0.3627,
"step": 684
},
{
"epoch": 1.822902796271638,
"grad_norm": 0.24824630818405677,
"learning_rate": 2.173913043478261e-05,
"loss": 0.3509,
"step": 685
},
{
"epoch": 1.825565912117177,
"grad_norm": 0.2775222142294749,
"learning_rate": 2.1689723320158105e-05,
"loss": 0.3421,
"step": 686
},
{
"epoch": 1.8282290279627165,
"grad_norm": 0.23869310034905467,
"learning_rate": 2.16403162055336e-05,
"loss": 0.3376,
"step": 687
},
{
"epoch": 1.8308921438082557,
"grad_norm": 0.2933357911415976,
"learning_rate": 2.1590909090909093e-05,
"loss": 0.3521,
"step": 688
},
{
"epoch": 1.833555259653795,
"grad_norm": 0.27832210393035933,
"learning_rate": 2.1541501976284588e-05,
"loss": 0.3553,
"step": 689
},
{
"epoch": 1.8362183754993342,
"grad_norm": 0.3087436970907245,
"learning_rate": 2.1492094861660082e-05,
"loss": 0.347,
"step": 690
},
{
"epoch": 1.8388814913448734,
"grad_norm": 0.2943513499295711,
"learning_rate": 2.1442687747035576e-05,
"loss": 0.3536,
"step": 691
},
{
"epoch": 1.841544607190413,
"grad_norm": 0.26722654225950093,
"learning_rate": 2.1393280632411067e-05,
"loss": 0.3624,
"step": 692
},
{
"epoch": 1.844207723035952,
"grad_norm": 0.2686739391641238,
"learning_rate": 2.1343873517786562e-05,
"loss": 0.3551,
"step": 693
},
{
"epoch": 1.8468708388814914,
"grad_norm": 0.3317404535951985,
"learning_rate": 2.1294466403162056e-05,
"loss": 0.3519,
"step": 694
},
{
"epoch": 1.8495339547270306,
"grad_norm": 0.25888461414583197,
"learning_rate": 2.124505928853755e-05,
"loss": 0.3621,
"step": 695
},
{
"epoch": 1.8521970705725699,
"grad_norm": 0.2388947383775022,
"learning_rate": 2.1195652173913045e-05,
"loss": 0.3464,
"step": 696
},
{
"epoch": 1.8548601864181093,
"grad_norm": 0.32253652339123096,
"learning_rate": 2.114624505928854e-05,
"loss": 0.3486,
"step": 697
},
{
"epoch": 1.8575233022636484,
"grad_norm": 0.23971764237483872,
"learning_rate": 2.1096837944664034e-05,
"loss": 0.3469,
"step": 698
},
{
"epoch": 1.8601864181091878,
"grad_norm": 0.2822968430519757,
"learning_rate": 2.1047430830039528e-05,
"loss": 0.3464,
"step": 699
},
{
"epoch": 1.862849533954727,
"grad_norm": 0.28707092445711563,
"learning_rate": 2.0998023715415023e-05,
"loss": 0.3454,
"step": 700
},
{
"epoch": 1.8655126498002663,
"grad_norm": 0.26633357589223594,
"learning_rate": 2.0948616600790517e-05,
"loss": 0.3528,
"step": 701
},
{
"epoch": 1.8681757656458056,
"grad_norm": 0.30480677025070735,
"learning_rate": 2.0899209486166008e-05,
"loss": 0.3705,
"step": 702
},
{
"epoch": 1.8708388814913448,
"grad_norm": 0.2589295473498244,
"learning_rate": 2.0849802371541502e-05,
"loss": 0.366,
"step": 703
},
{
"epoch": 1.8735019973368843,
"grad_norm": 0.3615686651832072,
"learning_rate": 2.0800395256916997e-05,
"loss": 0.3545,
"step": 704
},
{
"epoch": 1.8761651131824233,
"grad_norm": 0.2643316410023579,
"learning_rate": 2.075098814229249e-05,
"loss": 0.3478,
"step": 705
},
{
"epoch": 1.8788282290279628,
"grad_norm": 0.3002604064308654,
"learning_rate": 2.0701581027667985e-05,
"loss": 0.3691,
"step": 706
},
{
"epoch": 1.881491344873502,
"grad_norm": 0.2842611156357375,
"learning_rate": 2.065217391304348e-05,
"loss": 0.361,
"step": 707
},
{
"epoch": 1.8841544607190412,
"grad_norm": 0.3130168183378823,
"learning_rate": 2.0602766798418974e-05,
"loss": 0.3536,
"step": 708
},
{
"epoch": 1.8868175765645807,
"grad_norm": 0.3519161067004107,
"learning_rate": 2.055335968379447e-05,
"loss": 0.3557,
"step": 709
},
{
"epoch": 1.8894806924101197,
"grad_norm": 0.27233651062760655,
"learning_rate": 2.0503952569169963e-05,
"loss": 0.3594,
"step": 710
},
{
"epoch": 1.8921438082556592,
"grad_norm": 0.31833253788492577,
"learning_rate": 2.0454545454545457e-05,
"loss": 0.3489,
"step": 711
},
{
"epoch": 1.8948069241011984,
"grad_norm": 0.24567699858003664,
"learning_rate": 2.040513833992095e-05,
"loss": 0.3367,
"step": 712
},
{
"epoch": 1.8974700399467377,
"grad_norm": 0.2969050880879015,
"learning_rate": 2.0355731225296443e-05,
"loss": 0.3537,
"step": 713
},
{
"epoch": 1.900133155792277,
"grad_norm": 0.3189993081371087,
"learning_rate": 2.0306324110671937e-05,
"loss": 0.3669,
"step": 714
},
{
"epoch": 1.9027962716378162,
"grad_norm": 0.24524923802003742,
"learning_rate": 2.025691699604743e-05,
"loss": 0.3448,
"step": 715
},
{
"epoch": 1.9054593874833556,
"grad_norm": 0.3002012848114626,
"learning_rate": 2.0207509881422926e-05,
"loss": 0.3592,
"step": 716
},
{
"epoch": 1.9081225033288947,
"grad_norm": 0.2577221774068482,
"learning_rate": 2.015810276679842e-05,
"loss": 0.3615,
"step": 717
},
{
"epoch": 1.9107856191744341,
"grad_norm": 0.2662922499052391,
"learning_rate": 2.0108695652173915e-05,
"loss": 0.3564,
"step": 718
},
{
"epoch": 1.9134487350199734,
"grad_norm": 0.2748543453818437,
"learning_rate": 2.005928853754941e-05,
"loss": 0.3367,
"step": 719
},
{
"epoch": 1.9161118508655126,
"grad_norm": 0.29453902437825724,
"learning_rate": 2.0009881422924903e-05,
"loss": 0.3346,
"step": 720
},
{
"epoch": 1.918774966711052,
"grad_norm": 0.2958384946201868,
"learning_rate": 1.9960474308300398e-05,
"loss": 0.3653,
"step": 721
},
{
"epoch": 1.921438082556591,
"grad_norm": 0.3110870857995837,
"learning_rate": 1.991106719367589e-05,
"loss": 0.3626,
"step": 722
},
{
"epoch": 1.9241011984021306,
"grad_norm": 0.29754006004298117,
"learning_rate": 1.9861660079051383e-05,
"loss": 0.3595,
"step": 723
},
{
"epoch": 1.9267643142476698,
"grad_norm": 0.2637206512469971,
"learning_rate": 1.9812252964426878e-05,
"loss": 0.3637,
"step": 724
},
{
"epoch": 1.929427430093209,
"grad_norm": 0.28572071909963137,
"learning_rate": 1.9762845849802372e-05,
"loss": 0.351,
"step": 725
},
{
"epoch": 1.9320905459387483,
"grad_norm": 0.26449910347561634,
"learning_rate": 1.9713438735177866e-05,
"loss": 0.3607,
"step": 726
},
{
"epoch": 1.9347536617842875,
"grad_norm": 0.312752897256756,
"learning_rate": 1.966403162055336e-05,
"loss": 0.3591,
"step": 727
},
{
"epoch": 1.937416777629827,
"grad_norm": 0.2592410502272739,
"learning_rate": 1.9614624505928855e-05,
"loss": 0.3439,
"step": 728
},
{
"epoch": 1.940079893475366,
"grad_norm": 0.24250837194662156,
"learning_rate": 1.956521739130435e-05,
"loss": 0.3322,
"step": 729
},
{
"epoch": 1.9427430093209055,
"grad_norm": 0.27100632690728255,
"learning_rate": 1.9515810276679844e-05,
"loss": 0.3478,
"step": 730
},
{
"epoch": 1.9454061251664447,
"grad_norm": 0.2792664428193274,
"learning_rate": 1.9466403162055335e-05,
"loss": 0.3667,
"step": 731
},
{
"epoch": 1.948069241011984,
"grad_norm": 0.2619688688672022,
"learning_rate": 1.941699604743083e-05,
"loss": 0.3533,
"step": 732
},
{
"epoch": 1.9507323568575234,
"grad_norm": 0.250474396728028,
"learning_rate": 1.9367588932806324e-05,
"loss": 0.3615,
"step": 733
},
{
"epoch": 1.9533954727030625,
"grad_norm": 0.2592917559527508,
"learning_rate": 1.9318181818181818e-05,
"loss": 0.35,
"step": 734
},
{
"epoch": 1.956058588548602,
"grad_norm": 0.28358412495828245,
"learning_rate": 1.9268774703557312e-05,
"loss": 0.3438,
"step": 735
},
{
"epoch": 1.9587217043941412,
"grad_norm": 0.2905168266596484,
"learning_rate": 1.9219367588932807e-05,
"loss": 0.3363,
"step": 736
},
{
"epoch": 1.9613848202396804,
"grad_norm": 0.2558334592646534,
"learning_rate": 1.91699604743083e-05,
"loss": 0.3636,
"step": 737
},
{
"epoch": 1.9640479360852197,
"grad_norm": 0.2856486905717076,
"learning_rate": 1.9120553359683796e-05,
"loss": 0.3423,
"step": 738
},
{
"epoch": 1.966711051930759,
"grad_norm": 0.25338680291782845,
"learning_rate": 1.907114624505929e-05,
"loss": 0.3647,
"step": 739
},
{
"epoch": 1.9693741677762984,
"grad_norm": 0.25927241893410596,
"learning_rate": 1.9021739130434784e-05,
"loss": 0.361,
"step": 740
},
{
"epoch": 1.9720372836218374,
"grad_norm": 0.26559107296256046,
"learning_rate": 1.8972332015810275e-05,
"loss": 0.3532,
"step": 741
},
{
"epoch": 1.9747003994673769,
"grad_norm": 0.23909262831928838,
"learning_rate": 1.892292490118577e-05,
"loss": 0.3458,
"step": 742
},
{
"epoch": 1.977363515312916,
"grad_norm": 0.29570607043062813,
"learning_rate": 1.8873517786561264e-05,
"loss": 0.3651,
"step": 743
},
{
"epoch": 1.9800266311584553,
"grad_norm": 0.26837566907079335,
"learning_rate": 1.882411067193676e-05,
"loss": 0.3624,
"step": 744
},
{
"epoch": 1.9826897470039948,
"grad_norm": 0.24855234703810405,
"learning_rate": 1.8774703557312253e-05,
"loss": 0.3458,
"step": 745
},
{
"epoch": 1.9853528628495338,
"grad_norm": 0.2581276414313357,
"learning_rate": 1.8725296442687747e-05,
"loss": 0.3532,
"step": 746
},
{
"epoch": 1.9880159786950733,
"grad_norm": 0.2769192507293847,
"learning_rate": 1.867588932806324e-05,
"loss": 0.3662,
"step": 747
},
{
"epoch": 1.9906790945406125,
"grad_norm": 0.24782306003081656,
"learning_rate": 1.8626482213438736e-05,
"loss": 0.3444,
"step": 748
},
{
"epoch": 1.9933422103861518,
"grad_norm": 0.23338769959338118,
"learning_rate": 1.857707509881423e-05,
"loss": 0.3375,
"step": 749
},
{
"epoch": 1.996005326231691,
"grad_norm": 0.2399452380668713,
"learning_rate": 1.8527667984189725e-05,
"loss": 0.3577,
"step": 750
},
{
"epoch": 1.9986684420772303,
"grad_norm": 0.24061002934920092,
"learning_rate": 1.8478260869565216e-05,
"loss": 0.3558,
"step": 751
},
{
"epoch": 2.0,
"grad_norm": 0.3876397436943037,
"learning_rate": 1.842885375494071e-05,
"loss": 0.3232,
"step": 752
},
{
"epoch": 2.0026631158455395,
"grad_norm": 0.3255318592205839,
"learning_rate": 1.8379446640316205e-05,
"loss": 0.2829,
"step": 753
},
{
"epoch": 2.0053262316910785,
"grad_norm": 0.2688339427044817,
"learning_rate": 1.83300395256917e-05,
"loss": 0.2808,
"step": 754
},
{
"epoch": 2.007989347536618,
"grad_norm": 0.31006819974729777,
"learning_rate": 1.8280632411067193e-05,
"loss": 0.2619,
"step": 755
},
{
"epoch": 2.010652463382157,
"grad_norm": 0.3391232912122683,
"learning_rate": 1.8231225296442688e-05,
"loss": 0.2797,
"step": 756
},
{
"epoch": 2.0133155792276964,
"grad_norm": 0.22961985808221483,
"learning_rate": 1.8181818181818182e-05,
"loss": 0.2716,
"step": 757
},
{
"epoch": 2.015978695073236,
"grad_norm": 0.3029488541333639,
"learning_rate": 1.8132411067193676e-05,
"loss": 0.2748,
"step": 758
},
{
"epoch": 2.018641810918775,
"grad_norm": 0.3272089229771229,
"learning_rate": 1.808300395256917e-05,
"loss": 0.259,
"step": 759
},
{
"epoch": 2.0213049267643144,
"grad_norm": 0.2632568547847837,
"learning_rate": 1.8033596837944665e-05,
"loss": 0.2765,
"step": 760
},
{
"epoch": 2.0239680426098534,
"grad_norm": 0.278440470950714,
"learning_rate": 1.7984189723320156e-05,
"loss": 0.2638,
"step": 761
},
{
"epoch": 2.026631158455393,
"grad_norm": 0.3139907981507755,
"learning_rate": 1.793478260869565e-05,
"loss": 0.2805,
"step": 762
},
{
"epoch": 2.029294274300932,
"grad_norm": 0.26955412514066035,
"learning_rate": 1.7885375494071145e-05,
"loss": 0.2617,
"step": 763
},
{
"epoch": 2.0319573901464714,
"grad_norm": 0.2583856619944918,
"learning_rate": 1.783596837944664e-05,
"loss": 0.2678,
"step": 764
},
{
"epoch": 2.034620505992011,
"grad_norm": 0.27298004272506543,
"learning_rate": 1.7786561264822134e-05,
"loss": 0.2674,
"step": 765
},
{
"epoch": 2.03728362183755,
"grad_norm": 0.272776301937256,
"learning_rate": 1.7737154150197628e-05,
"loss": 0.2783,
"step": 766
},
{
"epoch": 2.0399467376830893,
"grad_norm": 0.23604664211204196,
"learning_rate": 1.7687747035573123e-05,
"loss": 0.2694,
"step": 767
},
{
"epoch": 2.0426098535286283,
"grad_norm": 0.2705685089413051,
"learning_rate": 1.7638339920948617e-05,
"loss": 0.2835,
"step": 768
},
{
"epoch": 2.045272969374168,
"grad_norm": 0.2348856411632335,
"learning_rate": 1.758893280632411e-05,
"loss": 0.2591,
"step": 769
},
{
"epoch": 2.0479360852197073,
"grad_norm": 0.24862768901035942,
"learning_rate": 1.7539525691699606e-05,
"loss": 0.2641,
"step": 770
},
{
"epoch": 2.0505992010652463,
"grad_norm": 0.25511185080416404,
"learning_rate": 1.74901185770751e-05,
"loss": 0.2709,
"step": 771
},
{
"epoch": 2.0532623169107858,
"grad_norm": 0.24302033763825434,
"learning_rate": 1.7440711462450594e-05,
"loss": 0.2759,
"step": 772
},
{
"epoch": 2.0559254327563248,
"grad_norm": 0.20872328589643,
"learning_rate": 1.739130434782609e-05,
"loss": 0.2632,
"step": 773
},
{
"epoch": 2.0585885486018642,
"grad_norm": 0.26636593407387676,
"learning_rate": 1.7341897233201583e-05,
"loss": 0.2636,
"step": 774
},
{
"epoch": 2.0612516644474033,
"grad_norm": 0.28091568129361494,
"learning_rate": 1.7292490118577078e-05,
"loss": 0.2628,
"step": 775
},
{
"epoch": 2.0639147802929427,
"grad_norm": 0.2560746499348802,
"learning_rate": 1.7243083003952572e-05,
"loss": 0.2655,
"step": 776
},
{
"epoch": 2.066577896138482,
"grad_norm": 0.26276899174108526,
"learning_rate": 1.7193675889328066e-05,
"loss": 0.2728,
"step": 777
},
{
"epoch": 2.069241011984021,
"grad_norm": 0.26384946938199305,
"learning_rate": 1.714426877470356e-05,
"loss": 0.2747,
"step": 778
},
{
"epoch": 2.0719041278295607,
"grad_norm": 0.23715984391863434,
"learning_rate": 1.7094861660079055e-05,
"loss": 0.2694,
"step": 779
},
{
"epoch": 2.0745672436750997,
"grad_norm": 0.2404103191932088,
"learning_rate": 1.7045454545454546e-05,
"loss": 0.2844,
"step": 780
},
{
"epoch": 2.077230359520639,
"grad_norm": 0.2295546055568796,
"learning_rate": 1.699604743083004e-05,
"loss": 0.2563,
"step": 781
},
{
"epoch": 2.0798934753661786,
"grad_norm": 0.25081138258701596,
"learning_rate": 1.6946640316205535e-05,
"loss": 0.2657,
"step": 782
},
{
"epoch": 2.0825565912117177,
"grad_norm": 0.23299102413940379,
"learning_rate": 1.689723320158103e-05,
"loss": 0.2841,
"step": 783
},
{
"epoch": 2.085219707057257,
"grad_norm": 0.2352302932330538,
"learning_rate": 1.6847826086956524e-05,
"loss": 0.2696,
"step": 784
},
{
"epoch": 2.087882822902796,
"grad_norm": 0.2396805580902733,
"learning_rate": 1.6798418972332018e-05,
"loss": 0.2687,
"step": 785
},
{
"epoch": 2.0905459387483356,
"grad_norm": 0.22897484277870242,
"learning_rate": 1.6749011857707512e-05,
"loss": 0.2678,
"step": 786
},
{
"epoch": 2.0932090545938746,
"grad_norm": 0.224891214268194,
"learning_rate": 1.6699604743083007e-05,
"loss": 0.2729,
"step": 787
},
{
"epoch": 2.095872170439414,
"grad_norm": 0.26860270920114504,
"learning_rate": 1.66501976284585e-05,
"loss": 0.2581,
"step": 788
},
{
"epoch": 2.0985352862849536,
"grad_norm": 0.24961552358211944,
"learning_rate": 1.6600790513833996e-05,
"loss": 0.2624,
"step": 789
},
{
"epoch": 2.1011984021304926,
"grad_norm": 0.22308364748740767,
"learning_rate": 1.6551383399209487e-05,
"loss": 0.2647,
"step": 790
},
{
"epoch": 2.103861517976032,
"grad_norm": 0.2380839364570976,
"learning_rate": 1.650197628458498e-05,
"loss": 0.271,
"step": 791
},
{
"epoch": 2.106524633821571,
"grad_norm": 0.24381955578610937,
"learning_rate": 1.6452569169960475e-05,
"loss": 0.2694,
"step": 792
},
{
"epoch": 2.1091877496671105,
"grad_norm": 0.23758646142710013,
"learning_rate": 1.640316205533597e-05,
"loss": 0.2775,
"step": 793
},
{
"epoch": 2.11185086551265,
"grad_norm": 0.23538198400085814,
"learning_rate": 1.6353754940711464e-05,
"loss": 0.2814,
"step": 794
},
{
"epoch": 2.114513981358189,
"grad_norm": 0.21674748879871775,
"learning_rate": 1.630434782608696e-05,
"loss": 0.2548,
"step": 795
},
{
"epoch": 2.1171770972037285,
"grad_norm": 0.24105445224605443,
"learning_rate": 1.6254940711462453e-05,
"loss": 0.2641,
"step": 796
},
{
"epoch": 2.1198402130492675,
"grad_norm": 0.23753067329213304,
"learning_rate": 1.6205533596837947e-05,
"loss": 0.2709,
"step": 797
},
{
"epoch": 2.122503328894807,
"grad_norm": 0.23404194217010732,
"learning_rate": 1.615612648221344e-05,
"loss": 0.271,
"step": 798
},
{
"epoch": 2.125166444740346,
"grad_norm": 0.2121069651623829,
"learning_rate": 1.6106719367588936e-05,
"loss": 0.2627,
"step": 799
},
{
"epoch": 2.1278295605858855,
"grad_norm": 0.22624703639894228,
"learning_rate": 1.6057312252964427e-05,
"loss": 0.2538,
"step": 800
},
{
"epoch": 2.130492676431425,
"grad_norm": 0.2386292992012449,
"learning_rate": 1.600790513833992e-05,
"loss": 0.2576,
"step": 801
},
{
"epoch": 2.133155792276964,
"grad_norm": 0.22877737188756703,
"learning_rate": 1.5958498023715416e-05,
"loss": 0.2727,
"step": 802
},
{
"epoch": 2.1358189081225034,
"grad_norm": 0.27117813021650006,
"learning_rate": 1.590909090909091e-05,
"loss": 0.2895,
"step": 803
},
{
"epoch": 2.1384820239680424,
"grad_norm": 0.22867337217751538,
"learning_rate": 1.5859683794466405e-05,
"loss": 0.2734,
"step": 804
},
{
"epoch": 2.141145139813582,
"grad_norm": 0.24512337588151054,
"learning_rate": 1.58102766798419e-05,
"loss": 0.273,
"step": 805
},
{
"epoch": 2.1438082556591214,
"grad_norm": 0.2727608695581687,
"learning_rate": 1.5760869565217393e-05,
"loss": 0.2901,
"step": 806
},
{
"epoch": 2.1464713715046604,
"grad_norm": 0.2387866974014394,
"learning_rate": 1.5711462450592888e-05,
"loss": 0.2643,
"step": 807
},
{
"epoch": 2.1491344873502,
"grad_norm": 0.22440460077720992,
"learning_rate": 1.5662055335968382e-05,
"loss": 0.2653,
"step": 808
},
{
"epoch": 2.151797603195739,
"grad_norm": 0.248288295680679,
"learning_rate": 1.5612648221343873e-05,
"loss": 0.2549,
"step": 809
},
{
"epoch": 2.1544607190412783,
"grad_norm": 0.24110717758110342,
"learning_rate": 1.5563241106719367e-05,
"loss": 0.2748,
"step": 810
},
{
"epoch": 2.157123834886818,
"grad_norm": 0.23171730936199766,
"learning_rate": 1.5513833992094862e-05,
"loss": 0.2709,
"step": 811
},
{
"epoch": 2.159786950732357,
"grad_norm": 0.22345452374040276,
"learning_rate": 1.5464426877470356e-05,
"loss": 0.2688,
"step": 812
},
{
"epoch": 2.1624500665778963,
"grad_norm": 0.26551342546130663,
"learning_rate": 1.541501976284585e-05,
"loss": 0.2709,
"step": 813
},
{
"epoch": 2.1651131824234353,
"grad_norm": 0.2375754285218798,
"learning_rate": 1.5365612648221345e-05,
"loss": 0.259,
"step": 814
},
{
"epoch": 2.1677762982689748,
"grad_norm": 0.2115542246448785,
"learning_rate": 1.531620553359684e-05,
"loss": 0.2684,
"step": 815
},
{
"epoch": 2.170439414114514,
"grad_norm": 0.2447171773393202,
"learning_rate": 1.5266798418972334e-05,
"loss": 0.2762,
"step": 816
},
{
"epoch": 2.1731025299600533,
"grad_norm": 0.22704904523049146,
"learning_rate": 1.5217391304347828e-05,
"loss": 0.2587,
"step": 817
},
{
"epoch": 2.1757656458055927,
"grad_norm": 0.2103985476952429,
"learning_rate": 1.5167984189723323e-05,
"loss": 0.2706,
"step": 818
},
{
"epoch": 2.1784287616511318,
"grad_norm": 0.25159263014889965,
"learning_rate": 1.5118577075098814e-05,
"loss": 0.2584,
"step": 819
},
{
"epoch": 2.181091877496671,
"grad_norm": 0.24458443995501622,
"learning_rate": 1.5069169960474308e-05,
"loss": 0.2704,
"step": 820
},
{
"epoch": 2.1837549933422102,
"grad_norm": 0.22057301940141671,
"learning_rate": 1.5019762845849802e-05,
"loss": 0.2719,
"step": 821
},
{
"epoch": 2.1864181091877497,
"grad_norm": 0.267519780973077,
"learning_rate": 1.4970355731225297e-05,
"loss": 0.2716,
"step": 822
},
{
"epoch": 2.1890812250332887,
"grad_norm": 0.22154250046870252,
"learning_rate": 1.4920948616600791e-05,
"loss": 0.2591,
"step": 823
},
{
"epoch": 2.191744340878828,
"grad_norm": 0.21165234414085649,
"learning_rate": 1.4871541501976285e-05,
"loss": 0.2655,
"step": 824
},
{
"epoch": 2.1944074567243677,
"grad_norm": 0.24374815251314244,
"learning_rate": 1.482213438735178e-05,
"loss": 0.2655,
"step": 825
},
{
"epoch": 2.1970705725699067,
"grad_norm": 0.2455699195489871,
"learning_rate": 1.4772727272727274e-05,
"loss": 0.2665,
"step": 826
},
{
"epoch": 2.199733688415446,
"grad_norm": 0.22958103222280501,
"learning_rate": 1.4723320158102769e-05,
"loss": 0.266,
"step": 827
},
{
"epoch": 2.202396804260985,
"grad_norm": 0.22203196516766327,
"learning_rate": 1.4673913043478263e-05,
"loss": 0.2646,
"step": 828
},
{
"epoch": 2.2050599201065246,
"grad_norm": 0.24608492700980994,
"learning_rate": 1.4624505928853754e-05,
"loss": 0.2794,
"step": 829
},
{
"epoch": 2.207723035952064,
"grad_norm": 0.21991565592070453,
"learning_rate": 1.4575098814229248e-05,
"loss": 0.2721,
"step": 830
},
{
"epoch": 2.210386151797603,
"grad_norm": 0.21684224263000038,
"learning_rate": 1.4525691699604743e-05,
"loss": 0.2584,
"step": 831
},
{
"epoch": 2.2130492676431426,
"grad_norm": 0.25977569519470245,
"learning_rate": 1.4476284584980237e-05,
"loss": 0.2726,
"step": 832
},
{
"epoch": 2.2157123834886816,
"grad_norm": 0.2386084151402447,
"learning_rate": 1.4426877470355732e-05,
"loss": 0.2852,
"step": 833
},
{
"epoch": 2.218375499334221,
"grad_norm": 0.21986693449971093,
"learning_rate": 1.4377470355731226e-05,
"loss": 0.2626,
"step": 834
},
{
"epoch": 2.2210386151797605,
"grad_norm": 0.21749065277576188,
"learning_rate": 1.432806324110672e-05,
"loss": 0.2602,
"step": 835
},
{
"epoch": 2.2237017310252996,
"grad_norm": 0.23989512729814974,
"learning_rate": 1.4278656126482215e-05,
"loss": 0.2692,
"step": 836
},
{
"epoch": 2.226364846870839,
"grad_norm": 0.23832582321216103,
"learning_rate": 1.4229249011857709e-05,
"loss": 0.2635,
"step": 837
},
{
"epoch": 2.229027962716378,
"grad_norm": 0.2426811597238821,
"learning_rate": 1.4179841897233202e-05,
"loss": 0.2668,
"step": 838
},
{
"epoch": 2.2316910785619175,
"grad_norm": 0.22741820303496693,
"learning_rate": 1.4130434782608694e-05,
"loss": 0.2687,
"step": 839
},
{
"epoch": 2.2343541944074565,
"grad_norm": 0.2193731262262756,
"learning_rate": 1.4081027667984189e-05,
"loss": 0.2707,
"step": 840
},
{
"epoch": 2.237017310252996,
"grad_norm": 0.22566921822696567,
"learning_rate": 1.4031620553359683e-05,
"loss": 0.2676,
"step": 841
},
{
"epoch": 2.2396804260985355,
"grad_norm": 0.22383415671065598,
"learning_rate": 1.3982213438735178e-05,
"loss": 0.2652,
"step": 842
},
{
"epoch": 2.2423435419440745,
"grad_norm": 0.20320657711674117,
"learning_rate": 1.3932806324110672e-05,
"loss": 0.2595,
"step": 843
},
{
"epoch": 2.245006657789614,
"grad_norm": 0.2333067790520279,
"learning_rate": 1.3883399209486166e-05,
"loss": 0.2584,
"step": 844
},
{
"epoch": 2.247669773635153,
"grad_norm": 0.2198492093260434,
"learning_rate": 1.383399209486166e-05,
"loss": 0.2787,
"step": 845
},
{
"epoch": 2.2503328894806924,
"grad_norm": 0.20578959481390344,
"learning_rate": 1.3784584980237155e-05,
"loss": 0.2717,
"step": 846
},
{
"epoch": 2.2529960053262315,
"grad_norm": 0.23821537591362393,
"learning_rate": 1.373517786561265e-05,
"loss": 0.2699,
"step": 847
},
{
"epoch": 2.255659121171771,
"grad_norm": 0.22087113735109618,
"learning_rate": 1.3685770750988142e-05,
"loss": 0.2643,
"step": 848
},
{
"epoch": 2.2583222370173104,
"grad_norm": 0.21122229854050678,
"learning_rate": 1.3636363636363637e-05,
"loss": 0.2724,
"step": 849
},
{
"epoch": 2.2609853528628494,
"grad_norm": 0.21706856754708864,
"learning_rate": 1.3586956521739131e-05,
"loss": 0.2726,
"step": 850
},
{
"epoch": 2.263648468708389,
"grad_norm": 0.21623723691120003,
"learning_rate": 1.3537549407114625e-05,
"loss": 0.2551,
"step": 851
},
{
"epoch": 2.266311584553928,
"grad_norm": 0.2271100658389757,
"learning_rate": 1.348814229249012e-05,
"loss": 0.2586,
"step": 852
},
{
"epoch": 2.2689747003994674,
"grad_norm": 0.2209764109681619,
"learning_rate": 1.3438735177865614e-05,
"loss": 0.2716,
"step": 853
},
{
"epoch": 2.271637816245007,
"grad_norm": 0.2178701412614265,
"learning_rate": 1.3389328063241108e-05,
"loss": 0.2891,
"step": 854
},
{
"epoch": 2.274300932090546,
"grad_norm": 0.2661642988662999,
"learning_rate": 1.3339920948616603e-05,
"loss": 0.2564,
"step": 855
},
{
"epoch": 2.2769640479360853,
"grad_norm": 0.21388446109096484,
"learning_rate": 1.3290513833992097e-05,
"loss": 0.2529,
"step": 856
},
{
"epoch": 2.2796271637816243,
"grad_norm": 0.2216576992935052,
"learning_rate": 1.3241106719367592e-05,
"loss": 0.2636,
"step": 857
},
{
"epoch": 2.282290279627164,
"grad_norm": 0.23210662511306396,
"learning_rate": 1.3191699604743083e-05,
"loss": 0.2589,
"step": 858
},
{
"epoch": 2.2849533954727033,
"grad_norm": 0.2392108261983096,
"learning_rate": 1.3142292490118577e-05,
"loss": 0.265,
"step": 859
},
{
"epoch": 2.2876165113182423,
"grad_norm": 0.21786440972478727,
"learning_rate": 1.3092885375494071e-05,
"loss": 0.2793,
"step": 860
},
{
"epoch": 2.2902796271637818,
"grad_norm": 0.260403587668551,
"learning_rate": 1.3043478260869566e-05,
"loss": 0.2777,
"step": 861
},
{
"epoch": 2.2929427430093208,
"grad_norm": 0.2430960989806936,
"learning_rate": 1.299407114624506e-05,
"loss": 0.2572,
"step": 862
},
{
"epoch": 2.2956058588548602,
"grad_norm": 0.21752051573777517,
"learning_rate": 1.2944664031620555e-05,
"loss": 0.2803,
"step": 863
},
{
"epoch": 2.2982689747003997,
"grad_norm": 0.2573344766515025,
"learning_rate": 1.2895256916996049e-05,
"loss": 0.2803,
"step": 864
},
{
"epoch": 2.3009320905459387,
"grad_norm": 0.24369267722963625,
"learning_rate": 1.2845849802371543e-05,
"loss": 0.2559,
"step": 865
},
{
"epoch": 2.303595206391478,
"grad_norm": 0.2676475243278646,
"learning_rate": 1.2796442687747038e-05,
"loss": 0.2634,
"step": 866
},
{
"epoch": 2.306258322237017,
"grad_norm": 0.21674298638149098,
"learning_rate": 1.2747035573122532e-05,
"loss": 0.2673,
"step": 867
},
{
"epoch": 2.3089214380825567,
"grad_norm": 0.23541545396380092,
"learning_rate": 1.2697628458498023e-05,
"loss": 0.2673,
"step": 868
},
{
"epoch": 2.3115845539280957,
"grad_norm": 0.22699711620607352,
"learning_rate": 1.2648221343873517e-05,
"loss": 0.2674,
"step": 869
},
{
"epoch": 2.314247669773635,
"grad_norm": 0.22613468537499234,
"learning_rate": 1.2598814229249012e-05,
"loss": 0.2655,
"step": 870
},
{
"epoch": 2.316910785619174,
"grad_norm": 0.2253665054481723,
"learning_rate": 1.2549407114624506e-05,
"loss": 0.2542,
"step": 871
},
{
"epoch": 2.3195739014647137,
"grad_norm": 0.2389905563347208,
"learning_rate": 1.25e-05,
"loss": 0.2642,
"step": 872
},
{
"epoch": 2.322237017310253,
"grad_norm": 0.1972800090188119,
"learning_rate": 1.2450592885375495e-05,
"loss": 0.2738,
"step": 873
},
{
"epoch": 2.324900133155792,
"grad_norm": 0.22018172948520282,
"learning_rate": 1.240118577075099e-05,
"loss": 0.2736,
"step": 874
},
{
"epoch": 2.3275632490013316,
"grad_norm": 0.22660897800754057,
"learning_rate": 1.2351778656126482e-05,
"loss": 0.2797,
"step": 875
},
{
"epoch": 2.3302263648468706,
"grad_norm": 0.22691181432819396,
"learning_rate": 1.2302371541501976e-05,
"loss": 0.2562,
"step": 876
},
{
"epoch": 2.33288948069241,
"grad_norm": 0.21367535241766863,
"learning_rate": 1.225296442687747e-05,
"loss": 0.2687,
"step": 877
},
{
"epoch": 2.3355525965379496,
"grad_norm": 0.23289737129114052,
"learning_rate": 1.2203557312252965e-05,
"loss": 0.2595,
"step": 878
},
{
"epoch": 2.3382157123834886,
"grad_norm": 0.21941025876118542,
"learning_rate": 1.215415019762846e-05,
"loss": 0.2785,
"step": 879
},
{
"epoch": 2.340878828229028,
"grad_norm": 0.23113074495001715,
"learning_rate": 1.2104743083003952e-05,
"loss": 0.283,
"step": 880
},
{
"epoch": 2.343541944074567,
"grad_norm": 0.21978182787011594,
"learning_rate": 1.2055335968379447e-05,
"loss": 0.2602,
"step": 881
},
{
"epoch": 2.3462050599201065,
"grad_norm": 0.22558732477437654,
"learning_rate": 1.2005928853754941e-05,
"loss": 0.2744,
"step": 882
},
{
"epoch": 2.348868175765646,
"grad_norm": 0.21761347406156886,
"learning_rate": 1.1956521739130435e-05,
"loss": 0.2702,
"step": 883
},
{
"epoch": 2.351531291611185,
"grad_norm": 0.5461188257601155,
"learning_rate": 1.190711462450593e-05,
"loss": 0.2894,
"step": 884
},
{
"epoch": 2.3541944074567245,
"grad_norm": 0.21406318400975563,
"learning_rate": 1.1857707509881423e-05,
"loss": 0.2661,
"step": 885
},
{
"epoch": 2.3568575233022635,
"grad_norm": 0.1984149911802996,
"learning_rate": 1.1808300395256917e-05,
"loss": 0.266,
"step": 886
},
{
"epoch": 2.359520639147803,
"grad_norm": 0.21968916065746072,
"learning_rate": 1.1758893280632411e-05,
"loss": 0.2635,
"step": 887
},
{
"epoch": 2.3621837549933424,
"grad_norm": 0.22188429396465353,
"learning_rate": 1.1709486166007906e-05,
"loss": 0.2729,
"step": 888
},
{
"epoch": 2.3648468708388815,
"grad_norm": 0.21019336767245783,
"learning_rate": 1.16600790513834e-05,
"loss": 0.2773,
"step": 889
},
{
"epoch": 2.367509986684421,
"grad_norm": 0.22711608967366953,
"learning_rate": 1.1610671936758893e-05,
"loss": 0.2714,
"step": 890
},
{
"epoch": 2.37017310252996,
"grad_norm": 0.2226773168313416,
"learning_rate": 1.1561264822134387e-05,
"loss": 0.264,
"step": 891
},
{
"epoch": 2.3728362183754994,
"grad_norm": 0.21211073663718902,
"learning_rate": 1.1511857707509881e-05,
"loss": 0.2623,
"step": 892
},
{
"epoch": 2.3754993342210384,
"grad_norm": 0.22155796804883984,
"learning_rate": 1.1462450592885376e-05,
"loss": 0.2786,
"step": 893
},
{
"epoch": 2.378162450066578,
"grad_norm": 0.21152104541352987,
"learning_rate": 1.141304347826087e-05,
"loss": 0.2754,
"step": 894
},
{
"epoch": 2.3808255659121174,
"grad_norm": 0.2436663825711812,
"learning_rate": 1.1363636363636365e-05,
"loss": 0.2646,
"step": 895
},
{
"epoch": 2.3834886817576564,
"grad_norm": 0.253729858596224,
"learning_rate": 1.1314229249011857e-05,
"loss": 0.2815,
"step": 896
},
{
"epoch": 2.386151797603196,
"grad_norm": 0.20642310572208497,
"learning_rate": 1.1264822134387352e-05,
"loss": 0.272,
"step": 897
},
{
"epoch": 2.388814913448735,
"grad_norm": 0.22828401957220001,
"learning_rate": 1.1215415019762846e-05,
"loss": 0.2679,
"step": 898
},
{
"epoch": 2.3914780292942743,
"grad_norm": 0.2226863403827293,
"learning_rate": 1.116600790513834e-05,
"loss": 0.2783,
"step": 899
},
{
"epoch": 2.3941411451398134,
"grad_norm": 0.2380848377629423,
"learning_rate": 1.1116600790513835e-05,
"loss": 0.2688,
"step": 900
},
{
"epoch": 2.396804260985353,
"grad_norm": 0.23278674245520006,
"learning_rate": 1.106719367588933e-05,
"loss": 0.271,
"step": 901
},
{
"epoch": 2.3994673768308923,
"grad_norm": 0.20419629462602493,
"learning_rate": 1.1017786561264824e-05,
"loss": 0.265,
"step": 902
},
{
"epoch": 2.4021304926764313,
"grad_norm": 0.2390569872958442,
"learning_rate": 1.0968379446640318e-05,
"loss": 0.2638,
"step": 903
},
{
"epoch": 2.4047936085219708,
"grad_norm": 0.2279702813171203,
"learning_rate": 1.091897233201581e-05,
"loss": 0.2428,
"step": 904
},
{
"epoch": 2.40745672436751,
"grad_norm": 0.21319204168497982,
"learning_rate": 1.0869565217391305e-05,
"loss": 0.2738,
"step": 905
},
{
"epoch": 2.4101198402130493,
"grad_norm": 0.21016965126306628,
"learning_rate": 1.08201581027668e-05,
"loss": 0.2591,
"step": 906
},
{
"epoch": 2.4127829560585887,
"grad_norm": 0.23241828917431315,
"learning_rate": 1.0770750988142294e-05,
"loss": 0.2691,
"step": 907
},
{
"epoch": 2.4154460719041277,
"grad_norm": 0.2508034557509808,
"learning_rate": 1.0721343873517788e-05,
"loss": 0.2716,
"step": 908
},
{
"epoch": 2.418109187749667,
"grad_norm": 0.24987214036836988,
"learning_rate": 1.0671936758893281e-05,
"loss": 0.2578,
"step": 909
},
{
"epoch": 2.4207723035952062,
"grad_norm": 0.2380445170755529,
"learning_rate": 1.0622529644268775e-05,
"loss": 0.264,
"step": 910
},
{
"epoch": 2.4234354194407457,
"grad_norm": 0.2201379804572699,
"learning_rate": 1.057312252964427e-05,
"loss": 0.274,
"step": 911
},
{
"epoch": 2.426098535286285,
"grad_norm": 0.250942745509917,
"learning_rate": 1.0523715415019764e-05,
"loss": 0.2648,
"step": 912
},
{
"epoch": 2.428761651131824,
"grad_norm": 0.22995097984900165,
"learning_rate": 1.0474308300395258e-05,
"loss": 0.2647,
"step": 913
},
{
"epoch": 2.4314247669773636,
"grad_norm": 0.23698141688133578,
"learning_rate": 1.0424901185770751e-05,
"loss": 0.2737,
"step": 914
},
{
"epoch": 2.4340878828229027,
"grad_norm": 0.21803776160842997,
"learning_rate": 1.0375494071146246e-05,
"loss": 0.272,
"step": 915
},
{
"epoch": 2.436750998668442,
"grad_norm": 0.24131490172282968,
"learning_rate": 1.032608695652174e-05,
"loss": 0.2695,
"step": 916
},
{
"epoch": 2.4394141145139816,
"grad_norm": 0.21919070590537304,
"learning_rate": 1.0276679841897234e-05,
"loss": 0.2642,
"step": 917
},
{
"epoch": 2.4420772303595206,
"grad_norm": 0.22130430229063322,
"learning_rate": 1.0227272727272729e-05,
"loss": 0.2648,
"step": 918
},
{
"epoch": 2.44474034620506,
"grad_norm": 0.2207950484316367,
"learning_rate": 1.0177865612648221e-05,
"loss": 0.2661,
"step": 919
},
{
"epoch": 2.447403462050599,
"grad_norm": 0.21836484864507769,
"learning_rate": 1.0128458498023716e-05,
"loss": 0.2674,
"step": 920
},
{
"epoch": 2.4500665778961386,
"grad_norm": 0.20744131254325618,
"learning_rate": 1.007905138339921e-05,
"loss": 0.2541,
"step": 921
},
{
"epoch": 2.4527296937416776,
"grad_norm": 0.21453675745306103,
"learning_rate": 1.0029644268774705e-05,
"loss": 0.2739,
"step": 922
},
{
"epoch": 2.455392809587217,
"grad_norm": 0.20834291358903456,
"learning_rate": 9.980237154150199e-06,
"loss": 0.2683,
"step": 923
},
{
"epoch": 2.458055925432756,
"grad_norm": 0.2218801415090961,
"learning_rate": 9.930830039525692e-06,
"loss": 0.2725,
"step": 924
},
{
"epoch": 2.4607190412782955,
"grad_norm": 0.22892525986093554,
"learning_rate": 9.881422924901186e-06,
"loss": 0.2736,
"step": 925
},
{
"epoch": 2.463382157123835,
"grad_norm": 0.21019735025511882,
"learning_rate": 9.83201581027668e-06,
"loss": 0.2667,
"step": 926
},
{
"epoch": 2.466045272969374,
"grad_norm": 0.22029826331712365,
"learning_rate": 9.782608695652175e-06,
"loss": 0.2685,
"step": 927
},
{
"epoch": 2.4687083888149135,
"grad_norm": 0.2048436758988922,
"learning_rate": 9.733201581027667e-06,
"loss": 0.2675,
"step": 928
},
{
"epoch": 2.4713715046604525,
"grad_norm": 0.22910504440789492,
"learning_rate": 9.683794466403162e-06,
"loss": 0.2769,
"step": 929
},
{
"epoch": 2.474034620505992,
"grad_norm": 0.22852762946943356,
"learning_rate": 9.634387351778656e-06,
"loss": 0.2834,
"step": 930
},
{
"epoch": 2.4766977363515315,
"grad_norm": 0.21897574663546826,
"learning_rate": 9.58498023715415e-06,
"loss": 0.2778,
"step": 931
},
{
"epoch": 2.4793608521970705,
"grad_norm": 0.2050794319936511,
"learning_rate": 9.535573122529645e-06,
"loss": 0.2715,
"step": 932
},
{
"epoch": 2.48202396804261,
"grad_norm": 0.21728652059101256,
"learning_rate": 9.486166007905138e-06,
"loss": 0.259,
"step": 933
},
{
"epoch": 2.484687083888149,
"grad_norm": 0.22845416533089977,
"learning_rate": 9.436758893280632e-06,
"loss": 0.2761,
"step": 934
},
{
"epoch": 2.4873501997336884,
"grad_norm": 0.21231590297088435,
"learning_rate": 9.387351778656126e-06,
"loss": 0.2677,
"step": 935
},
{
"epoch": 2.490013315579228,
"grad_norm": 0.1926400508160791,
"learning_rate": 9.33794466403162e-06,
"loss": 0.2575,
"step": 936
},
{
"epoch": 2.492676431424767,
"grad_norm": 0.22996010092008873,
"learning_rate": 9.288537549407115e-06,
"loss": 0.2548,
"step": 937
},
{
"epoch": 2.4953395472703064,
"grad_norm": 0.22619760087939098,
"learning_rate": 9.239130434782608e-06,
"loss": 0.2676,
"step": 938
},
{
"epoch": 2.4980026631158454,
"grad_norm": 0.20946128187824178,
"learning_rate": 9.189723320158102e-06,
"loss": 0.2649,
"step": 939
},
{
"epoch": 2.500665778961385,
"grad_norm": 0.21291901939824368,
"learning_rate": 9.140316205533597e-06,
"loss": 0.2794,
"step": 940
},
{
"epoch": 2.5033288948069243,
"grad_norm": 0.23983156472432737,
"learning_rate": 9.090909090909091e-06,
"loss": 0.2612,
"step": 941
},
{
"epoch": 2.5059920106524634,
"grad_norm": 0.21371558486466197,
"learning_rate": 9.041501976284585e-06,
"loss": 0.2715,
"step": 942
},
{
"epoch": 2.508655126498003,
"grad_norm": 0.20948609220977954,
"learning_rate": 8.992094861660078e-06,
"loss": 0.2685,
"step": 943
},
{
"epoch": 2.511318242343542,
"grad_norm": 0.20326902436416877,
"learning_rate": 8.942687747035572e-06,
"loss": 0.2646,
"step": 944
},
{
"epoch": 2.5139813581890813,
"grad_norm": 0.20716732265525145,
"learning_rate": 8.893280632411067e-06,
"loss": 0.2624,
"step": 945
},
{
"epoch": 2.5166444740346208,
"grad_norm": 0.21310454845084212,
"learning_rate": 8.843873517786561e-06,
"loss": 0.2666,
"step": 946
},
{
"epoch": 2.51930758988016,
"grad_norm": 0.2356341947109539,
"learning_rate": 8.794466403162056e-06,
"loss": 0.2607,
"step": 947
},
{
"epoch": 2.521970705725699,
"grad_norm": 0.206705458805249,
"learning_rate": 8.74505928853755e-06,
"loss": 0.2765,
"step": 948
},
{
"epoch": 2.5246338215712383,
"grad_norm": 0.19941570008688478,
"learning_rate": 8.695652173913044e-06,
"loss": 0.2774,
"step": 949
},
{
"epoch": 2.5272969374167777,
"grad_norm": 0.22426207188439748,
"learning_rate": 8.646245059288539e-06,
"loss": 0.2829,
"step": 950
},
{
"epoch": 2.5299600532623168,
"grad_norm": 0.25258528314600287,
"learning_rate": 8.596837944664033e-06,
"loss": 0.2646,
"step": 951
},
{
"epoch": 2.5326231691078562,
"grad_norm": 0.2145489784213885,
"learning_rate": 8.547430830039528e-06,
"loss": 0.2607,
"step": 952
},
{
"epoch": 2.5352862849533953,
"grad_norm": 0.19599385905462602,
"learning_rate": 8.49802371541502e-06,
"loss": 0.2543,
"step": 953
},
{
"epoch": 2.5379494007989347,
"grad_norm": 0.2480014218006241,
"learning_rate": 8.448616600790515e-06,
"loss": 0.2689,
"step": 954
},
{
"epoch": 2.540612516644474,
"grad_norm": 0.24788509439736134,
"learning_rate": 8.399209486166009e-06,
"loss": 0.2725,
"step": 955
},
{
"epoch": 2.543275632490013,
"grad_norm": 0.2267111546180155,
"learning_rate": 8.349802371541503e-06,
"loss": 0.2635,
"step": 956
},
{
"epoch": 2.5459387483355527,
"grad_norm": 0.21182851928367047,
"learning_rate": 8.300395256916998e-06,
"loss": 0.2638,
"step": 957
},
{
"epoch": 2.5486018641810917,
"grad_norm": 0.21455676194315262,
"learning_rate": 8.25098814229249e-06,
"loss": 0.2585,
"step": 958
},
{
"epoch": 2.551264980026631,
"grad_norm": 0.2169073571862216,
"learning_rate": 8.201581027667985e-06,
"loss": 0.2617,
"step": 959
},
{
"epoch": 2.5539280958721706,
"grad_norm": 0.22625888751011447,
"learning_rate": 8.15217391304348e-06,
"loss": 0.271,
"step": 960
},
{
"epoch": 2.5565912117177096,
"grad_norm": 0.20470193896466704,
"learning_rate": 8.102766798418974e-06,
"loss": 0.2662,
"step": 961
},
{
"epoch": 2.559254327563249,
"grad_norm": 0.21322007235950363,
"learning_rate": 8.053359683794468e-06,
"loss": 0.2556,
"step": 962
},
{
"epoch": 2.561917443408788,
"grad_norm": 0.20150617925679104,
"learning_rate": 8.00395256916996e-06,
"loss": 0.2582,
"step": 963
},
{
"epoch": 2.5645805592543276,
"grad_norm": 0.2286944491087834,
"learning_rate": 7.954545454545455e-06,
"loss": 0.2661,
"step": 964
},
{
"epoch": 2.567243675099867,
"grad_norm": 0.20708520844073464,
"learning_rate": 7.90513833992095e-06,
"loss": 0.2625,
"step": 965
},
{
"epoch": 2.569906790945406,
"grad_norm": 0.1993453778786671,
"learning_rate": 7.855731225296444e-06,
"loss": 0.2684,
"step": 966
},
{
"epoch": 2.5725699067909455,
"grad_norm": 0.19939625758599083,
"learning_rate": 7.806324110671937e-06,
"loss": 0.2658,
"step": 967
},
{
"epoch": 2.5752330226364846,
"grad_norm": 0.20007029899978518,
"learning_rate": 7.756916996047431e-06,
"loss": 0.2612,
"step": 968
},
{
"epoch": 2.577896138482024,
"grad_norm": 0.20768490453881108,
"learning_rate": 7.707509881422925e-06,
"loss": 0.2671,
"step": 969
},
{
"epoch": 2.5805592543275635,
"grad_norm": 0.21354810130953325,
"learning_rate": 7.65810276679842e-06,
"loss": 0.2578,
"step": 970
},
{
"epoch": 2.5832223701731025,
"grad_norm": 0.23174711166338519,
"learning_rate": 7.608695652173914e-06,
"loss": 0.2715,
"step": 971
},
{
"epoch": 2.5858854860186415,
"grad_norm": 0.21079000224350897,
"learning_rate": 7.559288537549407e-06,
"loss": 0.2658,
"step": 972
},
{
"epoch": 2.588548601864181,
"grad_norm": 0.2001035421079937,
"learning_rate": 7.509881422924901e-06,
"loss": 0.2569,
"step": 973
},
{
"epoch": 2.5912117177097205,
"grad_norm": 0.2021065412071498,
"learning_rate": 7.4604743083003955e-06,
"loss": 0.2608,
"step": 974
},
{
"epoch": 2.5938748335552595,
"grad_norm": 0.214158147452307,
"learning_rate": 7.41106719367589e-06,
"loss": 0.2779,
"step": 975
},
{
"epoch": 2.596537949400799,
"grad_norm": 0.20790431049928293,
"learning_rate": 7.361660079051384e-06,
"loss": 0.2733,
"step": 976
},
{
"epoch": 2.599201065246338,
"grad_norm": 0.20549750329181854,
"learning_rate": 7.312252964426877e-06,
"loss": 0.276,
"step": 977
},
{
"epoch": 2.6018641810918774,
"grad_norm": 0.20237657523764993,
"learning_rate": 7.262845849802371e-06,
"loss": 0.2735,
"step": 978
},
{
"epoch": 2.604527296937417,
"grad_norm": 0.20973877300015645,
"learning_rate": 7.213438735177866e-06,
"loss": 0.281,
"step": 979
},
{
"epoch": 2.607190412782956,
"grad_norm": 0.22017905718680691,
"learning_rate": 7.16403162055336e-06,
"loss": 0.2677,
"step": 980
},
{
"epoch": 2.6098535286284954,
"grad_norm": 0.2144342458050631,
"learning_rate": 7.1146245059288545e-06,
"loss": 0.2604,
"step": 981
},
{
"epoch": 2.6125166444740344,
"grad_norm": 0.2050156532271564,
"learning_rate": 7.065217391304347e-06,
"loss": 0.2701,
"step": 982
},
{
"epoch": 2.615179760319574,
"grad_norm": 0.1970203183942734,
"learning_rate": 7.015810276679842e-06,
"loss": 0.2505,
"step": 983
},
{
"epoch": 2.6178428761651134,
"grad_norm": 0.20402269570746995,
"learning_rate": 6.966403162055336e-06,
"loss": 0.2599,
"step": 984
},
{
"epoch": 2.6205059920106524,
"grad_norm": 0.20759868626386915,
"learning_rate": 6.91699604743083e-06,
"loss": 0.2733,
"step": 985
},
{
"epoch": 2.623169107856192,
"grad_norm": 0.22693920517209076,
"learning_rate": 6.867588932806325e-06,
"loss": 0.2627,
"step": 986
},
{
"epoch": 2.625832223701731,
"grad_norm": 0.20970122945185465,
"learning_rate": 6.818181818181818e-06,
"loss": 0.2704,
"step": 987
},
{
"epoch": 2.6284953395472703,
"grad_norm": 0.20332704992870704,
"learning_rate": 6.768774703557313e-06,
"loss": 0.2762,
"step": 988
},
{
"epoch": 2.63115845539281,
"grad_norm": 0.20966961639828544,
"learning_rate": 6.719367588932807e-06,
"loss": 0.2737,
"step": 989
},
{
"epoch": 2.633821571238349,
"grad_norm": 0.2392085498215163,
"learning_rate": 6.6699604743083014e-06,
"loss": 0.2639,
"step": 990
},
{
"epoch": 2.6364846870838883,
"grad_norm": 0.22069815282030755,
"learning_rate": 6.620553359683796e-06,
"loss": 0.2623,
"step": 991
},
{
"epoch": 2.6391478029294273,
"grad_norm": 0.2062130093620195,
"learning_rate": 6.5711462450592885e-06,
"loss": 0.2634,
"step": 992
},
{
"epoch": 2.6418109187749668,
"grad_norm": 0.21202212454473487,
"learning_rate": 6.521739130434783e-06,
"loss": 0.2732,
"step": 993
},
{
"epoch": 2.6444740346205062,
"grad_norm": 0.20742438691074003,
"learning_rate": 6.472332015810277e-06,
"loss": 0.2775,
"step": 994
},
{
"epoch": 2.6471371504660453,
"grad_norm": 0.20539419758832048,
"learning_rate": 6.422924901185772e-06,
"loss": 0.2786,
"step": 995
},
{
"epoch": 2.6498002663115847,
"grad_norm": 0.19871961616535505,
"learning_rate": 6.373517786561266e-06,
"loss": 0.2642,
"step": 996
},
{
"epoch": 2.6524633821571237,
"grad_norm": 0.2445459224085182,
"learning_rate": 6.324110671936759e-06,
"loss": 0.271,
"step": 997
},
{
"epoch": 2.655126498002663,
"grad_norm": 0.20294635449003665,
"learning_rate": 6.274703557312253e-06,
"loss": 0.272,
"step": 998
},
{
"epoch": 2.6577896138482027,
"grad_norm": 0.20711520929552674,
"learning_rate": 6.2252964426877475e-06,
"loss": 0.277,
"step": 999
},
{
"epoch": 2.6604527296937417,
"grad_norm": 0.19858451035812705,
"learning_rate": 6.175889328063241e-06,
"loss": 0.2781,
"step": 1000
},
{
"epoch": 2.6631158455392807,
"grad_norm": 0.2029933078164672,
"learning_rate": 6.126482213438735e-06,
"loss": 0.259,
"step": 1001
},
{
"epoch": 2.66577896138482,
"grad_norm": 0.21745287030160018,
"learning_rate": 6.07707509881423e-06,
"loss": 0.27,
"step": 1002
},
{
"epoch": 2.6684420772303596,
"grad_norm": 0.19345167090566057,
"learning_rate": 6.027667984189723e-06,
"loss": 0.268,
"step": 1003
},
{
"epoch": 2.6711051930758987,
"grad_norm": 0.21568939666641776,
"learning_rate": 5.978260869565218e-06,
"loss": 0.2643,
"step": 1004
},
{
"epoch": 2.673768308921438,
"grad_norm": 0.19296044607870885,
"learning_rate": 5.928853754940711e-06,
"loss": 0.2761,
"step": 1005
},
{
"epoch": 2.676431424766977,
"grad_norm": 0.20181257150105722,
"learning_rate": 5.879446640316206e-06,
"loss": 0.271,
"step": 1006
},
{
"epoch": 2.6790945406125166,
"grad_norm": 0.2073838164023787,
"learning_rate": 5.8300395256917e-06,
"loss": 0.2713,
"step": 1007
},
{
"epoch": 2.681757656458056,
"grad_norm": 0.20965825745167907,
"learning_rate": 5.7806324110671936e-06,
"loss": 0.2689,
"step": 1008
},
{
"epoch": 2.684420772303595,
"grad_norm": 0.20444583357709556,
"learning_rate": 5.731225296442688e-06,
"loss": 0.2831,
"step": 1009
},
{
"epoch": 2.6870838881491346,
"grad_norm": 0.20971896583727812,
"learning_rate": 5.681818181818182e-06,
"loss": 0.2626,
"step": 1010
},
{
"epoch": 2.6897470039946736,
"grad_norm": 0.2080555215910288,
"learning_rate": 5.632411067193676e-06,
"loss": 0.2602,
"step": 1011
},
{
"epoch": 2.692410119840213,
"grad_norm": 0.2013420667078693,
"learning_rate": 5.58300395256917e-06,
"loss": 0.2653,
"step": 1012
},
{
"epoch": 2.6950732356857525,
"grad_norm": 0.19614771328643982,
"learning_rate": 5.533596837944665e-06,
"loss": 0.2556,
"step": 1013
},
{
"epoch": 2.6977363515312915,
"grad_norm": 0.20085761642467498,
"learning_rate": 5.484189723320159e-06,
"loss": 0.2744,
"step": 1014
},
{
"epoch": 2.700399467376831,
"grad_norm": 0.21544774180757933,
"learning_rate": 5.4347826086956525e-06,
"loss": 0.2602,
"step": 1015
},
{
"epoch": 2.70306258322237,
"grad_norm": 0.19696825099825307,
"learning_rate": 5.385375494071147e-06,
"loss": 0.2595,
"step": 1016
},
{
"epoch": 2.7057256990679095,
"grad_norm": 0.1924176776922604,
"learning_rate": 5.3359683794466405e-06,
"loss": 0.2619,
"step": 1017
},
{
"epoch": 2.708388814913449,
"grad_norm": 0.22132480166121332,
"learning_rate": 5.286561264822135e-06,
"loss": 0.2697,
"step": 1018
},
{
"epoch": 2.711051930758988,
"grad_norm": 0.18691262036412767,
"learning_rate": 5.237154150197629e-06,
"loss": 0.2554,
"step": 1019
},
{
"epoch": 2.7137150466045274,
"grad_norm": 0.1938229034237995,
"learning_rate": 5.187747035573123e-06,
"loss": 0.2586,
"step": 1020
},
{
"epoch": 2.7163781624500665,
"grad_norm": 0.2129748283287826,
"learning_rate": 5.138339920948617e-06,
"loss": 0.2795,
"step": 1021
},
{
"epoch": 2.719041278295606,
"grad_norm": 0.20445583537089335,
"learning_rate": 5.088932806324111e-06,
"loss": 0.2658,
"step": 1022
},
{
"epoch": 2.7217043941411454,
"grad_norm": 0.1933528504807178,
"learning_rate": 5.039525691699605e-06,
"loss": 0.2621,
"step": 1023
},
{
"epoch": 2.7243675099866844,
"grad_norm": 0.21949852883334098,
"learning_rate": 4.9901185770750995e-06,
"loss": 0.2649,
"step": 1024
},
{
"epoch": 2.7270306258322234,
"grad_norm": 0.20152020359649447,
"learning_rate": 4.940711462450593e-06,
"loss": 0.265,
"step": 1025
},
{
"epoch": 2.729693741677763,
"grad_norm": 0.20583564086259545,
"learning_rate": 4.891304347826087e-06,
"loss": 0.2619,
"step": 1026
},
{
"epoch": 2.7323568575233024,
"grad_norm": 0.2007179587300372,
"learning_rate": 4.841897233201581e-06,
"loss": 0.2693,
"step": 1027
},
{
"epoch": 2.7350199733688414,
"grad_norm": 0.1998685679119499,
"learning_rate": 4.792490118577075e-06,
"loss": 0.2629,
"step": 1028
},
{
"epoch": 2.737683089214381,
"grad_norm": 0.21626697273734094,
"learning_rate": 4.743083003952569e-06,
"loss": 0.269,
"step": 1029
},
{
"epoch": 2.74034620505992,
"grad_norm": 0.19448387232242922,
"learning_rate": 4.693675889328063e-06,
"loss": 0.2761,
"step": 1030
},
{
"epoch": 2.7430093209054593,
"grad_norm": 0.19395208512967949,
"learning_rate": 4.644268774703558e-06,
"loss": 0.2653,
"step": 1031
},
{
"epoch": 2.745672436750999,
"grad_norm": 0.18925291663752578,
"learning_rate": 4.594861660079051e-06,
"loss": 0.2568,
"step": 1032
},
{
"epoch": 2.748335552596538,
"grad_norm": 0.20842012726728598,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.2689,
"step": 1033
},
{
"epoch": 2.7509986684420773,
"grad_norm": 0.20399895934870427,
"learning_rate": 4.496047430830039e-06,
"loss": 0.262,
"step": 1034
},
{
"epoch": 2.7536617842876163,
"grad_norm": 0.21638718896911208,
"learning_rate": 4.4466403162055334e-06,
"loss": 0.2589,
"step": 1035
},
{
"epoch": 2.756324900133156,
"grad_norm": 0.19757801710020018,
"learning_rate": 4.397233201581028e-06,
"loss": 0.2575,
"step": 1036
},
{
"epoch": 2.7589880159786953,
"grad_norm": 0.1930523815662032,
"learning_rate": 4.347826086956522e-06,
"loss": 0.2589,
"step": 1037
},
{
"epoch": 2.7616511318242343,
"grad_norm": 0.20093506678059855,
"learning_rate": 4.298418972332017e-06,
"loss": 0.2686,
"step": 1038
},
{
"epoch": 2.7643142476697737,
"grad_norm": 0.20051627815913756,
"learning_rate": 4.24901185770751e-06,
"loss": 0.2709,
"step": 1039
},
{
"epoch": 2.7669773635153128,
"grad_norm": 0.196594765327016,
"learning_rate": 4.1996047430830045e-06,
"loss": 0.2617,
"step": 1040
},
{
"epoch": 2.7696404793608522,
"grad_norm": 0.19314366189878793,
"learning_rate": 4.150197628458499e-06,
"loss": 0.2851,
"step": 1041
},
{
"epoch": 2.7723035952063917,
"grad_norm": 0.2161802526854043,
"learning_rate": 4.1007905138339924e-06,
"loss": 0.2674,
"step": 1042
},
{
"epoch": 2.7749667110519307,
"grad_norm": 0.18272700852758644,
"learning_rate": 4.051383399209487e-06,
"loss": 0.2523,
"step": 1043
},
{
"epoch": 2.77762982689747,
"grad_norm": 0.1914267001454524,
"learning_rate": 4.00197628458498e-06,
"loss": 0.271,
"step": 1044
},
{
"epoch": 2.780292942743009,
"grad_norm": 0.20563053341844564,
"learning_rate": 3.952569169960475e-06,
"loss": 0.2588,
"step": 1045
},
{
"epoch": 2.7829560585885487,
"grad_norm": 0.19474283827667518,
"learning_rate": 3.903162055335968e-06,
"loss": 0.259,
"step": 1046
},
{
"epoch": 2.785619174434088,
"grad_norm": 0.199541546086498,
"learning_rate": 3.853754940711463e-06,
"loss": 0.2766,
"step": 1047
},
{
"epoch": 2.788282290279627,
"grad_norm": 0.1962650749461456,
"learning_rate": 3.804347826086957e-06,
"loss": 0.275,
"step": 1048
},
{
"epoch": 2.790945406125166,
"grad_norm": 0.19771877806493995,
"learning_rate": 3.7549407114624506e-06,
"loss": 0.2651,
"step": 1049
},
{
"epoch": 2.7936085219707056,
"grad_norm": 0.25769379294942607,
"learning_rate": 3.705533596837945e-06,
"loss": 0.2792,
"step": 1050
},
{
"epoch": 2.796271637816245,
"grad_norm": 0.2095398170946154,
"learning_rate": 3.6561264822134385e-06,
"loss": 0.2671,
"step": 1051
},
{
"epoch": 2.798934753661784,
"grad_norm": 0.1929871299001819,
"learning_rate": 3.606719367588933e-06,
"loss": 0.2571,
"step": 1052
},
{
"epoch": 2.8015978695073236,
"grad_norm": 0.19854196709504868,
"learning_rate": 3.5573122529644273e-06,
"loss": 0.2734,
"step": 1053
},
{
"epoch": 2.8042609853528626,
"grad_norm": 0.20342959087962045,
"learning_rate": 3.507905138339921e-06,
"loss": 0.2675,
"step": 1054
},
{
"epoch": 2.806924101198402,
"grad_norm": 0.19566813473730155,
"learning_rate": 3.458498023715415e-06,
"loss": 0.2636,
"step": 1055
},
{
"epoch": 2.8095872170439415,
"grad_norm": 0.19394868609732532,
"learning_rate": 3.409090909090909e-06,
"loss": 0.2582,
"step": 1056
},
{
"epoch": 2.8122503328894806,
"grad_norm": 0.19315741666740258,
"learning_rate": 3.3596837944664035e-06,
"loss": 0.2744,
"step": 1057
},
{
"epoch": 2.81491344873502,
"grad_norm": 0.19500591092508857,
"learning_rate": 3.310276679841898e-06,
"loss": 0.2664,
"step": 1058
},
{
"epoch": 2.817576564580559,
"grad_norm": 0.20369824754516933,
"learning_rate": 3.2608695652173914e-06,
"loss": 0.2753,
"step": 1059
},
{
"epoch": 2.8202396804260985,
"grad_norm": 0.21679482311751339,
"learning_rate": 3.211462450592886e-06,
"loss": 0.2668,
"step": 1060
},
{
"epoch": 2.822902796271638,
"grad_norm": 0.19207166020188257,
"learning_rate": 3.1620553359683794e-06,
"loss": 0.2714,
"step": 1061
},
{
"epoch": 2.825565912117177,
"grad_norm": 0.18576307265975345,
"learning_rate": 3.1126482213438737e-06,
"loss": 0.2641,
"step": 1062
},
{
"epoch": 2.8282290279627165,
"grad_norm": 0.19216814433561258,
"learning_rate": 3.0632411067193677e-06,
"loss": 0.2686,
"step": 1063
},
{
"epoch": 2.8308921438082555,
"grad_norm": 0.20223820044568933,
"learning_rate": 3.0138339920948617e-06,
"loss": 0.2681,
"step": 1064
},
{
"epoch": 2.833555259653795,
"grad_norm": 0.2025068882484355,
"learning_rate": 2.9644268774703556e-06,
"loss": 0.2671,
"step": 1065
},
{
"epoch": 2.8362183754993344,
"grad_norm": 0.19192928047405172,
"learning_rate": 2.91501976284585e-06,
"loss": 0.2738,
"step": 1066
},
{
"epoch": 2.8388814913448734,
"grad_norm": 0.18813387022576608,
"learning_rate": 2.865612648221344e-06,
"loss": 0.2555,
"step": 1067
},
{
"epoch": 2.841544607190413,
"grad_norm": 0.17981642336035955,
"learning_rate": 2.816205533596838e-06,
"loss": 0.2649,
"step": 1068
},
{
"epoch": 2.844207723035952,
"grad_norm": 0.19082585501925517,
"learning_rate": 2.7667984189723323e-06,
"loss": 0.2717,
"step": 1069
},
{
"epoch": 2.8468708388814914,
"grad_norm": 0.1934715160744257,
"learning_rate": 2.7173913043478263e-06,
"loss": 0.2588,
"step": 1070
},
{
"epoch": 2.849533954727031,
"grad_norm": 0.1943027368827162,
"learning_rate": 2.6679841897233202e-06,
"loss": 0.2612,
"step": 1071
},
{
"epoch": 2.85219707057257,
"grad_norm": 0.20463059754180915,
"learning_rate": 2.6185770750988146e-06,
"loss": 0.2654,
"step": 1072
},
{
"epoch": 2.8548601864181093,
"grad_norm": 0.21078399413940485,
"learning_rate": 2.5691699604743086e-06,
"loss": 0.2671,
"step": 1073
},
{
"epoch": 2.8575233022636484,
"grad_norm": 0.20725181291345451,
"learning_rate": 2.5197628458498025e-06,
"loss": 0.2658,
"step": 1074
},
{
"epoch": 2.860186418109188,
"grad_norm": 0.19210859826009163,
"learning_rate": 2.4703557312252965e-06,
"loss": 0.2749,
"step": 1075
},
{
"epoch": 2.8628495339547273,
"grad_norm": 0.21087703729971102,
"learning_rate": 2.4209486166007905e-06,
"loss": 0.2565,
"step": 1076
},
{
"epoch": 2.8655126498002663,
"grad_norm": 0.1932869202958659,
"learning_rate": 2.3715415019762844e-06,
"loss": 0.2761,
"step": 1077
},
{
"epoch": 2.8681757656458053,
"grad_norm": 0.212098446975856,
"learning_rate": 2.322134387351779e-06,
"loss": 0.2739,
"step": 1078
},
{
"epoch": 2.870838881491345,
"grad_norm": 0.1907847773078055,
"learning_rate": 2.2727272727272728e-06,
"loss": 0.2645,
"step": 1079
},
{
"epoch": 2.8735019973368843,
"grad_norm": 0.2138904901003034,
"learning_rate": 2.2233201581027667e-06,
"loss": 0.266,
"step": 1080
},
{
"epoch": 2.8761651131824233,
"grad_norm": 0.19201411133409543,
"learning_rate": 2.173913043478261e-06,
"loss": 0.2589,
"step": 1081
},
{
"epoch": 2.8788282290279628,
"grad_norm": 0.1845739978063396,
"learning_rate": 2.124505928853755e-06,
"loss": 0.2597,
"step": 1082
},
{
"epoch": 2.881491344873502,
"grad_norm": 0.19569151053283082,
"learning_rate": 2.0750988142292494e-06,
"loss": 0.2617,
"step": 1083
},
{
"epoch": 2.8841544607190412,
"grad_norm": 0.19194512760322638,
"learning_rate": 2.0256916996047434e-06,
"loss": 0.2741,
"step": 1084
},
{
"epoch": 2.8868175765645807,
"grad_norm": 0.19164700223613637,
"learning_rate": 1.9762845849802374e-06,
"loss": 0.2557,
"step": 1085
},
{
"epoch": 2.8894806924101197,
"grad_norm": 0.20722349213232807,
"learning_rate": 1.9268774703557313e-06,
"loss": 0.2811,
"step": 1086
},
{
"epoch": 2.892143808255659,
"grad_norm": 0.21395903599582983,
"learning_rate": 1.8774703557312253e-06,
"loss": 0.2697,
"step": 1087
},
{
"epoch": 2.894806924101198,
"grad_norm": 0.19932722434475636,
"learning_rate": 1.8280632411067192e-06,
"loss": 0.2848,
"step": 1088
},
{
"epoch": 2.8974700399467377,
"grad_norm": 0.19719366657115883,
"learning_rate": 1.7786561264822136e-06,
"loss": 0.2715,
"step": 1089
},
{
"epoch": 2.900133155792277,
"grad_norm": 0.1975588211380889,
"learning_rate": 1.7292490118577076e-06,
"loss": 0.263,
"step": 1090
},
{
"epoch": 2.902796271637816,
"grad_norm": 0.1939515446139924,
"learning_rate": 1.6798418972332018e-06,
"loss": 0.2576,
"step": 1091
},
{
"epoch": 2.9054593874833556,
"grad_norm": 0.21461670844381095,
"learning_rate": 1.6304347826086957e-06,
"loss": 0.2622,
"step": 1092
},
{
"epoch": 2.9081225033288947,
"grad_norm": 0.18141714157708164,
"learning_rate": 1.5810276679841897e-06,
"loss": 0.2602,
"step": 1093
},
{
"epoch": 2.910785619174434,
"grad_norm": 0.18648909903146674,
"learning_rate": 1.5316205533596839e-06,
"loss": 0.2544,
"step": 1094
},
{
"epoch": 2.9134487350199736,
"grad_norm": 0.19749530453878072,
"learning_rate": 1.4822134387351778e-06,
"loss": 0.2511,
"step": 1095
},
{
"epoch": 2.9161118508655126,
"grad_norm": 0.2008025174676635,
"learning_rate": 1.432806324110672e-06,
"loss": 0.2621,
"step": 1096
},
{
"epoch": 2.918774966711052,
"grad_norm": 0.1926237458483956,
"learning_rate": 1.3833992094861662e-06,
"loss": 0.2584,
"step": 1097
},
{
"epoch": 2.921438082556591,
"grad_norm": 0.1917953810867646,
"learning_rate": 1.3339920948616601e-06,
"loss": 0.2696,
"step": 1098
},
{
"epoch": 2.9241011984021306,
"grad_norm": 0.18863387793323863,
"learning_rate": 1.2845849802371543e-06,
"loss": 0.269,
"step": 1099
},
{
"epoch": 2.92676431424767,
"grad_norm": 0.18859923936820897,
"learning_rate": 1.2351778656126482e-06,
"loss": 0.2629,
"step": 1100
},
{
"epoch": 2.929427430093209,
"grad_norm": 0.18918722042687142,
"learning_rate": 1.1857707509881422e-06,
"loss": 0.2659,
"step": 1101
},
{
"epoch": 2.932090545938748,
"grad_norm": 0.1909436486504395,
"learning_rate": 1.1363636363636364e-06,
"loss": 0.279,
"step": 1102
},
{
"epoch": 2.9347536617842875,
"grad_norm": 0.215394252478964,
"learning_rate": 1.0869565217391306e-06,
"loss": 0.2771,
"step": 1103
},
{
"epoch": 2.937416777629827,
"grad_norm": 0.1868050430391036,
"learning_rate": 1.0375494071146247e-06,
"loss": 0.255,
"step": 1104
},
{
"epoch": 2.940079893475366,
"grad_norm": 0.18705337019297927,
"learning_rate": 9.881422924901187e-07,
"loss": 0.2472,
"step": 1105
},
{
"epoch": 2.9427430093209055,
"grad_norm": 0.1935007995659731,
"learning_rate": 9.387351778656126e-07,
"loss": 0.2713,
"step": 1106
},
{
"epoch": 2.9454061251664445,
"grad_norm": 0.18412759277611498,
"learning_rate": 8.893280632411068e-07,
"loss": 0.2653,
"step": 1107
},
{
"epoch": 2.948069241011984,
"grad_norm": 0.18330377570006776,
"learning_rate": 8.399209486166009e-07,
"loss": 0.256,
"step": 1108
},
{
"epoch": 2.9507323568575234,
"grad_norm": 0.19950543771973236,
"learning_rate": 7.905138339920948e-07,
"loss": 0.2732,
"step": 1109
},
{
"epoch": 2.9533954727030625,
"grad_norm": 0.18701751210436693,
"learning_rate": 7.411067193675889e-07,
"loss": 0.2634,
"step": 1110
},
{
"epoch": 2.956058588548602,
"grad_norm": 0.18889807484399168,
"learning_rate": 6.916996047430831e-07,
"loss": 0.2519,
"step": 1111
},
{
"epoch": 2.958721704394141,
"grad_norm": 0.1898035633014786,
"learning_rate": 6.422924901185771e-07,
"loss": 0.2658,
"step": 1112
},
{
"epoch": 2.9613848202396804,
"grad_norm": 0.1864905294817814,
"learning_rate": 5.928853754940711e-07,
"loss": 0.2562,
"step": 1113
},
{
"epoch": 2.96404793608522,
"grad_norm": 0.18976880996630371,
"learning_rate": 5.434782608695653e-07,
"loss": 0.264,
"step": 1114
},
{
"epoch": 2.966711051930759,
"grad_norm": 0.19331420232956223,
"learning_rate": 4.940711462450593e-07,
"loss": 0.273,
"step": 1115
},
{
"epoch": 2.9693741677762984,
"grad_norm": 0.1930205378531215,
"learning_rate": 4.446640316205534e-07,
"loss": 0.2592,
"step": 1116
},
{
"epoch": 2.9720372836218374,
"grad_norm": 0.19028897264532088,
"learning_rate": 3.952569169960474e-07,
"loss": 0.2654,
"step": 1117
},
{
"epoch": 2.974700399467377,
"grad_norm": 0.19156481816748225,
"learning_rate": 3.4584980237154154e-07,
"loss": 0.261,
"step": 1118
},
{
"epoch": 2.9773635153129163,
"grad_norm": 0.1889476580235995,
"learning_rate": 2.9644268774703555e-07,
"loss": 0.2566,
"step": 1119
},
{
"epoch": 2.9800266311584553,
"grad_norm": 0.19663277621172817,
"learning_rate": 2.4703557312252967e-07,
"loss": 0.2751,
"step": 1120
},
{
"epoch": 2.982689747003995,
"grad_norm": 0.1848208372611624,
"learning_rate": 1.976284584980237e-07,
"loss": 0.2633,
"step": 1121
},
{
"epoch": 2.985352862849534,
"grad_norm": 0.18259691758877614,
"learning_rate": 1.4822134387351778e-07,
"loss": 0.2696,
"step": 1122
},
{
"epoch": 2.9880159786950733,
"grad_norm": 0.1849664900149779,
"learning_rate": 9.881422924901186e-08,
"loss": 0.2704,
"step": 1123
},
{
"epoch": 2.9906790945406128,
"grad_norm": 0.1854714711613864,
"learning_rate": 4.940711462450593e-08,
"loss": 0.2613,
"step": 1124
},
{
"epoch": 2.993342210386152,
"grad_norm": 0.18380044771707796,
"learning_rate": 0.0,
"loss": 0.2614,
"step": 1125
},
{
"epoch": 2.993342210386152,
"step": 1125,
"total_flos": 9.575573608085586e+17,
"train_loss": 0.4614936934842004,
"train_runtime": 99022.1208,
"train_samples_per_second": 0.182,
"train_steps_per_second": 0.011
}
],
"logging_steps": 1,
"max_steps": 1125,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.575573608085586e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}