{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1221,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002457002457002457,
"grad_norm": 51.00658947196253,
"learning_rate": 4.0650406504065046e-07,
"loss": 11.5201,
"step": 1
},
{
"epoch": 0.004914004914004914,
"grad_norm": 41.32182940742031,
"learning_rate": 8.130081300813009e-07,
"loss": 11.7707,
"step": 2
},
{
"epoch": 0.007371007371007371,
"grad_norm": 44.33185040770822,
"learning_rate": 1.2195121951219514e-06,
"loss": 11.6448,
"step": 3
},
{
"epoch": 0.009828009828009828,
"grad_norm": 42.42836066927598,
"learning_rate": 1.6260162601626018e-06,
"loss": 11.8086,
"step": 4
},
{
"epoch": 0.012285012285012284,
"grad_norm": 53.775532456381285,
"learning_rate": 2.0325203252032523e-06,
"loss": 11.3357,
"step": 5
},
{
"epoch": 0.014742014742014743,
"grad_norm": 47.62249981599493,
"learning_rate": 2.4390243902439027e-06,
"loss": 11.3818,
"step": 6
},
{
"epoch": 0.0171990171990172,
"grad_norm": 55.13732030606171,
"learning_rate": 2.8455284552845528e-06,
"loss": 11.142,
"step": 7
},
{
"epoch": 0.019656019656019656,
"grad_norm": 62.52065331239275,
"learning_rate": 3.2520325203252037e-06,
"loss": 10.5585,
"step": 8
},
{
"epoch": 0.022113022113022112,
"grad_norm": 69.61760079081881,
"learning_rate": 3.6585365853658537e-06,
"loss": 10.4944,
"step": 9
},
{
"epoch": 0.02457002457002457,
"grad_norm": 101.43566538305599,
"learning_rate": 4.0650406504065046e-06,
"loss": 9.3135,
"step": 10
},
{
"epoch": 0.02702702702702703,
"grad_norm": 78.43100915045316,
"learning_rate": 4.471544715447155e-06,
"loss": 5.2457,
"step": 11
},
{
"epoch": 0.029484029484029485,
"grad_norm": 73.16500005409208,
"learning_rate": 4.8780487804878055e-06,
"loss": 5.3303,
"step": 12
},
{
"epoch": 0.03194103194103194,
"grad_norm": 47.86656696315455,
"learning_rate": 5.2845528455284555e-06,
"loss": 3.6113,
"step": 13
},
{
"epoch": 0.0343980343980344,
"grad_norm": 8.67227796195133,
"learning_rate": 5.6910569105691056e-06,
"loss": 2.1041,
"step": 14
},
{
"epoch": 0.036855036855036855,
"grad_norm": 7.283413327700143,
"learning_rate": 6.0975609756097564e-06,
"loss": 2.0566,
"step": 15
},
{
"epoch": 0.03931203931203931,
"grad_norm": 5.08768956210286,
"learning_rate": 6.504065040650407e-06,
"loss": 1.7846,
"step": 16
},
{
"epoch": 0.04176904176904177,
"grad_norm": 3.7989995223107624,
"learning_rate": 6.910569105691057e-06,
"loss": 1.6511,
"step": 17
},
{
"epoch": 0.044226044226044224,
"grad_norm": 3.5721481727371764,
"learning_rate": 7.317073170731707e-06,
"loss": 1.9222,
"step": 18
},
{
"epoch": 0.04668304668304668,
"grad_norm": 2.2512893668476988,
"learning_rate": 7.723577235772358e-06,
"loss": 1.6941,
"step": 19
},
{
"epoch": 0.04914004914004914,
"grad_norm": 2.274570626749542,
"learning_rate": 8.130081300813009e-06,
"loss": 1.3336,
"step": 20
},
{
"epoch": 0.051597051597051594,
"grad_norm": 1.759146954439502,
"learning_rate": 8.53658536585366e-06,
"loss": 1.6479,
"step": 21
},
{
"epoch": 0.05405405405405406,
"grad_norm": 1.5309831654707053,
"learning_rate": 8.94308943089431e-06,
"loss": 1.6839,
"step": 22
},
{
"epoch": 0.056511056511056514,
"grad_norm": 1.7232978302647235,
"learning_rate": 9.34959349593496e-06,
"loss": 1.4097,
"step": 23
},
{
"epoch": 0.05896805896805897,
"grad_norm": 0.9050344472252703,
"learning_rate": 9.756097560975611e-06,
"loss": 1.3058,
"step": 24
},
{
"epoch": 0.06142506142506143,
"grad_norm": 1.011046912711339,
"learning_rate": 1.016260162601626e-05,
"loss": 1.3016,
"step": 25
},
{
"epoch": 0.06388206388206388,
"grad_norm": 0.7633443815628498,
"learning_rate": 1.0569105691056911e-05,
"loss": 1.0767,
"step": 26
},
{
"epoch": 0.06633906633906633,
"grad_norm": 3.711382173921332,
"learning_rate": 1.0975609756097562e-05,
"loss": 1.2445,
"step": 27
},
{
"epoch": 0.0687960687960688,
"grad_norm": 0.9084685938028465,
"learning_rate": 1.1382113821138211e-05,
"loss": 1.3219,
"step": 28
},
{
"epoch": 0.07125307125307126,
"grad_norm": 0.7278631490873225,
"learning_rate": 1.1788617886178862e-05,
"loss": 1.344,
"step": 29
},
{
"epoch": 0.07371007371007371,
"grad_norm": 0.8118157029372023,
"learning_rate": 1.2195121951219513e-05,
"loss": 1.2438,
"step": 30
},
{
"epoch": 0.07616707616707617,
"grad_norm": 0.7833625240021413,
"learning_rate": 1.2601626016260162e-05,
"loss": 1.3219,
"step": 31
},
{
"epoch": 0.07862407862407862,
"grad_norm": 0.5647517543959654,
"learning_rate": 1.3008130081300815e-05,
"loss": 0.9937,
"step": 32
},
{
"epoch": 0.08108108108108109,
"grad_norm": 0.6618891423388924,
"learning_rate": 1.3414634146341466e-05,
"loss": 0.9841,
"step": 33
},
{
"epoch": 0.08353808353808354,
"grad_norm": 0.8111611221708586,
"learning_rate": 1.3821138211382115e-05,
"loss": 1.1497,
"step": 34
},
{
"epoch": 0.085995085995086,
"grad_norm": 0.5989650541769594,
"learning_rate": 1.4227642276422764e-05,
"loss": 1.0802,
"step": 35
},
{
"epoch": 0.08845208845208845,
"grad_norm": 0.5601453621258067,
"learning_rate": 1.4634146341463415e-05,
"loss": 0.9953,
"step": 36
},
{
"epoch": 0.09090909090909091,
"grad_norm": 0.546088498086313,
"learning_rate": 1.5040650406504067e-05,
"loss": 0.9963,
"step": 37
},
{
"epoch": 0.09336609336609336,
"grad_norm": 0.5184238539089115,
"learning_rate": 1.5447154471544717e-05,
"loss": 0.9913,
"step": 38
},
{
"epoch": 0.09582309582309582,
"grad_norm": 0.4859364925634494,
"learning_rate": 1.5853658536585366e-05,
"loss": 0.9131,
"step": 39
},
{
"epoch": 0.09828009828009827,
"grad_norm": 0.5263041709899442,
"learning_rate": 1.6260162601626018e-05,
"loss": 1.2145,
"step": 40
},
{
"epoch": 0.10073710073710074,
"grad_norm": 0.48323570525096055,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.093,
"step": 41
},
{
"epoch": 0.10319410319410319,
"grad_norm": 0.39786588976887655,
"learning_rate": 1.707317073170732e-05,
"loss": 1.0228,
"step": 42
},
{
"epoch": 0.10565110565110565,
"grad_norm": 0.4353003519119437,
"learning_rate": 1.747967479674797e-05,
"loss": 0.8313,
"step": 43
},
{
"epoch": 0.10810810810810811,
"grad_norm": 0.5099701425209221,
"learning_rate": 1.788617886178862e-05,
"loss": 1.0385,
"step": 44
},
{
"epoch": 0.11056511056511056,
"grad_norm": 0.4247114077933769,
"learning_rate": 1.8292682926829268e-05,
"loss": 1.0362,
"step": 45
},
{
"epoch": 0.11302211302211303,
"grad_norm": 0.4278915097042231,
"learning_rate": 1.869918699186992e-05,
"loss": 1.0663,
"step": 46
},
{
"epoch": 0.11547911547911548,
"grad_norm": 0.42504196406677935,
"learning_rate": 1.9105691056910573e-05,
"loss": 1.065,
"step": 47
},
{
"epoch": 0.11793611793611794,
"grad_norm": 0.39322409819280146,
"learning_rate": 1.9512195121951222e-05,
"loss": 0.831,
"step": 48
},
{
"epoch": 0.12039312039312039,
"grad_norm": 0.38178816332973403,
"learning_rate": 1.991869918699187e-05,
"loss": 1.024,
"step": 49
},
{
"epoch": 0.12285012285012285,
"grad_norm": 0.32488634343203454,
"learning_rate": 2.032520325203252e-05,
"loss": 0.8349,
"step": 50
},
{
"epoch": 0.12530712530712532,
"grad_norm": 0.3782484182668685,
"learning_rate": 2.073170731707317e-05,
"loss": 0.9342,
"step": 51
},
{
"epoch": 0.12776412776412777,
"grad_norm": 0.36030842714472017,
"learning_rate": 2.1138211382113822e-05,
"loss": 1.0332,
"step": 52
},
{
"epoch": 0.13022113022113022,
"grad_norm": 0.3504763804177174,
"learning_rate": 2.1544715447154475e-05,
"loss": 1.0438,
"step": 53
},
{
"epoch": 0.13267813267813267,
"grad_norm": 0.3121087782309304,
"learning_rate": 2.1951219512195124e-05,
"loss": 0.8683,
"step": 54
},
{
"epoch": 0.13513513513513514,
"grad_norm": 0.4132956337094442,
"learning_rate": 2.2357723577235773e-05,
"loss": 1.0546,
"step": 55
},
{
"epoch": 0.1375921375921376,
"grad_norm": 0.3356523934810729,
"learning_rate": 2.2764227642276422e-05,
"loss": 0.8938,
"step": 56
},
{
"epoch": 0.14004914004914004,
"grad_norm": 0.3394592355583364,
"learning_rate": 2.3170731707317075e-05,
"loss": 0.8029,
"step": 57
},
{
"epoch": 0.14250614250614252,
"grad_norm": 0.9025202002420913,
"learning_rate": 2.3577235772357724e-05,
"loss": 0.836,
"step": 58
},
{
"epoch": 0.14496314496314497,
"grad_norm": 0.33122567373181955,
"learning_rate": 2.3983739837398377e-05,
"loss": 0.9265,
"step": 59
},
{
"epoch": 0.14742014742014742,
"grad_norm": 0.386487486127247,
"learning_rate": 2.4390243902439026e-05,
"loss": 1.0522,
"step": 60
},
{
"epoch": 0.14987714987714987,
"grad_norm": 0.3180551010846452,
"learning_rate": 2.4796747967479675e-05,
"loss": 0.8465,
"step": 61
},
{
"epoch": 0.15233415233415235,
"grad_norm": 0.3886943015332388,
"learning_rate": 2.5203252032520324e-05,
"loss": 0.8106,
"step": 62
},
{
"epoch": 0.1547911547911548,
"grad_norm": 0.36483367577896464,
"learning_rate": 2.5609756097560977e-05,
"loss": 0.9219,
"step": 63
},
{
"epoch": 0.15724815724815724,
"grad_norm": 0.3476022996526318,
"learning_rate": 2.601626016260163e-05,
"loss": 0.7888,
"step": 64
},
{
"epoch": 0.1597051597051597,
"grad_norm": 0.33332604952333145,
"learning_rate": 2.642276422764228e-05,
"loss": 0.8336,
"step": 65
},
{
"epoch": 0.16216216216216217,
"grad_norm": 0.3210922545885254,
"learning_rate": 2.682926829268293e-05,
"loss": 0.8325,
"step": 66
},
{
"epoch": 0.16461916461916462,
"grad_norm": 0.34166973327455336,
"learning_rate": 2.7235772357723577e-05,
"loss": 0.8286,
"step": 67
},
{
"epoch": 0.16707616707616707,
"grad_norm": 0.2772568838407044,
"learning_rate": 2.764227642276423e-05,
"loss": 0.6857,
"step": 68
},
{
"epoch": 0.16953316953316952,
"grad_norm": 1.0755515057999228,
"learning_rate": 2.8048780487804882e-05,
"loss": 0.8337,
"step": 69
},
{
"epoch": 0.171990171990172,
"grad_norm": 0.774515155788574,
"learning_rate": 2.8455284552845528e-05,
"loss": 0.9026,
"step": 70
},
{
"epoch": 0.17444717444717445,
"grad_norm": 0.343996443532602,
"learning_rate": 2.886178861788618e-05,
"loss": 0.858,
"step": 71
},
{
"epoch": 0.1769041769041769,
"grad_norm": 0.37464578169776397,
"learning_rate": 2.926829268292683e-05,
"loss": 0.8868,
"step": 72
},
{
"epoch": 0.17936117936117937,
"grad_norm": 0.30780292194750675,
"learning_rate": 2.9674796747967482e-05,
"loss": 0.8738,
"step": 73
},
{
"epoch": 0.18181818181818182,
"grad_norm": 0.7080920118302183,
"learning_rate": 3.0081300813008135e-05,
"loss": 1.0803,
"step": 74
},
{
"epoch": 0.18427518427518427,
"grad_norm": 0.3195310398410445,
"learning_rate": 3.048780487804878e-05,
"loss": 0.7552,
"step": 75
},
{
"epoch": 0.18673218673218672,
"grad_norm": 0.3398724677379115,
"learning_rate": 3.089430894308943e-05,
"loss": 0.915,
"step": 76
},
{
"epoch": 0.1891891891891892,
"grad_norm": 1.409643650018611,
"learning_rate": 3.130081300813008e-05,
"loss": 0.8522,
"step": 77
},
{
"epoch": 0.19164619164619165,
"grad_norm": 0.9400426414745835,
"learning_rate": 3.170731707317073e-05,
"loss": 0.9234,
"step": 78
},
{
"epoch": 0.1941031941031941,
"grad_norm": 0.44549756510252503,
"learning_rate": 3.2113821138211384e-05,
"loss": 0.8354,
"step": 79
},
{
"epoch": 0.19656019656019655,
"grad_norm": 0.31409628217862606,
"learning_rate": 3.2520325203252037e-05,
"loss": 0.8491,
"step": 80
},
{
"epoch": 0.19901719901719903,
"grad_norm": 0.4537000801486613,
"learning_rate": 3.292682926829269e-05,
"loss": 0.85,
"step": 81
},
{
"epoch": 0.20147420147420148,
"grad_norm": 0.42406673549654195,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.053,
"step": 82
},
{
"epoch": 0.20393120393120392,
"grad_norm": 0.3789788855142771,
"learning_rate": 3.373983739837399e-05,
"loss": 0.8627,
"step": 83
},
{
"epoch": 0.20638820638820637,
"grad_norm": 0.408375242705326,
"learning_rate": 3.414634146341464e-05,
"loss": 0.9088,
"step": 84
},
{
"epoch": 0.20884520884520885,
"grad_norm": 0.4269743612762991,
"learning_rate": 3.4552845528455286e-05,
"loss": 0.9017,
"step": 85
},
{
"epoch": 0.2113022113022113,
"grad_norm": 0.3983104483895218,
"learning_rate": 3.495934959349594e-05,
"loss": 0.8781,
"step": 86
},
{
"epoch": 0.21375921375921375,
"grad_norm": 0.4289837220182789,
"learning_rate": 3.5365853658536584e-05,
"loss": 0.7913,
"step": 87
},
{
"epoch": 0.21621621621621623,
"grad_norm": 0.4383253801829447,
"learning_rate": 3.577235772357724e-05,
"loss": 0.8579,
"step": 88
},
{
"epoch": 0.21867321867321868,
"grad_norm": 0.3815114297981113,
"learning_rate": 3.617886178861789e-05,
"loss": 0.7926,
"step": 89
},
{
"epoch": 0.22113022113022113,
"grad_norm": 0.4460874540522612,
"learning_rate": 3.6585365853658535e-05,
"loss": 0.8682,
"step": 90
},
{
"epoch": 0.22358722358722358,
"grad_norm": 0.4242618487534378,
"learning_rate": 3.699186991869919e-05,
"loss": 0.8574,
"step": 91
},
{
"epoch": 0.22604422604422605,
"grad_norm": 0.3784544099868278,
"learning_rate": 3.739837398373984e-05,
"loss": 0.7585,
"step": 92
},
{
"epoch": 0.2285012285012285,
"grad_norm": 0.4216052185506308,
"learning_rate": 3.780487804878049e-05,
"loss": 0.7668,
"step": 93
},
{
"epoch": 0.23095823095823095,
"grad_norm": 0.43197147956134363,
"learning_rate": 3.8211382113821145e-05,
"loss": 0.9439,
"step": 94
},
{
"epoch": 0.2334152334152334,
"grad_norm": 0.35661007106689985,
"learning_rate": 3.861788617886179e-05,
"loss": 0.745,
"step": 95
},
{
"epoch": 0.23587223587223588,
"grad_norm": 0.35038751371475896,
"learning_rate": 3.9024390243902444e-05,
"loss": 0.8148,
"step": 96
},
{
"epoch": 0.23832923832923833,
"grad_norm": 0.3269434336747683,
"learning_rate": 3.943089430894309e-05,
"loss": 0.7587,
"step": 97
},
{
"epoch": 0.24078624078624078,
"grad_norm": 0.3927980260401744,
"learning_rate": 3.983739837398374e-05,
"loss": 0.8683,
"step": 98
},
{
"epoch": 0.24324324324324326,
"grad_norm": 0.3655921622998464,
"learning_rate": 4.0243902439024395e-05,
"loss": 0.7903,
"step": 99
},
{
"epoch": 0.2457002457002457,
"grad_norm": 7.434933759364144,
"learning_rate": 4.065040650406504e-05,
"loss": 0.9234,
"step": 100
},
{
"epoch": 0.24815724815724816,
"grad_norm": 0.5057132697370877,
"learning_rate": 4.105691056910569e-05,
"loss": 0.8459,
"step": 101
},
{
"epoch": 0.25061425061425063,
"grad_norm": 0.35608438610939613,
"learning_rate": 4.146341463414634e-05,
"loss": 0.9301,
"step": 102
},
{
"epoch": 0.25307125307125306,
"grad_norm": 0.4378277298361604,
"learning_rate": 4.186991869918699e-05,
"loss": 0.751,
"step": 103
},
{
"epoch": 0.25552825552825553,
"grad_norm": 0.39957854645534735,
"learning_rate": 4.2276422764227644e-05,
"loss": 0.6775,
"step": 104
},
{
"epoch": 0.257985257985258,
"grad_norm": 0.4425372497170904,
"learning_rate": 4.26829268292683e-05,
"loss": 0.7637,
"step": 105
},
{
"epoch": 0.26044226044226043,
"grad_norm": 0.5055020698531032,
"learning_rate": 4.308943089430895e-05,
"loss": 0.9547,
"step": 106
},
{
"epoch": 0.2628992628992629,
"grad_norm": 0.48084566592201927,
"learning_rate": 4.3495934959349595e-05,
"loss": 0.7968,
"step": 107
},
{
"epoch": 0.26535626535626533,
"grad_norm": 0.44969395374862164,
"learning_rate": 4.390243902439025e-05,
"loss": 0.9182,
"step": 108
},
{
"epoch": 0.2678132678132678,
"grad_norm": 0.5044687667724931,
"learning_rate": 4.43089430894309e-05,
"loss": 0.8516,
"step": 109
},
{
"epoch": 0.2702702702702703,
"grad_norm": 0.46505802585107076,
"learning_rate": 4.4715447154471546e-05,
"loss": 0.8654,
"step": 110
},
{
"epoch": 0.2727272727272727,
"grad_norm": 0.5806796709997633,
"learning_rate": 4.51219512195122e-05,
"loss": 0.906,
"step": 111
},
{
"epoch": 0.2751842751842752,
"grad_norm": 0.4210793238854805,
"learning_rate": 4.5528455284552844e-05,
"loss": 0.7159,
"step": 112
},
{
"epoch": 0.27764127764127766,
"grad_norm": 0.45861184290594337,
"learning_rate": 4.59349593495935e-05,
"loss": 0.8742,
"step": 113
},
{
"epoch": 0.2800982800982801,
"grad_norm": 0.474280060915593,
"learning_rate": 4.634146341463415e-05,
"loss": 0.9078,
"step": 114
},
{
"epoch": 0.28255528255528256,
"grad_norm": 3.4497188666996608,
"learning_rate": 4.6747967479674795e-05,
"loss": 0.7941,
"step": 115
},
{
"epoch": 0.28501228501228504,
"grad_norm": 0.6859838903958281,
"learning_rate": 4.715447154471545e-05,
"loss": 0.829,
"step": 116
},
{
"epoch": 0.28746928746928746,
"grad_norm": 0.4257855688775576,
"learning_rate": 4.75609756097561e-05,
"loss": 0.684,
"step": 117
},
{
"epoch": 0.28992628992628994,
"grad_norm": 0.7209470061472436,
"learning_rate": 4.796747967479675e-05,
"loss": 0.8426,
"step": 118
},
{
"epoch": 0.29238329238329236,
"grad_norm": 0.4348904611702599,
"learning_rate": 4.8373983739837406e-05,
"loss": 0.8974,
"step": 119
},
{
"epoch": 0.29484029484029484,
"grad_norm": 0.6022119693773859,
"learning_rate": 4.878048780487805e-05,
"loss": 0.7381,
"step": 120
},
{
"epoch": 0.2972972972972973,
"grad_norm": 0.49356434831001184,
"learning_rate": 4.9186991869918704e-05,
"loss": 0.8491,
"step": 121
},
{
"epoch": 0.29975429975429974,
"grad_norm": 0.5199694383515181,
"learning_rate": 4.959349593495935e-05,
"loss": 0.9368,
"step": 122
},
{
"epoch": 0.3022113022113022,
"grad_norm": 0.6287882015300568,
"learning_rate": 5e-05,
"loss": 0.7733,
"step": 123
},
{
"epoch": 0.3046683046683047,
"grad_norm": 0.47882310869561157,
"learning_rate": 4.99544626593807e-05,
"loss": 0.828,
"step": 124
},
{
"epoch": 0.3071253071253071,
"grad_norm": 0.47321265037200055,
"learning_rate": 4.990892531876138e-05,
"loss": 0.7783,
"step": 125
},
{
"epoch": 0.3095823095823096,
"grad_norm": 5.521364106361822,
"learning_rate": 4.986338797814208e-05,
"loss": 0.8306,
"step": 126
},
{
"epoch": 0.31203931203931207,
"grad_norm": 0.6130054000501296,
"learning_rate": 4.9817850637522776e-05,
"loss": 0.7226,
"step": 127
},
{
"epoch": 0.3144963144963145,
"grad_norm": 0.5002677223225025,
"learning_rate": 4.977231329690346e-05,
"loss": 0.842,
"step": 128
},
{
"epoch": 0.31695331695331697,
"grad_norm": 0.5097826151334071,
"learning_rate": 4.9726775956284156e-05,
"loss": 0.8517,
"step": 129
},
{
"epoch": 0.3194103194103194,
"grad_norm": 0.4823779081669877,
"learning_rate": 4.9681238615664846e-05,
"loss": 0.6543,
"step": 130
},
{
"epoch": 0.32186732186732187,
"grad_norm": 0.6212379815651925,
"learning_rate": 4.9635701275045536e-05,
"loss": 0.8558,
"step": 131
},
{
"epoch": 0.32432432432432434,
"grad_norm": 0.46984602438360945,
"learning_rate": 4.959016393442623e-05,
"loss": 0.9246,
"step": 132
},
{
"epoch": 0.32678132678132676,
"grad_norm": 0.4983418640829827,
"learning_rate": 4.954462659380692e-05,
"loss": 0.7805,
"step": 133
},
{
"epoch": 0.32923832923832924,
"grad_norm": 0.6115973164236492,
"learning_rate": 4.949908925318761e-05,
"loss": 0.8777,
"step": 134
},
{
"epoch": 0.3316953316953317,
"grad_norm": 0.425339912596782,
"learning_rate": 4.945355191256831e-05,
"loss": 0.8364,
"step": 135
},
{
"epoch": 0.33415233415233414,
"grad_norm": 0.5081656348291814,
"learning_rate": 4.9408014571949e-05,
"loss": 0.8182,
"step": 136
},
{
"epoch": 0.3366093366093366,
"grad_norm": 0.4507877289201634,
"learning_rate": 4.936247723132969e-05,
"loss": 0.7059,
"step": 137
},
{
"epoch": 0.33906633906633904,
"grad_norm": 0.4259156527505649,
"learning_rate": 4.9316939890710386e-05,
"loss": 0.7337,
"step": 138
},
{
"epoch": 0.3415233415233415,
"grad_norm": 0.4870288608531628,
"learning_rate": 4.9271402550091076e-05,
"loss": 0.6361,
"step": 139
},
{
"epoch": 0.343980343980344,
"grad_norm": 0.3911986963597503,
"learning_rate": 4.9225865209471766e-05,
"loss": 0.7603,
"step": 140
},
{
"epoch": 0.3464373464373464,
"grad_norm": 0.7264714511948328,
"learning_rate": 4.918032786885246e-05,
"loss": 0.9062,
"step": 141
},
{
"epoch": 0.3488943488943489,
"grad_norm": 0.47242198367565236,
"learning_rate": 4.913479052823315e-05,
"loss": 0.7663,
"step": 142
},
{
"epoch": 0.35135135135135137,
"grad_norm": 0.47328040906145535,
"learning_rate": 4.908925318761385e-05,
"loss": 0.6299,
"step": 143
},
{
"epoch": 0.3538083538083538,
"grad_norm": 0.5189395807696658,
"learning_rate": 4.904371584699454e-05,
"loss": 0.7318,
"step": 144
},
{
"epoch": 0.35626535626535627,
"grad_norm": 0.45277914852216605,
"learning_rate": 4.899817850637523e-05,
"loss": 0.7552,
"step": 145
},
{
"epoch": 0.35872235872235875,
"grad_norm": 0.4495050775600071,
"learning_rate": 4.8952641165755927e-05,
"loss": 0.7726,
"step": 146
},
{
"epoch": 0.36117936117936117,
"grad_norm": 0.5562436967325994,
"learning_rate": 4.890710382513661e-05,
"loss": 0.7093,
"step": 147
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.4021268067280976,
"learning_rate": 4.8861566484517307e-05,
"loss": 0.7734,
"step": 148
},
{
"epoch": 0.36609336609336607,
"grad_norm": 0.5900140052185344,
"learning_rate": 4.8816029143898e-05,
"loss": 0.898,
"step": 149
},
{
"epoch": 0.36855036855036855,
"grad_norm": 0.40531749565353,
"learning_rate": 4.8770491803278687e-05,
"loss": 0.8673,
"step": 150
},
{
"epoch": 0.371007371007371,
"grad_norm": 0.5257544978960317,
"learning_rate": 4.872495446265938e-05,
"loss": 0.8524,
"step": 151
},
{
"epoch": 0.37346437346437344,
"grad_norm": 0.3709941280583937,
"learning_rate": 4.867941712204008e-05,
"loss": 0.7762,
"step": 152
},
{
"epoch": 0.3759213759213759,
"grad_norm": 0.6733613092717959,
"learning_rate": 4.863387978142076e-05,
"loss": 0.7244,
"step": 153
},
{
"epoch": 0.3783783783783784,
"grad_norm": 0.38207770249074585,
"learning_rate": 4.858834244080146e-05,
"loss": 0.7583,
"step": 154
},
{
"epoch": 0.3808353808353808,
"grad_norm": 0.6562455849116627,
"learning_rate": 4.854280510018216e-05,
"loss": 0.8073,
"step": 155
},
{
"epoch": 0.3832923832923833,
"grad_norm": 0.39748554973014666,
"learning_rate": 4.849726775956284e-05,
"loss": 0.7228,
"step": 156
},
{
"epoch": 0.3857493857493858,
"grad_norm": 0.3661785544020982,
"learning_rate": 4.845173041894354e-05,
"loss": 0.6666,
"step": 157
},
{
"epoch": 0.3882063882063882,
"grad_norm": 0.5268622303781758,
"learning_rate": 4.840619307832423e-05,
"loss": 0.7831,
"step": 158
},
{
"epoch": 0.3906633906633907,
"grad_norm": 0.49812730319416026,
"learning_rate": 4.836065573770492e-05,
"loss": 0.7754,
"step": 159
},
{
"epoch": 0.3931203931203931,
"grad_norm": 0.4157898663987463,
"learning_rate": 4.8315118397085614e-05,
"loss": 0.7964,
"step": 160
},
{
"epoch": 0.3955773955773956,
"grad_norm": 0.47307293289652125,
"learning_rate": 4.8269581056466304e-05,
"loss": 0.8665,
"step": 161
},
{
"epoch": 0.39803439803439805,
"grad_norm": 0.5347821089983137,
"learning_rate": 4.8224043715846994e-05,
"loss": 0.9031,
"step": 162
},
{
"epoch": 0.4004914004914005,
"grad_norm": 0.4416804543349193,
"learning_rate": 4.817850637522769e-05,
"loss": 0.8018,
"step": 163
},
{
"epoch": 0.40294840294840295,
"grad_norm": 0.38242798734043876,
"learning_rate": 4.813296903460838e-05,
"loss": 0.8083,
"step": 164
},
{
"epoch": 0.40540540540540543,
"grad_norm": 0.46628473043304297,
"learning_rate": 4.808743169398907e-05,
"loss": 0.7786,
"step": 165
},
{
"epoch": 0.40786240786240785,
"grad_norm": 0.8957454928357931,
"learning_rate": 4.804189435336977e-05,
"loss": 0.7697,
"step": 166
},
{
"epoch": 0.4103194103194103,
"grad_norm": 0.7840441840091149,
"learning_rate": 4.799635701275046e-05,
"loss": 0.9073,
"step": 167
},
{
"epoch": 0.41277641277641275,
"grad_norm": 0.7288094214817507,
"learning_rate": 4.795081967213115e-05,
"loss": 0.7594,
"step": 168
},
{
"epoch": 0.4152334152334152,
"grad_norm": 0.6041127252970878,
"learning_rate": 4.7905282331511844e-05,
"loss": 0.7313,
"step": 169
},
{
"epoch": 0.4176904176904177,
"grad_norm": 0.8145997847617484,
"learning_rate": 4.7859744990892534e-05,
"loss": 0.6774,
"step": 170
},
{
"epoch": 0.4201474201474201,
"grad_norm": 0.5528790249808274,
"learning_rate": 4.7814207650273224e-05,
"loss": 0.8161,
"step": 171
},
{
"epoch": 0.4226044226044226,
"grad_norm": 0.5818892388787992,
"learning_rate": 4.776867030965392e-05,
"loss": 0.6353,
"step": 172
},
{
"epoch": 0.4250614250614251,
"grad_norm": 0.5028845858663835,
"learning_rate": 4.772313296903461e-05,
"loss": 0.7811,
"step": 173
},
{
"epoch": 0.4275184275184275,
"grad_norm": 0.5094764920597807,
"learning_rate": 4.76775956284153e-05,
"loss": 0.77,
"step": 174
},
{
"epoch": 0.42997542997543,
"grad_norm": 0.40339340341267327,
"learning_rate": 4.7632058287796e-05,
"loss": 0.6796,
"step": 175
},
{
"epoch": 0.43243243243243246,
"grad_norm": 0.47577207705852176,
"learning_rate": 4.758652094717669e-05,
"loss": 0.7034,
"step": 176
},
{
"epoch": 0.4348894348894349,
"grad_norm": 0.39888134217182175,
"learning_rate": 4.754098360655738e-05,
"loss": 0.6607,
"step": 177
},
{
"epoch": 0.43734643734643736,
"grad_norm": 0.3965895014017134,
"learning_rate": 4.749544626593807e-05,
"loss": 0.7624,
"step": 178
},
{
"epoch": 0.4398034398034398,
"grad_norm": 0.4709202993164332,
"learning_rate": 4.7449908925318764e-05,
"loss": 0.8225,
"step": 179
},
{
"epoch": 0.44226044226044225,
"grad_norm": 0.382474212228653,
"learning_rate": 4.740437158469946e-05,
"loss": 0.8546,
"step": 180
},
{
"epoch": 0.44471744471744473,
"grad_norm": 0.4231565796785838,
"learning_rate": 4.7358834244080144e-05,
"loss": 0.771,
"step": 181
},
{
"epoch": 0.44717444717444715,
"grad_norm": 0.38054832898962976,
"learning_rate": 4.731329690346084e-05,
"loss": 0.6595,
"step": 182
},
{
"epoch": 0.44963144963144963,
"grad_norm": 0.3547946010093686,
"learning_rate": 4.726775956284154e-05,
"loss": 0.6817,
"step": 183
},
{
"epoch": 0.4520884520884521,
"grad_norm": 0.3945726785571152,
"learning_rate": 4.722222222222222e-05,
"loss": 0.7525,
"step": 184
},
{
"epoch": 0.45454545454545453,
"grad_norm": 0.3928424227592678,
"learning_rate": 4.717668488160292e-05,
"loss": 0.7477,
"step": 185
},
{
"epoch": 0.457002457002457,
"grad_norm": 0.4426304351649171,
"learning_rate": 4.713114754098361e-05,
"loss": 0.7423,
"step": 186
},
{
"epoch": 0.4594594594594595,
"grad_norm": 1.6332435201318054,
"learning_rate": 4.70856102003643e-05,
"loss": 0.7035,
"step": 187
},
{
"epoch": 0.4619164619164619,
"grad_norm": 0.4178581553378913,
"learning_rate": 4.7040072859744995e-05,
"loss": 0.7806,
"step": 188
},
{
"epoch": 0.4643734643734644,
"grad_norm": 0.37923597472442744,
"learning_rate": 4.6994535519125685e-05,
"loss": 0.7407,
"step": 189
},
{
"epoch": 0.4668304668304668,
"grad_norm": 0.4449909952237191,
"learning_rate": 4.6948998178506375e-05,
"loss": 0.7735,
"step": 190
},
{
"epoch": 0.4692874692874693,
"grad_norm": 1.3250256301620615,
"learning_rate": 4.690346083788707e-05,
"loss": 0.7303,
"step": 191
},
{
"epoch": 0.47174447174447176,
"grad_norm": 0.9645765967219847,
"learning_rate": 4.685792349726776e-05,
"loss": 0.7097,
"step": 192
},
{
"epoch": 0.4742014742014742,
"grad_norm": 0.6361558765678473,
"learning_rate": 4.681238615664845e-05,
"loss": 0.7363,
"step": 193
},
{
"epoch": 0.47665847665847666,
"grad_norm": 0.45389595024787915,
"learning_rate": 4.676684881602915e-05,
"loss": 0.7055,
"step": 194
},
{
"epoch": 0.47911547911547914,
"grad_norm": 0.6258698325325335,
"learning_rate": 4.672131147540984e-05,
"loss": 0.6585,
"step": 195
},
{
"epoch": 0.48157248157248156,
"grad_norm": 1.0118318676213243,
"learning_rate": 4.667577413479053e-05,
"loss": 0.7367,
"step": 196
},
{
"epoch": 0.48402948402948404,
"grad_norm": 0.4198144007946843,
"learning_rate": 4.6630236794171225e-05,
"loss": 0.6616,
"step": 197
},
{
"epoch": 0.4864864864864865,
"grad_norm": 0.5913511667595013,
"learning_rate": 4.6584699453551915e-05,
"loss": 0.7845,
"step": 198
},
{
"epoch": 0.48894348894348894,
"grad_norm": 0.33611220980462847,
"learning_rate": 4.6539162112932605e-05,
"loss": 0.7603,
"step": 199
},
{
"epoch": 0.4914004914004914,
"grad_norm": 0.5711542431602112,
"learning_rate": 4.64936247723133e-05,
"loss": 0.7322,
"step": 200
},
{
"epoch": 0.49385749385749383,
"grad_norm": 0.3189868792840459,
"learning_rate": 4.644808743169399e-05,
"loss": 0.6661,
"step": 201
},
{
"epoch": 0.4963144963144963,
"grad_norm": 0.4826389836871673,
"learning_rate": 4.640255009107468e-05,
"loss": 0.6749,
"step": 202
},
{
"epoch": 0.4987714987714988,
"grad_norm": 0.40951703225470715,
"learning_rate": 4.635701275045538e-05,
"loss": 0.7387,
"step": 203
},
{
"epoch": 0.5012285012285013,
"grad_norm": 0.38375829145246065,
"learning_rate": 4.631147540983607e-05,
"loss": 0.7027,
"step": 204
},
{
"epoch": 0.5036855036855037,
"grad_norm": 0.4733049578471896,
"learning_rate": 4.626593806921676e-05,
"loss": 0.7509,
"step": 205
},
{
"epoch": 0.5061425061425061,
"grad_norm": 0.3474159718643396,
"learning_rate": 4.622040072859745e-05,
"loss": 0.7367,
"step": 206
},
{
"epoch": 0.5085995085995086,
"grad_norm": 0.48857281066114416,
"learning_rate": 4.6174863387978145e-05,
"loss": 0.8525,
"step": 207
},
{
"epoch": 0.5110565110565111,
"grad_norm": 0.38214808096990427,
"learning_rate": 4.6129326047358835e-05,
"loss": 0.7906,
"step": 208
},
{
"epoch": 0.5135135135135135,
"grad_norm": 0.33815932263073856,
"learning_rate": 4.6083788706739525e-05,
"loss": 0.7306,
"step": 209
},
{
"epoch": 0.515970515970516,
"grad_norm": 0.4339469943504887,
"learning_rate": 4.603825136612022e-05,
"loss": 0.8031,
"step": 210
},
{
"epoch": 0.5184275184275184,
"grad_norm": 0.3911806777997916,
"learning_rate": 4.599271402550091e-05,
"loss": 0.6236,
"step": 211
},
{
"epoch": 0.5208845208845209,
"grad_norm": 0.4169040746627703,
"learning_rate": 4.59471766848816e-05,
"loss": 0.6954,
"step": 212
},
{
"epoch": 0.5233415233415234,
"grad_norm": 0.409930981249451,
"learning_rate": 4.59016393442623e-05,
"loss": 0.6972,
"step": 213
},
{
"epoch": 0.5257985257985258,
"grad_norm": 0.3662077296397301,
"learning_rate": 4.585610200364299e-05,
"loss": 0.7205,
"step": 214
},
{
"epoch": 0.5282555282555282,
"grad_norm": 0.3999793098185867,
"learning_rate": 4.581056466302368e-05,
"loss": 0.7142,
"step": 215
},
{
"epoch": 0.5307125307125307,
"grad_norm": 0.33426678861834175,
"learning_rate": 4.5765027322404376e-05,
"loss": 0.7806,
"step": 216
},
{
"epoch": 0.5331695331695332,
"grad_norm": 0.2920950465438566,
"learning_rate": 4.5719489981785066e-05,
"loss": 0.5735,
"step": 217
},
{
"epoch": 0.5356265356265356,
"grad_norm": 0.4387714217174655,
"learning_rate": 4.5673952641165756e-05,
"loss": 0.7661,
"step": 218
},
{
"epoch": 0.538083538083538,
"grad_norm": 0.40724721414199005,
"learning_rate": 4.562841530054645e-05,
"loss": 0.7578,
"step": 219
},
{
"epoch": 0.5405405405405406,
"grad_norm": 0.4361008788632283,
"learning_rate": 4.558287795992714e-05,
"loss": 0.6755,
"step": 220
},
{
"epoch": 0.542997542997543,
"grad_norm": 0.4246249810597821,
"learning_rate": 4.553734061930783e-05,
"loss": 0.7546,
"step": 221
},
{
"epoch": 0.5454545454545454,
"grad_norm": 0.36118319320850206,
"learning_rate": 4.549180327868853e-05,
"loss": 0.7669,
"step": 222
},
{
"epoch": 0.547911547911548,
"grad_norm": 0.908289119148723,
"learning_rate": 4.544626593806922e-05,
"loss": 0.7135,
"step": 223
},
{
"epoch": 0.5503685503685504,
"grad_norm": 0.39602734595220085,
"learning_rate": 4.540072859744991e-05,
"loss": 0.749,
"step": 224
},
{
"epoch": 0.5528255528255528,
"grad_norm": 0.5078448020996696,
"learning_rate": 4.5355191256830606e-05,
"loss": 0.6208,
"step": 225
},
{
"epoch": 0.5552825552825553,
"grad_norm": 0.3443372372601607,
"learning_rate": 4.5309653916211296e-05,
"loss": 0.7046,
"step": 226
},
{
"epoch": 0.5577395577395577,
"grad_norm": 0.4525893747493054,
"learning_rate": 4.5264116575591986e-05,
"loss": 0.7592,
"step": 227
},
{
"epoch": 0.5601965601965602,
"grad_norm": 0.40243874841518706,
"learning_rate": 4.521857923497268e-05,
"loss": 0.8445,
"step": 228
},
{
"epoch": 0.5626535626535627,
"grad_norm": 0.35161294551869515,
"learning_rate": 4.517304189435337e-05,
"loss": 0.677,
"step": 229
},
{
"epoch": 0.5651105651105651,
"grad_norm": 0.41535550493065193,
"learning_rate": 4.512750455373406e-05,
"loss": 0.7478,
"step": 230
},
{
"epoch": 0.5675675675675675,
"grad_norm": 0.4226366849862933,
"learning_rate": 4.508196721311476e-05,
"loss": 0.745,
"step": 231
},
{
"epoch": 0.5700245700245701,
"grad_norm": 0.3673983419967179,
"learning_rate": 4.503642987249545e-05,
"loss": 0.7015,
"step": 232
},
{
"epoch": 0.5724815724815725,
"grad_norm": 0.38024111457034476,
"learning_rate": 4.499089253187614e-05,
"loss": 0.7877,
"step": 233
},
{
"epoch": 0.5749385749385749,
"grad_norm": 0.38382167053979005,
"learning_rate": 4.494535519125683e-05,
"loss": 0.6943,
"step": 234
},
{
"epoch": 0.5773955773955773,
"grad_norm": 0.3773460766513446,
"learning_rate": 4.4899817850637526e-05,
"loss": 0.7944,
"step": 235
},
{
"epoch": 0.5798525798525799,
"grad_norm": 0.4206436428227826,
"learning_rate": 4.4854280510018216e-05,
"loss": 0.6814,
"step": 236
},
{
"epoch": 0.5823095823095823,
"grad_norm": 6.225234570790709,
"learning_rate": 4.4808743169398906e-05,
"loss": 0.7907,
"step": 237
},
{
"epoch": 0.5847665847665847,
"grad_norm": 0.4921907401337786,
"learning_rate": 4.47632058287796e-05,
"loss": 0.6665,
"step": 238
},
{
"epoch": 0.5872235872235873,
"grad_norm": 0.48327648449237093,
"learning_rate": 4.471766848816029e-05,
"loss": 0.7715,
"step": 239
},
{
"epoch": 0.5896805896805897,
"grad_norm": 0.4791973859907425,
"learning_rate": 4.467213114754098e-05,
"loss": 0.6644,
"step": 240
},
{
"epoch": 0.5921375921375921,
"grad_norm": 0.5219036090133962,
"learning_rate": 4.462659380692168e-05,
"loss": 0.8049,
"step": 241
},
{
"epoch": 0.5945945945945946,
"grad_norm": 0.5456422166867602,
"learning_rate": 4.458105646630237e-05,
"loss": 0.7501,
"step": 242
},
{
"epoch": 0.597051597051597,
"grad_norm": 0.42200513727398753,
"learning_rate": 4.453551912568306e-05,
"loss": 0.887,
"step": 243
},
{
"epoch": 0.5995085995085995,
"grad_norm": 0.4322560276672431,
"learning_rate": 4.4489981785063757e-05,
"loss": 0.7695,
"step": 244
},
{
"epoch": 0.601965601965602,
"grad_norm": 0.6813701089189296,
"learning_rate": 4.4444444444444447e-05,
"loss": 0.8039,
"step": 245
},
{
"epoch": 0.6044226044226044,
"grad_norm": 0.34727875514808987,
"learning_rate": 4.4398907103825137e-05,
"loss": 0.6736,
"step": 246
},
{
"epoch": 0.6068796068796068,
"grad_norm": 0.5097357043993563,
"learning_rate": 4.435336976320583e-05,
"loss": 0.7733,
"step": 247
},
{
"epoch": 0.6093366093366094,
"grad_norm": 0.3917452125453462,
"learning_rate": 4.430783242258652e-05,
"loss": 0.6345,
"step": 248
},
{
"epoch": 0.6117936117936118,
"grad_norm": 0.3886971533793202,
"learning_rate": 4.426229508196721e-05,
"loss": 0.6465,
"step": 249
},
{
"epoch": 0.6142506142506142,
"grad_norm": 0.42563955199502573,
"learning_rate": 4.421675774134791e-05,
"loss": 0.6499,
"step": 250
},
{
"epoch": 0.6167076167076168,
"grad_norm": 0.34063379000466826,
"learning_rate": 4.41712204007286e-05,
"loss": 0.6963,
"step": 251
},
{
"epoch": 0.6191646191646192,
"grad_norm": 0.4724839536346018,
"learning_rate": 4.412568306010929e-05,
"loss": 0.7917,
"step": 252
},
{
"epoch": 0.6216216216216216,
"grad_norm": 0.43466297048497554,
"learning_rate": 4.408014571948999e-05,
"loss": 0.7627,
"step": 253
},
{
"epoch": 0.6240786240786241,
"grad_norm": 0.35263290647277007,
"learning_rate": 4.403460837887068e-05,
"loss": 0.624,
"step": 254
},
{
"epoch": 0.6265356265356266,
"grad_norm": 0.41771099490666685,
"learning_rate": 4.398907103825137e-05,
"loss": 0.6774,
"step": 255
},
{
"epoch": 0.628992628992629,
"grad_norm": 0.45045654101278304,
"learning_rate": 4.3943533697632064e-05,
"loss": 0.6706,
"step": 256
},
{
"epoch": 0.6314496314496314,
"grad_norm": 0.4054524028616639,
"learning_rate": 4.3897996357012754e-05,
"loss": 0.6856,
"step": 257
},
{
"epoch": 0.6339066339066339,
"grad_norm": 0.4199071567113292,
"learning_rate": 4.3852459016393444e-05,
"loss": 0.7385,
"step": 258
},
{
"epoch": 0.6363636363636364,
"grad_norm": 0.4359170619851533,
"learning_rate": 4.380692167577414e-05,
"loss": 0.7095,
"step": 259
},
{
"epoch": 0.6388206388206388,
"grad_norm": 0.3850739753964197,
"learning_rate": 4.376138433515483e-05,
"loss": 0.6958,
"step": 260
},
{
"epoch": 0.6412776412776413,
"grad_norm": 0.4890138604791565,
"learning_rate": 4.371584699453552e-05,
"loss": 0.7211,
"step": 261
},
{
"epoch": 0.6437346437346437,
"grad_norm": 0.38398720286811694,
"learning_rate": 4.367030965391621e-05,
"loss": 0.8539,
"step": 262
},
{
"epoch": 0.6461916461916462,
"grad_norm": 0.5242499237496944,
"learning_rate": 4.362477231329691e-05,
"loss": 0.7239,
"step": 263
},
{
"epoch": 0.6486486486486487,
"grad_norm": 0.6576624559407754,
"learning_rate": 4.35792349726776e-05,
"loss": 0.6224,
"step": 264
},
{
"epoch": 0.6511056511056511,
"grad_norm": 0.48964094334247854,
"learning_rate": 4.353369763205829e-05,
"loss": 0.7645,
"step": 265
},
{
"epoch": 0.6535626535626535,
"grad_norm": 0.4674980129473235,
"learning_rate": 4.3488160291438984e-05,
"loss": 0.768,
"step": 266
},
{
"epoch": 0.6560196560196561,
"grad_norm": 0.4434022776784131,
"learning_rate": 4.3442622950819674e-05,
"loss": 0.7459,
"step": 267
},
{
"epoch": 0.6584766584766585,
"grad_norm": 0.538941168132682,
"learning_rate": 4.3397085610200364e-05,
"loss": 0.6968,
"step": 268
},
{
"epoch": 0.6609336609336609,
"grad_norm": 0.3624467815465402,
"learning_rate": 4.335154826958106e-05,
"loss": 0.6096,
"step": 269
},
{
"epoch": 0.6633906633906634,
"grad_norm": 0.5599889013533942,
"learning_rate": 4.330601092896175e-05,
"loss": 0.7658,
"step": 270
},
{
"epoch": 0.6658476658476659,
"grad_norm": 0.690440401509493,
"learning_rate": 4.326047358834244e-05,
"loss": 0.7877,
"step": 271
},
{
"epoch": 0.6683046683046683,
"grad_norm": 0.3686357695682294,
"learning_rate": 4.321493624772314e-05,
"loss": 0.6895,
"step": 272
},
{
"epoch": 0.6707616707616708,
"grad_norm": 0.545620565235858,
"learning_rate": 4.316939890710383e-05,
"loss": 0.69,
"step": 273
},
{
"epoch": 0.6732186732186732,
"grad_norm": 0.4204580863650939,
"learning_rate": 4.312386156648452e-05,
"loss": 0.6768,
"step": 274
},
{
"epoch": 0.6756756756756757,
"grad_norm": 0.47301510227399846,
"learning_rate": 4.3078324225865214e-05,
"loss": 0.6414,
"step": 275
},
{
"epoch": 0.6781326781326781,
"grad_norm": 0.39120871861762363,
"learning_rate": 4.3032786885245904e-05,
"loss": 0.6393,
"step": 276
},
{
"epoch": 0.6805896805896806,
"grad_norm": 0.5663194594331895,
"learning_rate": 4.2987249544626594e-05,
"loss": 0.7721,
"step": 277
},
{
"epoch": 0.683046683046683,
"grad_norm": 0.5578558026406056,
"learning_rate": 4.294171220400729e-05,
"loss": 0.679,
"step": 278
},
{
"epoch": 0.6855036855036855,
"grad_norm": 0.4785935193977311,
"learning_rate": 4.289617486338798e-05,
"loss": 0.8548,
"step": 279
},
{
"epoch": 0.687960687960688,
"grad_norm": 0.7344196795158664,
"learning_rate": 4.285063752276867e-05,
"loss": 0.7421,
"step": 280
},
{
"epoch": 0.6904176904176904,
"grad_norm": 0.8908899764975586,
"learning_rate": 4.280510018214937e-05,
"loss": 0.7894,
"step": 281
},
{
"epoch": 0.6928746928746928,
"grad_norm": 0.6287419956030045,
"learning_rate": 4.275956284153005e-05,
"loss": 0.6785,
"step": 282
},
{
"epoch": 0.6953316953316954,
"grad_norm": 0.5149422483348357,
"learning_rate": 4.271402550091075e-05,
"loss": 0.7382,
"step": 283
},
{
"epoch": 0.6977886977886978,
"grad_norm": 0.5454860373961983,
"learning_rate": 4.2668488160291445e-05,
"loss": 0.7274,
"step": 284
},
{
"epoch": 0.7002457002457002,
"grad_norm": 0.5477624009062736,
"learning_rate": 4.262295081967213e-05,
"loss": 0.7058,
"step": 285
},
{
"epoch": 0.7027027027027027,
"grad_norm": 0.5596039899044134,
"learning_rate": 4.2577413479052825e-05,
"loss": 0.8178,
"step": 286
},
{
"epoch": 0.7051597051597052,
"grad_norm": 0.5919332487502931,
"learning_rate": 4.253187613843352e-05,
"loss": 0.69,
"step": 287
},
{
"epoch": 0.7076167076167076,
"grad_norm": 0.5283900337631473,
"learning_rate": 4.248633879781421e-05,
"loss": 0.8171,
"step": 288
},
{
"epoch": 0.7100737100737101,
"grad_norm": 0.7692525624223621,
"learning_rate": 4.24408014571949e-05,
"loss": 0.7239,
"step": 289
},
{
"epoch": 0.7125307125307125,
"grad_norm": 0.3863360498506725,
"learning_rate": 4.23952641165756e-05,
"loss": 0.5576,
"step": 290
},
{
"epoch": 0.714987714987715,
"grad_norm": 0.7223883296775482,
"learning_rate": 4.234972677595629e-05,
"loss": 0.6975,
"step": 291
},
{
"epoch": 0.7174447174447175,
"grad_norm": 1.7771798036626734,
"learning_rate": 4.230418943533698e-05,
"loss": 0.6565,
"step": 292
},
{
"epoch": 0.7199017199017199,
"grad_norm": 0.6430310979475962,
"learning_rate": 4.225865209471767e-05,
"loss": 0.6675,
"step": 293
},
{
"epoch": 0.7223587223587223,
"grad_norm": 0.3794537639280509,
"learning_rate": 4.2213114754098365e-05,
"loss": 0.7642,
"step": 294
},
{
"epoch": 0.7248157248157249,
"grad_norm": 0.60943195656342,
"learning_rate": 4.2167577413479055e-05,
"loss": 0.7247,
"step": 295
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.37289390653274224,
"learning_rate": 4.2122040072859745e-05,
"loss": 0.6255,
"step": 296
},
{
"epoch": 0.7297297297297297,
"grad_norm": 0.4532436953171903,
"learning_rate": 4.207650273224044e-05,
"loss": 0.7069,
"step": 297
},
{
"epoch": 0.7321867321867321,
"grad_norm": 0.40650887131809266,
"learning_rate": 4.203096539162113e-05,
"loss": 0.7403,
"step": 298
},
{
"epoch": 0.7346437346437347,
"grad_norm": 0.5965618573882557,
"learning_rate": 4.198542805100182e-05,
"loss": 0.7413,
"step": 299
},
{
"epoch": 0.7371007371007371,
"grad_norm": 0.35937646145739954,
"learning_rate": 4.193989071038252e-05,
"loss": 0.7104,
"step": 300
},
{
"epoch": 0.7395577395577395,
"grad_norm": 0.45967984584408983,
"learning_rate": 4.189435336976321e-05,
"loss": 0.8102,
"step": 301
},
{
"epoch": 0.742014742014742,
"grad_norm": 0.4885635149330037,
"learning_rate": 4.18488160291439e-05,
"loss": 0.7302,
"step": 302
},
{
"epoch": 0.7444717444717445,
"grad_norm": 0.3152058972635706,
"learning_rate": 4.1803278688524595e-05,
"loss": 0.6891,
"step": 303
},
{
"epoch": 0.7469287469287469,
"grad_norm": 0.4161834589482244,
"learning_rate": 4.1757741347905285e-05,
"loss": 0.6623,
"step": 304
},
{
"epoch": 0.7493857493857494,
"grad_norm": 0.36473148815614853,
"learning_rate": 4.1712204007285975e-05,
"loss": 0.7902,
"step": 305
},
{
"epoch": 0.7518427518427518,
"grad_norm": 0.4147403697677368,
"learning_rate": 4.166666666666667e-05,
"loss": 0.7875,
"step": 306
},
{
"epoch": 0.7542997542997543,
"grad_norm": 0.4077917564117238,
"learning_rate": 4.162112932604736e-05,
"loss": 0.7275,
"step": 307
},
{
"epoch": 0.7567567567567568,
"grad_norm": 0.4060094467217255,
"learning_rate": 4.157559198542805e-05,
"loss": 0.7783,
"step": 308
},
{
"epoch": 0.7592137592137592,
"grad_norm": 0.4130103975738772,
"learning_rate": 4.153005464480875e-05,
"loss": 0.6847,
"step": 309
},
{
"epoch": 0.7616707616707616,
"grad_norm": 0.3681636230585068,
"learning_rate": 4.148451730418943e-05,
"loss": 0.7531,
"step": 310
},
{
"epoch": 0.7641277641277642,
"grad_norm": 0.3827065341158274,
"learning_rate": 4.143897996357013e-05,
"loss": 0.7141,
"step": 311
},
{
"epoch": 0.7665847665847666,
"grad_norm": 0.29238085362688543,
"learning_rate": 4.1393442622950826e-05,
"loss": 0.6273,
"step": 312
},
{
"epoch": 0.769041769041769,
"grad_norm": 0.33937884647496835,
"learning_rate": 4.134790528233151e-05,
"loss": 0.6489,
"step": 313
},
{
"epoch": 0.7714987714987716,
"grad_norm": 0.3015348898927694,
"learning_rate": 4.1302367941712206e-05,
"loss": 0.5207,
"step": 314
},
{
"epoch": 0.773955773955774,
"grad_norm": 0.35134100703007254,
"learning_rate": 4.12568306010929e-05,
"loss": 0.7576,
"step": 315
},
{
"epoch": 0.7764127764127764,
"grad_norm": 0.31798902115911587,
"learning_rate": 4.1211293260473586e-05,
"loss": 0.6203,
"step": 316
},
{
"epoch": 0.7788697788697788,
"grad_norm": 0.35299888238401994,
"learning_rate": 4.116575591985428e-05,
"loss": 0.6875,
"step": 317
},
{
"epoch": 0.7813267813267813,
"grad_norm": 0.3525914582079822,
"learning_rate": 4.112021857923498e-05,
"loss": 0.6804,
"step": 318
},
{
"epoch": 0.7837837837837838,
"grad_norm": 0.3006720346358963,
"learning_rate": 4.107468123861566e-05,
"loss": 0.5346,
"step": 319
},
{
"epoch": 0.7862407862407862,
"grad_norm": 1.5252533561825474,
"learning_rate": 4.102914389799636e-05,
"loss": 0.7018,
"step": 320
},
{
"epoch": 0.7886977886977887,
"grad_norm": 0.32274770353739635,
"learning_rate": 4.098360655737705e-05,
"loss": 0.614,
"step": 321
},
{
"epoch": 0.7911547911547911,
"grad_norm": 0.32985165709996966,
"learning_rate": 4.093806921675774e-05,
"loss": 0.6607,
"step": 322
},
{
"epoch": 0.7936117936117936,
"grad_norm": 0.30025432983818734,
"learning_rate": 4.0892531876138436e-05,
"loss": 0.6357,
"step": 323
},
{
"epoch": 0.7960687960687961,
"grad_norm": 0.3049594116455463,
"learning_rate": 4.0846994535519126e-05,
"loss": 0.5822,
"step": 324
},
{
"epoch": 0.7985257985257985,
"grad_norm": 0.3629904661955952,
"learning_rate": 4.080145719489982e-05,
"loss": 0.6978,
"step": 325
},
{
"epoch": 0.800982800982801,
"grad_norm": 0.9634279527349047,
"learning_rate": 4.075591985428051e-05,
"loss": 0.8865,
"step": 326
},
{
"epoch": 0.8034398034398035,
"grad_norm": 0.45193045970841783,
"learning_rate": 4.07103825136612e-05,
"loss": 0.8065,
"step": 327
},
{
"epoch": 0.8058968058968059,
"grad_norm": 0.3177464973567778,
"learning_rate": 4.06648451730419e-05,
"loss": 0.711,
"step": 328
},
{
"epoch": 0.8083538083538083,
"grad_norm": 0.5049266007665172,
"learning_rate": 4.061930783242259e-05,
"loss": 0.788,
"step": 329
},
{
"epoch": 0.8108108108108109,
"grad_norm": 0.44936451115710957,
"learning_rate": 4.057377049180328e-05,
"loss": 0.6603,
"step": 330
},
{
"epoch": 0.8132678132678133,
"grad_norm": 0.40221025853337433,
"learning_rate": 4.0528233151183976e-05,
"loss": 0.6261,
"step": 331
},
{
"epoch": 0.8157248157248157,
"grad_norm": 0.38900176002138404,
"learning_rate": 4.0482695810564666e-05,
"loss": 0.6544,
"step": 332
},
{
"epoch": 0.8181818181818182,
"grad_norm": 0.453208732932394,
"learning_rate": 4.0437158469945356e-05,
"loss": 0.6353,
"step": 333
},
{
"epoch": 0.8206388206388207,
"grad_norm": 0.3681796156494085,
"learning_rate": 4.039162112932605e-05,
"loss": 0.6836,
"step": 334
},
{
"epoch": 0.8230958230958231,
"grad_norm": 0.468685040057859,
"learning_rate": 4.034608378870674e-05,
"loss": 0.7046,
"step": 335
},
{
"epoch": 0.8255528255528255,
"grad_norm": 0.43444130480919046,
"learning_rate": 4.030054644808743e-05,
"loss": 0.6659,
"step": 336
},
{
"epoch": 0.828009828009828,
"grad_norm": 0.3619248405794401,
"learning_rate": 4.025500910746813e-05,
"loss": 0.6417,
"step": 337
},
{
"epoch": 0.8304668304668305,
"grad_norm": 0.410561658075711,
"learning_rate": 4.020947176684881e-05,
"loss": 0.659,
"step": 338
},
{
"epoch": 0.8329238329238329,
"grad_norm": 0.349661211154494,
"learning_rate": 4.016393442622951e-05,
"loss": 0.7009,
"step": 339
},
{
"epoch": 0.8353808353808354,
"grad_norm": 0.45025633913904883,
"learning_rate": 4.0118397085610207e-05,
"loss": 0.7118,
"step": 340
},
{
"epoch": 0.8378378378378378,
"grad_norm": 0.3491439279038829,
"learning_rate": 4.007285974499089e-05,
"loss": 0.7326,
"step": 341
},
{
"epoch": 0.8402948402948403,
"grad_norm": 0.37516636206626935,
"learning_rate": 4.0027322404371587e-05,
"loss": 0.7005,
"step": 342
},
{
"epoch": 0.8427518427518428,
"grad_norm": 0.3135717435105698,
"learning_rate": 3.998178506375228e-05,
"loss": 0.6751,
"step": 343
},
{
"epoch": 0.8452088452088452,
"grad_norm": 0.45748071875834095,
"learning_rate": 3.9936247723132967e-05,
"loss": 0.7238,
"step": 344
},
{
"epoch": 0.8476658476658476,
"grad_norm": 0.43936046038898285,
"learning_rate": 3.989071038251366e-05,
"loss": 0.7568,
"step": 345
},
{
"epoch": 0.8501228501228502,
"grad_norm": 0.38829296038456096,
"learning_rate": 3.984517304189436e-05,
"loss": 0.6835,
"step": 346
},
{
"epoch": 0.8525798525798526,
"grad_norm": 0.45261007109171814,
"learning_rate": 3.979963570127504e-05,
"loss": 0.7626,
"step": 347
},
{
"epoch": 0.855036855036855,
"grad_norm": 0.3469325577394658,
"learning_rate": 3.975409836065574e-05,
"loss": 0.7497,
"step": 348
},
{
"epoch": 0.8574938574938575,
"grad_norm": 0.5400301615988978,
"learning_rate": 3.970856102003643e-05,
"loss": 0.8051,
"step": 349
},
{
"epoch": 0.85995085995086,
"grad_norm": 0.4001992360407668,
"learning_rate": 3.966302367941712e-05,
"loss": 0.7536,
"step": 350
},
{
"epoch": 0.8624078624078624,
"grad_norm": 0.3724180671895729,
"learning_rate": 3.961748633879782e-05,
"loss": 0.6238,
"step": 351
},
{
"epoch": 0.8648648648648649,
"grad_norm": 0.386974931071893,
"learning_rate": 3.957194899817851e-05,
"loss": 0.6876,
"step": 352
},
{
"epoch": 0.8673218673218673,
"grad_norm": 0.372863116265662,
"learning_rate": 3.95264116575592e-05,
"loss": 0.5849,
"step": 353
},
{
"epoch": 0.8697788697788698,
"grad_norm": 0.33795820672046467,
"learning_rate": 3.9480874316939894e-05,
"loss": 0.5205,
"step": 354
},
{
"epoch": 0.8722358722358723,
"grad_norm": 0.40729933902725135,
"learning_rate": 3.9435336976320584e-05,
"loss": 0.7655,
"step": 355
},
{
"epoch": 0.8746928746928747,
"grad_norm": 0.30755968744467366,
"learning_rate": 3.9389799635701274e-05,
"loss": 0.6263,
"step": 356
},
{
"epoch": 0.8771498771498771,
"grad_norm": 0.37093708872360476,
"learning_rate": 3.934426229508197e-05,
"loss": 0.7129,
"step": 357
},
{
"epoch": 0.8796068796068796,
"grad_norm": 0.37633511734635255,
"learning_rate": 3.929872495446266e-05,
"loss": 0.5872,
"step": 358
},
{
"epoch": 0.8820638820638821,
"grad_norm": 0.9614590556739387,
"learning_rate": 3.925318761384335e-05,
"loss": 0.7089,
"step": 359
},
{
"epoch": 0.8845208845208845,
"grad_norm": 0.36669325077055215,
"learning_rate": 3.920765027322405e-05,
"loss": 0.5196,
"step": 360
},
{
"epoch": 0.8869778869778869,
"grad_norm": 0.36063038368340206,
"learning_rate": 3.916211293260474e-05,
"loss": 0.7037,
"step": 361
},
{
"epoch": 0.8894348894348895,
"grad_norm": 0.3844550677877335,
"learning_rate": 3.9116575591985434e-05,
"loss": 0.6472,
"step": 362
},
{
"epoch": 0.8918918918918919,
"grad_norm": 0.36208926990085244,
"learning_rate": 3.9071038251366124e-05,
"loss": 0.6393,
"step": 363
},
{
"epoch": 0.8943488943488943,
"grad_norm": 0.36998305778442386,
"learning_rate": 3.9025500910746814e-05,
"loss": 0.7667,
"step": 364
},
{
"epoch": 0.8968058968058968,
"grad_norm": 0.3447294134836953,
"learning_rate": 3.897996357012751e-05,
"loss": 0.605,
"step": 365
},
{
"epoch": 0.8992628992628993,
"grad_norm": 0.36709184015795876,
"learning_rate": 3.89344262295082e-05,
"loss": 0.6642,
"step": 366
},
{
"epoch": 0.9017199017199017,
"grad_norm": 0.3486298961479053,
"learning_rate": 3.888888888888889e-05,
"loss": 0.6621,
"step": 367
},
{
"epoch": 0.9041769041769042,
"grad_norm": 0.4328843991656747,
"learning_rate": 3.884335154826959e-05,
"loss": 0.6797,
"step": 368
},
{
"epoch": 0.9066339066339066,
"grad_norm": 0.35617103914532294,
"learning_rate": 3.879781420765027e-05,
"loss": 0.6853,
"step": 369
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.48563000772634657,
"learning_rate": 3.875227686703097e-05,
"loss": 0.6981,
"step": 370
},
{
"epoch": 0.9115479115479116,
"grad_norm": 0.3887375137301516,
"learning_rate": 3.8706739526411664e-05,
"loss": 0.603,
"step": 371
},
{
"epoch": 0.914004914004914,
"grad_norm": 0.439470097514328,
"learning_rate": 3.866120218579235e-05,
"loss": 0.7077,
"step": 372
},
{
"epoch": 0.9164619164619164,
"grad_norm": 0.3403160171473462,
"learning_rate": 3.8615664845173044e-05,
"loss": 0.586,
"step": 373
},
{
"epoch": 0.918918918918919,
"grad_norm": 0.42760829158750546,
"learning_rate": 3.857012750455374e-05,
"loss": 0.7303,
"step": 374
},
{
"epoch": 0.9213759213759214,
"grad_norm": 0.36489243280535705,
"learning_rate": 3.8524590163934424e-05,
"loss": 0.6256,
"step": 375
},
{
"epoch": 0.9238329238329238,
"grad_norm": 0.3808217161262314,
"learning_rate": 3.847905282331512e-05,
"loss": 0.7059,
"step": 376
},
{
"epoch": 0.9262899262899262,
"grad_norm": 0.34013903969336157,
"learning_rate": 3.843351548269581e-05,
"loss": 0.7301,
"step": 377
},
{
"epoch": 0.9287469287469288,
"grad_norm": 1.1463809470744701,
"learning_rate": 3.83879781420765e-05,
"loss": 0.6969,
"step": 378
},
{
"epoch": 0.9312039312039312,
"grad_norm": 0.4235667833129601,
"learning_rate": 3.83424408014572e-05,
"loss": 0.6473,
"step": 379
},
{
"epoch": 0.9336609336609336,
"grad_norm": 0.79876765490425,
"learning_rate": 3.829690346083789e-05,
"loss": 0.6183,
"step": 380
},
{
"epoch": 0.9361179361179361,
"grad_norm": 0.49555963725341723,
"learning_rate": 3.825136612021858e-05,
"loss": 0.8044,
"step": 381
},
{
"epoch": 0.9385749385749386,
"grad_norm": 0.3428503165110703,
"learning_rate": 3.8205828779599275e-05,
"loss": 0.5995,
"step": 382
},
{
"epoch": 0.941031941031941,
"grad_norm": 0.44593307884321404,
"learning_rate": 3.8160291438979965e-05,
"loss": 0.7151,
"step": 383
},
{
"epoch": 0.9434889434889435,
"grad_norm": 0.37468176709006323,
"learning_rate": 3.8114754098360655e-05,
"loss": 0.7905,
"step": 384
},
{
"epoch": 0.9459459459459459,
"grad_norm": 0.5722646888774676,
"learning_rate": 3.806921675774135e-05,
"loss": 0.731,
"step": 385
},
{
"epoch": 0.9484029484029484,
"grad_norm": 1.8799684973155986,
"learning_rate": 3.802367941712204e-05,
"loss": 0.7714,
"step": 386
},
{
"epoch": 0.9508599508599509,
"grad_norm": 0.5172547101235551,
"learning_rate": 3.797814207650273e-05,
"loss": 0.6399,
"step": 387
},
{
"epoch": 0.9533169533169533,
"grad_norm": 0.4418711377815284,
"learning_rate": 3.793260473588343e-05,
"loss": 0.6997,
"step": 388
},
{
"epoch": 0.9557739557739557,
"grad_norm": 0.5285652919128196,
"learning_rate": 3.788706739526412e-05,
"loss": 0.704,
"step": 389
},
{
"epoch": 0.9582309582309583,
"grad_norm": 0.45024081362204066,
"learning_rate": 3.784153005464481e-05,
"loss": 0.7121,
"step": 390
},
{
"epoch": 0.9606879606879607,
"grad_norm": 0.4069199989712789,
"learning_rate": 3.7795992714025505e-05,
"loss": 0.6408,
"step": 391
},
{
"epoch": 0.9631449631449631,
"grad_norm": 0.4856083258958585,
"learning_rate": 3.7750455373406195e-05,
"loss": 0.6723,
"step": 392
},
{
"epoch": 0.9656019656019657,
"grad_norm": 0.3584054750131388,
"learning_rate": 3.7704918032786885e-05,
"loss": 0.619,
"step": 393
},
{
"epoch": 0.9680589680589681,
"grad_norm": 0.46503131404325265,
"learning_rate": 3.765938069216758e-05,
"loss": 0.7499,
"step": 394
},
{
"epoch": 0.9705159705159705,
"grad_norm": 0.3568325978396338,
"learning_rate": 3.761384335154827e-05,
"loss": 0.6153,
"step": 395
},
{
"epoch": 0.972972972972973,
"grad_norm": 0.35548746505907636,
"learning_rate": 3.756830601092896e-05,
"loss": 0.6644,
"step": 396
},
{
"epoch": 0.9754299754299754,
"grad_norm": 3.732099146967768,
"learning_rate": 3.752276867030965e-05,
"loss": 0.7677,
"step": 397
},
{
"epoch": 0.9778869778869779,
"grad_norm": 0.6417926585769745,
"learning_rate": 3.747723132969035e-05,
"loss": 0.713,
"step": 398
},
{
"epoch": 0.9803439803439803,
"grad_norm": 0.3428338885926231,
"learning_rate": 3.7431693989071045e-05,
"loss": 0.7057,
"step": 399
},
{
"epoch": 0.9828009828009828,
"grad_norm": 0.44136452162974704,
"learning_rate": 3.738615664845173e-05,
"loss": 0.5293,
"step": 400
},
{
"epoch": 0.9852579852579852,
"grad_norm": 0.4267208521085863,
"learning_rate": 3.7340619307832425e-05,
"loss": 0.6846,
"step": 401
},
{
"epoch": 0.9877149877149877,
"grad_norm": 0.36579693412461944,
"learning_rate": 3.729508196721312e-05,
"loss": 0.7243,
"step": 402
},
{
"epoch": 0.9901719901719902,
"grad_norm": 0.46204688211658324,
"learning_rate": 3.7249544626593805e-05,
"loss": 0.68,
"step": 403
},
{
"epoch": 0.9926289926289926,
"grad_norm": 0.37956013971155556,
"learning_rate": 3.72040072859745e-05,
"loss": 0.7447,
"step": 404
},
{
"epoch": 0.995085995085995,
"grad_norm": 0.3910625026439214,
"learning_rate": 3.71584699453552e-05,
"loss": 0.6197,
"step": 405
},
{
"epoch": 0.9975429975429976,
"grad_norm": 0.41783305217284267,
"learning_rate": 3.711293260473588e-05,
"loss": 0.6719,
"step": 406
},
{
"epoch": 1.0,
"grad_norm": 0.3611623758486256,
"learning_rate": 3.706739526411658e-05,
"loss": 0.5977,
"step": 407
},
{
"epoch": 1.0024570024570025,
"grad_norm": 0.4712316660517998,
"learning_rate": 3.702185792349727e-05,
"loss": 0.6267,
"step": 408
},
{
"epoch": 1.0049140049140048,
"grad_norm": 0.44510865147589923,
"learning_rate": 3.697632058287796e-05,
"loss": 0.5723,
"step": 409
},
{
"epoch": 1.0073710073710074,
"grad_norm": 0.4897737184802636,
"learning_rate": 3.6930783242258656e-05,
"loss": 0.6133,
"step": 410
},
{
"epoch": 1.00982800982801,
"grad_norm": 0.4710019531923247,
"learning_rate": 3.6885245901639346e-05,
"loss": 0.6601,
"step": 411
},
{
"epoch": 1.0122850122850122,
"grad_norm": 0.4127476864637772,
"learning_rate": 3.6839708561020036e-05,
"loss": 0.5831,
"step": 412
},
{
"epoch": 1.0147420147420148,
"grad_norm": 0.3852466347026918,
"learning_rate": 3.679417122040073e-05,
"loss": 0.6171,
"step": 413
},
{
"epoch": 1.0171990171990173,
"grad_norm": 0.35722854453354774,
"learning_rate": 3.674863387978142e-05,
"loss": 0.4941,
"step": 414
},
{
"epoch": 1.0196560196560196,
"grad_norm": 0.3477409452059263,
"learning_rate": 3.670309653916211e-05,
"loss": 0.6014,
"step": 415
},
{
"epoch": 1.0221130221130221,
"grad_norm": 0.38573394146966594,
"learning_rate": 3.665755919854281e-05,
"loss": 0.5435,
"step": 416
},
{
"epoch": 1.0245700245700247,
"grad_norm": 0.3152965022867117,
"learning_rate": 3.66120218579235e-05,
"loss": 0.5363,
"step": 417
},
{
"epoch": 1.027027027027027,
"grad_norm": 0.37855487804654653,
"learning_rate": 3.656648451730419e-05,
"loss": 0.6216,
"step": 418
},
{
"epoch": 1.0294840294840295,
"grad_norm": 0.3915386797411922,
"learning_rate": 3.6520947176684886e-05,
"loss": 0.6468,
"step": 419
},
{
"epoch": 1.031941031941032,
"grad_norm": 0.30903418418917916,
"learning_rate": 3.6475409836065576e-05,
"loss": 0.5918,
"step": 420
},
{
"epoch": 1.0343980343980343,
"grad_norm": 18.485814215831798,
"learning_rate": 3.6429872495446266e-05,
"loss": 0.7571,
"step": 421
},
{
"epoch": 1.0368550368550369,
"grad_norm": 0.43418803474006623,
"learning_rate": 3.638433515482696e-05,
"loss": 0.4651,
"step": 422
},
{
"epoch": 1.0393120393120394,
"grad_norm": 0.4296276366274725,
"learning_rate": 3.633879781420765e-05,
"loss": 0.569,
"step": 423
},
{
"epoch": 1.0417690417690417,
"grad_norm": 0.3252040498050024,
"learning_rate": 3.629326047358834e-05,
"loss": 0.5682,
"step": 424
},
{
"epoch": 1.0442260442260443,
"grad_norm": 0.5555580641102786,
"learning_rate": 3.624772313296903e-05,
"loss": 0.5685,
"step": 425
},
{
"epoch": 1.0466830466830466,
"grad_norm": 0.30439876353558465,
"learning_rate": 3.620218579234973e-05,
"loss": 0.5509,
"step": 426
},
{
"epoch": 1.049140049140049,
"grad_norm": 0.5257024496923978,
"learning_rate": 3.615664845173042e-05,
"loss": 0.6175,
"step": 427
},
{
"epoch": 1.0515970515970516,
"grad_norm": 0.3924880233071523,
"learning_rate": 3.611111111111111e-05,
"loss": 0.5463,
"step": 428
},
{
"epoch": 1.054054054054054,
"grad_norm": 0.3912483665248679,
"learning_rate": 3.6065573770491806e-05,
"loss": 0.5172,
"step": 429
},
{
"epoch": 1.0565110565110565,
"grad_norm": 0.35522183054743234,
"learning_rate": 3.6020036429872496e-05,
"loss": 0.548,
"step": 430
},
{
"epoch": 1.058968058968059,
"grad_norm": 0.41696382741795146,
"learning_rate": 3.5974499089253186e-05,
"loss": 0.6179,
"step": 431
},
{
"epoch": 1.0614250614250613,
"grad_norm": 0.34899632677634346,
"learning_rate": 3.592896174863388e-05,
"loss": 0.5848,
"step": 432
},
{
"epoch": 1.0638820638820639,
"grad_norm": 0.29764763902529734,
"learning_rate": 3.588342440801457e-05,
"loss": 0.5065,
"step": 433
},
{
"epoch": 1.0663390663390664,
"grad_norm": 0.33789418991474374,
"learning_rate": 3.583788706739526e-05,
"loss": 0.557,
"step": 434
},
{
"epoch": 1.0687960687960687,
"grad_norm": 0.3817072319681774,
"learning_rate": 3.579234972677596e-05,
"loss": 0.573,
"step": 435
},
{
"epoch": 1.0712530712530712,
"grad_norm": 0.27883801849612727,
"learning_rate": 3.574681238615665e-05,
"loss": 0.4778,
"step": 436
},
{
"epoch": 1.0737100737100738,
"grad_norm": 0.3923116193005877,
"learning_rate": 3.570127504553734e-05,
"loss": 0.5919,
"step": 437
},
{
"epoch": 1.076167076167076,
"grad_norm": 0.29914831145059495,
"learning_rate": 3.5655737704918037e-05,
"loss": 0.494,
"step": 438
},
{
"epoch": 1.0786240786240786,
"grad_norm": 0.31767336538989416,
"learning_rate": 3.5610200364298727e-05,
"loss": 0.6199,
"step": 439
},
{
"epoch": 1.0810810810810811,
"grad_norm": 0.46913096826211653,
"learning_rate": 3.5564663023679417e-05,
"loss": 0.6955,
"step": 440
},
{
"epoch": 1.0835380835380835,
"grad_norm": 0.3675875371319456,
"learning_rate": 3.551912568306011e-05,
"loss": 0.538,
"step": 441
},
{
"epoch": 1.085995085995086,
"grad_norm": 0.3330032586684102,
"learning_rate": 3.54735883424408e-05,
"loss": 0.5659,
"step": 442
},
{
"epoch": 1.0884520884520885,
"grad_norm": 0.39684518158418425,
"learning_rate": 3.542805100182149e-05,
"loss": 0.4902,
"step": 443
},
{
"epoch": 1.0909090909090908,
"grad_norm": 0.3370350178101319,
"learning_rate": 3.538251366120219e-05,
"loss": 0.6277,
"step": 444
},
{
"epoch": 1.0933660933660934,
"grad_norm": 0.3120031541968653,
"learning_rate": 3.533697632058288e-05,
"loss": 0.5705,
"step": 445
},
{
"epoch": 1.095823095823096,
"grad_norm": 0.35804818545314876,
"learning_rate": 3.529143897996357e-05,
"loss": 0.5268,
"step": 446
},
{
"epoch": 1.0982800982800982,
"grad_norm": 0.36340510531282566,
"learning_rate": 3.524590163934427e-05,
"loss": 0.5328,
"step": 447
},
{
"epoch": 1.1007371007371007,
"grad_norm": 0.3098836614900157,
"learning_rate": 3.520036429872496e-05,
"loss": 0.5773,
"step": 448
},
{
"epoch": 1.1031941031941033,
"grad_norm": 0.3135507590572425,
"learning_rate": 3.515482695810565e-05,
"loss": 0.6188,
"step": 449
},
{
"epoch": 1.1056511056511056,
"grad_norm": 0.31164002022216103,
"learning_rate": 3.5109289617486344e-05,
"loss": 0.4606,
"step": 450
},
{
"epoch": 1.1081081081081081,
"grad_norm": 0.3623278294612082,
"learning_rate": 3.5063752276867034e-05,
"loss": 0.6439,
"step": 451
},
{
"epoch": 1.1105651105651106,
"grad_norm": 0.296521047913555,
"learning_rate": 3.5018214936247724e-05,
"loss": 0.4695,
"step": 452
},
{
"epoch": 1.113022113022113,
"grad_norm": 0.3483084595473505,
"learning_rate": 3.4972677595628414e-05,
"loss": 0.5615,
"step": 453
},
{
"epoch": 1.1154791154791155,
"grad_norm": 0.2860532290662123,
"learning_rate": 3.492714025500911e-05,
"loss": 0.5426,
"step": 454
},
{
"epoch": 1.117936117936118,
"grad_norm": 0.35904055869223206,
"learning_rate": 3.48816029143898e-05,
"loss": 0.6372,
"step": 455
},
{
"epoch": 1.1203931203931203,
"grad_norm": 0.3035047945160019,
"learning_rate": 3.483606557377049e-05,
"loss": 0.5084,
"step": 456
},
{
"epoch": 1.1228501228501229,
"grad_norm": 0.34056825729709134,
"learning_rate": 3.479052823315119e-05,
"loss": 0.5445,
"step": 457
},
{
"epoch": 1.1253071253071254,
"grad_norm": 0.34548063719869543,
"learning_rate": 3.474499089253188e-05,
"loss": 0.5538,
"step": 458
},
{
"epoch": 1.1277641277641277,
"grad_norm": 0.34863453010817147,
"learning_rate": 3.469945355191257e-05,
"loss": 0.6136,
"step": 459
},
{
"epoch": 1.1302211302211302,
"grad_norm": 0.36452640020436167,
"learning_rate": 3.4653916211293264e-05,
"loss": 0.6339,
"step": 460
},
{
"epoch": 1.1326781326781328,
"grad_norm": 0.33505641304640355,
"learning_rate": 3.4608378870673954e-05,
"loss": 0.5226,
"step": 461
},
{
"epoch": 1.135135135135135,
"grad_norm": 0.5832869535948028,
"learning_rate": 3.4562841530054644e-05,
"loss": 0.6528,
"step": 462
},
{
"epoch": 1.1375921375921376,
"grad_norm": 0.29618924105134536,
"learning_rate": 3.451730418943534e-05,
"loss": 0.6025,
"step": 463
},
{
"epoch": 1.1400491400491402,
"grad_norm": 0.34874600771453107,
"learning_rate": 3.447176684881603e-05,
"loss": 0.5565,
"step": 464
},
{
"epoch": 1.1425061425061425,
"grad_norm": 0.335951908594719,
"learning_rate": 3.442622950819672e-05,
"loss": 0.547,
"step": 465
},
{
"epoch": 1.144963144963145,
"grad_norm": 0.2998993608726187,
"learning_rate": 3.438069216757742e-05,
"loss": 0.628,
"step": 466
},
{
"epoch": 1.1474201474201475,
"grad_norm": 0.29644218347091184,
"learning_rate": 3.433515482695811e-05,
"loss": 0.52,
"step": 467
},
{
"epoch": 1.1498771498771498,
"grad_norm": 0.30863434769848686,
"learning_rate": 3.42896174863388e-05,
"loss": 0.5253,
"step": 468
},
{
"epoch": 1.1523341523341524,
"grad_norm": 0.28232514356630184,
"learning_rate": 3.4244080145719494e-05,
"loss": 0.5264,
"step": 469
},
{
"epoch": 1.154791154791155,
"grad_norm": 0.3486029632281899,
"learning_rate": 3.4198542805100184e-05,
"loss": 0.5337,
"step": 470
},
{
"epoch": 1.1572481572481572,
"grad_norm": 0.2749244379146869,
"learning_rate": 3.4153005464480874e-05,
"loss": 0.4396,
"step": 471
},
{
"epoch": 1.1597051597051597,
"grad_norm": 0.35073763579329614,
"learning_rate": 3.410746812386157e-05,
"loss": 0.5767,
"step": 472
},
{
"epoch": 1.1621621621621623,
"grad_norm": 0.3148751339175056,
"learning_rate": 3.406193078324226e-05,
"loss": 0.554,
"step": 473
},
{
"epoch": 1.1646191646191646,
"grad_norm": 0.31661478461777187,
"learning_rate": 3.401639344262295e-05,
"loss": 0.6312,
"step": 474
},
{
"epoch": 1.1670761670761671,
"grad_norm": 0.32266558978084553,
"learning_rate": 3.397085610200365e-05,
"loss": 0.5549,
"step": 475
},
{
"epoch": 1.1695331695331694,
"grad_norm": 0.31175094191334074,
"learning_rate": 3.392531876138434e-05,
"loss": 0.6031,
"step": 476
},
{
"epoch": 1.171990171990172,
"grad_norm": 0.2860842816292032,
"learning_rate": 3.387978142076503e-05,
"loss": 0.5033,
"step": 477
},
{
"epoch": 1.1744471744471745,
"grad_norm": 0.2863055488397975,
"learning_rate": 3.3834244080145725e-05,
"loss": 0.5826,
"step": 478
},
{
"epoch": 1.1769041769041768,
"grad_norm": 0.2814884571892455,
"learning_rate": 3.3788706739526415e-05,
"loss": 0.6098,
"step": 479
},
{
"epoch": 1.1793611793611793,
"grad_norm": 0.3343616425168066,
"learning_rate": 3.3743169398907105e-05,
"loss": 0.6576,
"step": 480
},
{
"epoch": 1.1818181818181819,
"grad_norm": 4.471655389420487,
"learning_rate": 3.36976320582878e-05,
"loss": 0.6501,
"step": 481
},
{
"epoch": 1.1842751842751842,
"grad_norm": 0.3531434211683213,
"learning_rate": 3.365209471766849e-05,
"loss": 0.5736,
"step": 482
},
{
"epoch": 1.1867321867321867,
"grad_norm": 0.30933282032145204,
"learning_rate": 3.360655737704918e-05,
"loss": 0.5773,
"step": 483
},
{
"epoch": 1.1891891891891893,
"grad_norm": 0.34749618430933105,
"learning_rate": 3.356102003642987e-05,
"loss": 0.5344,
"step": 484
},
{
"epoch": 1.1916461916461916,
"grad_norm": 0.2890952500864336,
"learning_rate": 3.351548269581057e-05,
"loss": 0.5979,
"step": 485
},
{
"epoch": 1.194103194103194,
"grad_norm": 0.34484921930011087,
"learning_rate": 3.346994535519126e-05,
"loss": 0.5318,
"step": 486
},
{
"epoch": 1.1965601965601966,
"grad_norm": 0.30984886065289263,
"learning_rate": 3.342440801457195e-05,
"loss": 0.5531,
"step": 487
},
{
"epoch": 1.199017199017199,
"grad_norm": 0.32020672210102435,
"learning_rate": 3.3378870673952645e-05,
"loss": 0.559,
"step": 488
},
{
"epoch": 1.2014742014742015,
"grad_norm": 0.3715980189408075,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.5414,
"step": 489
},
{
"epoch": 1.203931203931204,
"grad_norm": 0.2867099183140612,
"learning_rate": 3.3287795992714025e-05,
"loss": 0.5016,
"step": 490
},
{
"epoch": 1.2063882063882063,
"grad_norm": 0.32647658657343387,
"learning_rate": 3.324225865209472e-05,
"loss": 0.5668,
"step": 491
},
{
"epoch": 1.2088452088452089,
"grad_norm": 0.31285287963181513,
"learning_rate": 3.319672131147541e-05,
"loss": 0.5808,
"step": 492
},
{
"epoch": 1.2113022113022114,
"grad_norm": 0.31154263564497325,
"learning_rate": 3.31511839708561e-05,
"loss": 0.577,
"step": 493
},
{
"epoch": 1.2137592137592137,
"grad_norm": 0.3148888983694767,
"learning_rate": 3.31056466302368e-05,
"loss": 0.5713,
"step": 494
},
{
"epoch": 1.2162162162162162,
"grad_norm": 0.33196948700396134,
"learning_rate": 3.306010928961749e-05,
"loss": 0.6411,
"step": 495
},
{
"epoch": 1.2186732186732188,
"grad_norm": 0.3089241773992785,
"learning_rate": 3.301457194899818e-05,
"loss": 0.6084,
"step": 496
},
{
"epoch": 1.221130221130221,
"grad_norm": 0.35264205238860336,
"learning_rate": 3.2969034608378875e-05,
"loss": 0.6082,
"step": 497
},
{
"epoch": 1.2235872235872236,
"grad_norm": 0.3592504157610499,
"learning_rate": 3.2923497267759565e-05,
"loss": 0.5017,
"step": 498
},
{
"epoch": 1.2260442260442261,
"grad_norm": 0.3294945441126368,
"learning_rate": 3.2877959927140255e-05,
"loss": 0.5671,
"step": 499
},
{
"epoch": 1.2285012285012284,
"grad_norm": 0.31804938107229946,
"learning_rate": 3.283242258652095e-05,
"loss": 0.5706,
"step": 500
},
{
"epoch": 1.230958230958231,
"grad_norm": 0.2933642876504185,
"learning_rate": 3.2786885245901635e-05,
"loss": 0.5426,
"step": 501
},
{
"epoch": 1.2334152334152333,
"grad_norm": 0.3626340514862369,
"learning_rate": 3.274134790528233e-05,
"loss": 0.6105,
"step": 502
},
{
"epoch": 1.2358722358722358,
"grad_norm": 0.26476010226570695,
"learning_rate": 3.269581056466303e-05,
"loss": 0.4702,
"step": 503
},
{
"epoch": 1.2383292383292384,
"grad_norm": 0.3661036271637661,
"learning_rate": 3.265027322404371e-05,
"loss": 0.542,
"step": 504
},
{
"epoch": 1.2407862407862407,
"grad_norm": 0.3421274093595941,
"learning_rate": 3.260473588342441e-05,
"loss": 0.5814,
"step": 505
},
{
"epoch": 1.2432432432432432,
"grad_norm": 0.27100222834936427,
"learning_rate": 3.2559198542805106e-05,
"loss": 0.5478,
"step": 506
},
{
"epoch": 1.2457002457002457,
"grad_norm": 0.314120753601731,
"learning_rate": 3.251366120218579e-05,
"loss": 0.5531,
"step": 507
},
{
"epoch": 1.248157248157248,
"grad_norm": 0.9759156709730757,
"learning_rate": 3.2468123861566486e-05,
"loss": 0.6531,
"step": 508
},
{
"epoch": 1.2506142506142506,
"grad_norm": 0.30944457432745653,
"learning_rate": 3.242258652094718e-05,
"loss": 0.5513,
"step": 509
},
{
"epoch": 1.253071253071253,
"grad_norm": 0.3010475271711826,
"learning_rate": 3.237704918032787e-05,
"loss": 0.5095,
"step": 510
},
{
"epoch": 1.2555282555282554,
"grad_norm": 2.091229835428742,
"learning_rate": 3.233151183970856e-05,
"loss": 0.6917,
"step": 511
},
{
"epoch": 1.257985257985258,
"grad_norm": 0.4263480510636171,
"learning_rate": 3.228597449908925e-05,
"loss": 0.5107,
"step": 512
},
{
"epoch": 1.2604422604422605,
"grad_norm": 0.2662240671218934,
"learning_rate": 3.224043715846995e-05,
"loss": 0.5171,
"step": 513
},
{
"epoch": 1.2628992628992628,
"grad_norm": 0.38958730612737474,
"learning_rate": 3.219489981785064e-05,
"loss": 0.5062,
"step": 514
},
{
"epoch": 1.2653562653562653,
"grad_norm": 0.30418756289720655,
"learning_rate": 3.214936247723133e-05,
"loss": 0.5835,
"step": 515
},
{
"epoch": 1.2678132678132679,
"grad_norm": 0.344436665503126,
"learning_rate": 3.2103825136612026e-05,
"loss": 0.5983,
"step": 516
},
{
"epoch": 1.2702702702702702,
"grad_norm": 1.918725499774248,
"learning_rate": 3.2058287795992716e-05,
"loss": 0.6293,
"step": 517
},
{
"epoch": 1.2727272727272727,
"grad_norm": 0.3623753413503759,
"learning_rate": 3.2012750455373406e-05,
"loss": 0.631,
"step": 518
},
{
"epoch": 1.2751842751842752,
"grad_norm": 0.4577653918156244,
"learning_rate": 3.19672131147541e-05,
"loss": 0.5013,
"step": 519
},
{
"epoch": 1.2776412776412776,
"grad_norm": 0.31126359791794433,
"learning_rate": 3.192167577413479e-05,
"loss": 0.502,
"step": 520
},
{
"epoch": 1.28009828009828,
"grad_norm": 0.30127450296424224,
"learning_rate": 3.187613843351548e-05,
"loss": 0.4445,
"step": 521
},
{
"epoch": 1.2825552825552826,
"grad_norm": 0.32777802361056146,
"learning_rate": 3.183060109289618e-05,
"loss": 0.586,
"step": 522
},
{
"epoch": 1.285012285012285,
"grad_norm": 0.3151574260038467,
"learning_rate": 3.178506375227687e-05,
"loss": 0.5101,
"step": 523
},
{
"epoch": 1.2874692874692875,
"grad_norm": 0.2958405193987708,
"learning_rate": 3.173952641165756e-05,
"loss": 0.5115,
"step": 524
},
{
"epoch": 1.28992628992629,
"grad_norm": 0.30692569753814974,
"learning_rate": 3.1693989071038256e-05,
"loss": 0.5255,
"step": 525
},
{
"epoch": 1.2923832923832923,
"grad_norm": 0.31369349705521754,
"learning_rate": 3.1648451730418946e-05,
"loss": 0.5708,
"step": 526
},
{
"epoch": 1.2948402948402948,
"grad_norm": 0.2818423915221156,
"learning_rate": 3.1602914389799636e-05,
"loss": 0.4837,
"step": 527
},
{
"epoch": 1.2972972972972974,
"grad_norm": 0.3134826582265648,
"learning_rate": 3.155737704918033e-05,
"loss": 0.5751,
"step": 528
},
{
"epoch": 1.2997542997542997,
"grad_norm": 0.2816827747685129,
"learning_rate": 3.1511839708561016e-05,
"loss": 0.4596,
"step": 529
},
{
"epoch": 1.3022113022113022,
"grad_norm": 0.34084186090096374,
"learning_rate": 3.146630236794171e-05,
"loss": 0.5656,
"step": 530
},
{
"epoch": 1.3046683046683047,
"grad_norm": 0.32476535285413916,
"learning_rate": 3.142076502732241e-05,
"loss": 0.5661,
"step": 531
},
{
"epoch": 1.307125307125307,
"grad_norm": 0.3188888254272654,
"learning_rate": 3.137522768670309e-05,
"loss": 0.5269,
"step": 532
},
{
"epoch": 1.3095823095823096,
"grad_norm": 0.3366341919026146,
"learning_rate": 3.132969034608379e-05,
"loss": 0.4923,
"step": 533
},
{
"epoch": 1.3120393120393121,
"grad_norm": 0.3271992624122109,
"learning_rate": 3.1284153005464487e-05,
"loss": 0.5977,
"step": 534
},
{
"epoch": 1.3144963144963144,
"grad_norm": 0.417085896710461,
"learning_rate": 3.123861566484517e-05,
"loss": 0.614,
"step": 535
},
{
"epoch": 1.316953316953317,
"grad_norm": 0.31946680031176,
"learning_rate": 3.1193078324225867e-05,
"loss": 0.5364,
"step": 536
},
{
"epoch": 1.3194103194103195,
"grad_norm": 0.34172653254662405,
"learning_rate": 3.114754098360656e-05,
"loss": 0.5552,
"step": 537
},
{
"epoch": 1.3218673218673218,
"grad_norm": 0.334367874832506,
"learning_rate": 3.1102003642987247e-05,
"loss": 0.5969,
"step": 538
},
{
"epoch": 1.3243243243243243,
"grad_norm": 0.3986000529335846,
"learning_rate": 3.105646630236794e-05,
"loss": 0.499,
"step": 539
},
{
"epoch": 1.3267813267813269,
"grad_norm": 0.30475256236149134,
"learning_rate": 3.101092896174863e-05,
"loss": 0.5106,
"step": 540
},
{
"epoch": 1.3292383292383292,
"grad_norm": 0.3316364648381355,
"learning_rate": 3.096539162112932e-05,
"loss": 0.5334,
"step": 541
},
{
"epoch": 1.3316953316953317,
"grad_norm": 0.409941745047023,
"learning_rate": 3.091985428051002e-05,
"loss": 0.6345,
"step": 542
},
{
"epoch": 1.3341523341523343,
"grad_norm": 0.3401524473507645,
"learning_rate": 3.087431693989071e-05,
"loss": 0.6766,
"step": 543
},
{
"epoch": 1.3366093366093366,
"grad_norm": 0.3950305885673271,
"learning_rate": 3.082877959927141e-05,
"loss": 0.5296,
"step": 544
},
{
"epoch": 1.339066339066339,
"grad_norm": 0.306723619335892,
"learning_rate": 3.07832422586521e-05,
"loss": 0.6201,
"step": 545
},
{
"epoch": 1.3415233415233416,
"grad_norm": 0.3877898069868618,
"learning_rate": 3.073770491803279e-05,
"loss": 0.5411,
"step": 546
},
{
"epoch": 1.343980343980344,
"grad_norm": 0.31598719997076186,
"learning_rate": 3.0692167577413484e-05,
"loss": 0.5121,
"step": 547
},
{
"epoch": 1.3464373464373465,
"grad_norm": 0.3712193743058151,
"learning_rate": 3.0646630236794174e-05,
"loss": 0.5978,
"step": 548
},
{
"epoch": 1.348894348894349,
"grad_norm": 0.33020226938329394,
"learning_rate": 3.0601092896174864e-05,
"loss": 0.4806,
"step": 549
},
{
"epoch": 1.3513513513513513,
"grad_norm": 0.3248140873697447,
"learning_rate": 3.055555555555556e-05,
"loss": 0.5433,
"step": 550
},
{
"epoch": 1.3538083538083538,
"grad_norm": 0.3230001924138346,
"learning_rate": 3.0510018214936247e-05,
"loss": 0.5853,
"step": 551
},
{
"epoch": 1.3562653562653564,
"grad_norm": 0.35792498410700313,
"learning_rate": 3.046448087431694e-05,
"loss": 0.4961,
"step": 552
},
{
"epoch": 1.3587223587223587,
"grad_norm": 0.37595474090197006,
"learning_rate": 3.0418943533697637e-05,
"loss": 0.5736,
"step": 553
},
{
"epoch": 1.3611793611793612,
"grad_norm": 4.505487497550051,
"learning_rate": 3.0373406193078324e-05,
"loss": 0.7102,
"step": 554
},
{
"epoch": 1.3636363636363638,
"grad_norm": 0.4374726309839091,
"learning_rate": 3.0327868852459017e-05,
"loss": 0.5518,
"step": 555
},
{
"epoch": 1.366093366093366,
"grad_norm": 0.3410372388042177,
"learning_rate": 3.028233151183971e-05,
"loss": 0.5756,
"step": 556
},
{
"epoch": 1.3685503685503686,
"grad_norm": 0.4059522167570831,
"learning_rate": 3.02367941712204e-05,
"loss": 0.5434,
"step": 557
},
{
"epoch": 1.3710073710073711,
"grad_norm": 1.27646389360509,
"learning_rate": 3.0191256830601094e-05,
"loss": 0.4415,
"step": 558
},
{
"epoch": 1.3734643734643734,
"grad_norm": 0.39439443949738967,
"learning_rate": 3.0145719489981787e-05,
"loss": 0.5545,
"step": 559
},
{
"epoch": 1.375921375921376,
"grad_norm": 0.3677529411655735,
"learning_rate": 3.0100182149362477e-05,
"loss": 0.5867,
"step": 560
},
{
"epoch": 1.3783783783783785,
"grad_norm": 0.3938192601050454,
"learning_rate": 3.005464480874317e-05,
"loss": 0.6695,
"step": 561
},
{
"epoch": 1.3808353808353808,
"grad_norm": 0.4188986867298627,
"learning_rate": 3.0009107468123864e-05,
"loss": 0.6397,
"step": 562
},
{
"epoch": 1.3832923832923834,
"grad_norm": 0.36496617388518987,
"learning_rate": 2.9963570127504554e-05,
"loss": 0.642,
"step": 563
},
{
"epoch": 1.3857493857493859,
"grad_norm": 0.36732841184379483,
"learning_rate": 2.9918032786885248e-05,
"loss": 0.6248,
"step": 564
},
{
"epoch": 1.3882063882063882,
"grad_norm": 0.3287638014013039,
"learning_rate": 2.987249544626594e-05,
"loss": 0.5528,
"step": 565
},
{
"epoch": 1.3906633906633907,
"grad_norm": 0.3290826647548518,
"learning_rate": 2.982695810564663e-05,
"loss": 0.5995,
"step": 566
},
{
"epoch": 1.393120393120393,
"grad_norm": 7.1958757083746585,
"learning_rate": 2.9781420765027324e-05,
"loss": 1.1998,
"step": 567
},
{
"epoch": 1.3955773955773956,
"grad_norm": 2.242377112943564,
"learning_rate": 2.9735883424408018e-05,
"loss": 0.6146,
"step": 568
},
{
"epoch": 1.398034398034398,
"grad_norm": 0.46246249293412817,
"learning_rate": 2.9690346083788704e-05,
"loss": 0.6113,
"step": 569
},
{
"epoch": 1.4004914004914004,
"grad_norm": 0.32979804742212093,
"learning_rate": 2.96448087431694e-05,
"loss": 0.4534,
"step": 570
},
{
"epoch": 1.402948402948403,
"grad_norm": 0.362610751520732,
"learning_rate": 2.9599271402550094e-05,
"loss": 0.6311,
"step": 571
},
{
"epoch": 1.4054054054054055,
"grad_norm": 0.3834824335793335,
"learning_rate": 2.955373406193078e-05,
"loss": 0.5671,
"step": 572
},
{
"epoch": 1.4078624078624078,
"grad_norm": 0.3162740542302538,
"learning_rate": 2.9508196721311478e-05,
"loss": 0.4862,
"step": 573
},
{
"epoch": 1.4103194103194103,
"grad_norm": 0.6767191055749541,
"learning_rate": 2.946265938069217e-05,
"loss": 0.574,
"step": 574
},
{
"epoch": 1.4127764127764126,
"grad_norm": 0.2842423993917733,
"learning_rate": 2.9417122040072858e-05,
"loss": 0.4788,
"step": 575
},
{
"epoch": 1.4152334152334152,
"grad_norm": 0.4398113214411677,
"learning_rate": 2.937158469945355e-05,
"loss": 0.5164,
"step": 576
},
{
"epoch": 1.4176904176904177,
"grad_norm": 0.33122596395932824,
"learning_rate": 2.9326047358834248e-05,
"loss": 0.5859,
"step": 577
},
{
"epoch": 1.42014742014742,
"grad_norm": 0.6087999515003836,
"learning_rate": 2.9280510018214935e-05,
"loss": 0.5303,
"step": 578
},
{
"epoch": 1.4226044226044225,
"grad_norm": 0.32726522540965197,
"learning_rate": 2.9234972677595628e-05,
"loss": 0.5724,
"step": 579
},
{
"epoch": 1.425061425061425,
"grad_norm": 0.4106956970848515,
"learning_rate": 2.918943533697632e-05,
"loss": 0.5341,
"step": 580
},
{
"epoch": 1.4275184275184274,
"grad_norm": 0.2852492527312244,
"learning_rate": 2.9143897996357018e-05,
"loss": 0.5196,
"step": 581
},
{
"epoch": 1.42997542997543,
"grad_norm": 0.285739926248913,
"learning_rate": 2.9098360655737705e-05,
"loss": 0.5442,
"step": 582
},
{
"epoch": 1.4324324324324325,
"grad_norm": 0.36932760352978566,
"learning_rate": 2.9052823315118398e-05,
"loss": 0.6142,
"step": 583
},
{
"epoch": 1.4348894348894348,
"grad_norm": 0.33676368381537514,
"learning_rate": 2.9007285974499095e-05,
"loss": 0.6479,
"step": 584
},
{
"epoch": 1.4373464373464373,
"grad_norm": 2.762615052994114,
"learning_rate": 2.896174863387978e-05,
"loss": 0.5999,
"step": 585
},
{
"epoch": 1.4398034398034398,
"grad_norm": 0.33823584373786114,
"learning_rate": 2.8916211293260475e-05,
"loss": 0.5738,
"step": 586
},
{
"epoch": 1.4422604422604421,
"grad_norm": 0.35461746489514906,
"learning_rate": 2.8870673952641168e-05,
"loss": 0.5155,
"step": 587
},
{
"epoch": 1.4447174447174447,
"grad_norm": 0.30665641874707567,
"learning_rate": 2.8825136612021858e-05,
"loss": 0.5726,
"step": 588
},
{
"epoch": 1.4471744471744472,
"grad_norm": 0.3218144044024646,
"learning_rate": 2.877959927140255e-05,
"loss": 0.5198,
"step": 589
},
{
"epoch": 1.4496314496314495,
"grad_norm": 0.3282045050488162,
"learning_rate": 2.8734061930783245e-05,
"loss": 0.5923,
"step": 590
},
{
"epoch": 1.452088452088452,
"grad_norm": 0.2882401227029393,
"learning_rate": 2.8688524590163935e-05,
"loss": 0.5092,
"step": 591
},
{
"epoch": 1.4545454545454546,
"grad_norm": 0.3127088054502666,
"learning_rate": 2.864298724954463e-05,
"loss": 0.526,
"step": 592
},
{
"epoch": 1.457002457002457,
"grad_norm": 2.4172038138032828,
"learning_rate": 2.8597449908925322e-05,
"loss": 0.7051,
"step": 593
},
{
"epoch": 1.4594594594594594,
"grad_norm": 0.5233594884346805,
"learning_rate": 2.8551912568306012e-05,
"loss": 0.4573,
"step": 594
},
{
"epoch": 1.461916461916462,
"grad_norm": 21.995326582795617,
"learning_rate": 2.8506375227686705e-05,
"loss": 0.6321,
"step": 595
},
{
"epoch": 1.4643734643734643,
"grad_norm": 0.5078686288976298,
"learning_rate": 2.84608378870674e-05,
"loss": 0.5941,
"step": 596
},
{
"epoch": 1.4668304668304668,
"grad_norm": 0.3609671816475955,
"learning_rate": 2.841530054644809e-05,
"loss": 0.5707,
"step": 597
},
{
"epoch": 1.4692874692874693,
"grad_norm": 0.35532192813895724,
"learning_rate": 2.8369763205828782e-05,
"loss": 0.4821,
"step": 598
},
{
"epoch": 1.4717444717444716,
"grad_norm": 0.3671957212508993,
"learning_rate": 2.8324225865209475e-05,
"loss": 0.4661,
"step": 599
},
{
"epoch": 1.4742014742014742,
"grad_norm": 0.30117085000522925,
"learning_rate": 2.8278688524590162e-05,
"loss": 0.4893,
"step": 600
},
{
"epoch": 1.4766584766584767,
"grad_norm": 0.33268501284939167,
"learning_rate": 2.823315118397086e-05,
"loss": 0.5176,
"step": 601
},
{
"epoch": 1.479115479115479,
"grad_norm": 0.33604265458925436,
"learning_rate": 2.8187613843351552e-05,
"loss": 0.5182,
"step": 602
},
{
"epoch": 1.4815724815724816,
"grad_norm": 0.3360889711504089,
"learning_rate": 2.814207650273224e-05,
"loss": 0.5907,
"step": 603
},
{
"epoch": 1.484029484029484,
"grad_norm": 0.328673675164007,
"learning_rate": 2.8096539162112932e-05,
"loss": 0.55,
"step": 604
},
{
"epoch": 1.4864864864864864,
"grad_norm": 0.31706495449515043,
"learning_rate": 2.805100182149363e-05,
"loss": 0.6074,
"step": 605
},
{
"epoch": 1.488943488943489,
"grad_norm": 0.2920463104930728,
"learning_rate": 2.8005464480874316e-05,
"loss": 0.5588,
"step": 606
},
{
"epoch": 1.4914004914004915,
"grad_norm": 0.2861747694993797,
"learning_rate": 2.795992714025501e-05,
"loss": 0.5292,
"step": 607
},
{
"epoch": 1.4938574938574938,
"grad_norm": 0.3285475879079594,
"learning_rate": 2.7914389799635702e-05,
"loss": 0.6151,
"step": 608
},
{
"epoch": 1.4963144963144963,
"grad_norm": 0.34034599761877166,
"learning_rate": 2.7868852459016392e-05,
"loss": 0.697,
"step": 609
},
{
"epoch": 1.4987714987714988,
"grad_norm": 0.2804576195171845,
"learning_rate": 2.7823315118397086e-05,
"loss": 0.5973,
"step": 610
},
{
"epoch": 1.5012285012285012,
"grad_norm": 0.33784205792768834,
"learning_rate": 2.777777777777778e-05,
"loss": 0.474,
"step": 611
},
{
"epoch": 1.5036855036855037,
"grad_norm": 0.31097306577779116,
"learning_rate": 2.773224043715847e-05,
"loss": 0.4641,
"step": 612
},
{
"epoch": 1.5061425061425062,
"grad_norm": 0.2844030812033379,
"learning_rate": 2.7686703096539162e-05,
"loss": 0.5817,
"step": 613
},
{
"epoch": 1.5085995085995085,
"grad_norm": 0.38968984446732813,
"learning_rate": 2.7641165755919856e-05,
"loss": 0.5086,
"step": 614
},
{
"epoch": 1.511056511056511,
"grad_norm": 0.31040975194056414,
"learning_rate": 2.7595628415300546e-05,
"loss": 0.4693,
"step": 615
},
{
"epoch": 1.5135135135135136,
"grad_norm": 0.33741077330159325,
"learning_rate": 2.755009107468124e-05,
"loss": 0.589,
"step": 616
},
{
"epoch": 1.515970515970516,
"grad_norm": 0.31661478005044347,
"learning_rate": 2.7504553734061933e-05,
"loss": 0.6049,
"step": 617
},
{
"epoch": 1.5184275184275184,
"grad_norm": 0.3335488405098975,
"learning_rate": 2.7459016393442626e-05,
"loss": 0.6017,
"step": 618
},
{
"epoch": 1.520884520884521,
"grad_norm": 0.31027476247695246,
"learning_rate": 2.7413479052823316e-05,
"loss": 0.5838,
"step": 619
},
{
"epoch": 1.5233415233415233,
"grad_norm": 0.2777646630361566,
"learning_rate": 2.736794171220401e-05,
"loss": 0.5584,
"step": 620
},
{
"epoch": 1.5257985257985258,
"grad_norm": 0.2913563020518906,
"learning_rate": 2.7322404371584703e-05,
"loss": 0.4762,
"step": 621
},
{
"epoch": 1.5282555282555284,
"grad_norm": 0.350171904455104,
"learning_rate": 2.7276867030965393e-05,
"loss": 0.6177,
"step": 622
},
{
"epoch": 1.5307125307125307,
"grad_norm": 0.30111848450509915,
"learning_rate": 2.7231329690346086e-05,
"loss": 0.5101,
"step": 623
},
{
"epoch": 1.5331695331695332,
"grad_norm": 0.2867090459147248,
"learning_rate": 2.718579234972678e-05,
"loss": 0.5821,
"step": 624
},
{
"epoch": 1.5356265356265357,
"grad_norm": 0.3638114975712457,
"learning_rate": 2.714025500910747e-05,
"loss": 0.6383,
"step": 625
},
{
"epoch": 1.538083538083538,
"grad_norm": 0.2867067374702508,
"learning_rate": 2.7094717668488163e-05,
"loss": 0.5452,
"step": 626
},
{
"epoch": 1.5405405405405406,
"grad_norm": 0.28737600107101,
"learning_rate": 2.7049180327868856e-05,
"loss": 0.5418,
"step": 627
},
{
"epoch": 1.542997542997543,
"grad_norm": 0.26498806931754665,
"learning_rate": 2.7003642987249543e-05,
"loss": 0.5039,
"step": 628
},
{
"epoch": 1.5454545454545454,
"grad_norm": 0.3011713624967737,
"learning_rate": 2.695810564663024e-05,
"loss": 0.53,
"step": 629
},
{
"epoch": 1.547911547911548,
"grad_norm": 0.2691530635935858,
"learning_rate": 2.6912568306010933e-05,
"loss": 0.4859,
"step": 630
},
{
"epoch": 1.5503685503685505,
"grad_norm": 0.2558261406391803,
"learning_rate": 2.686703096539162e-05,
"loss": 0.477,
"step": 631
},
{
"epoch": 1.5528255528255528,
"grad_norm": 0.25620551411091325,
"learning_rate": 2.6821493624772313e-05,
"loss": 0.4604,
"step": 632
},
{
"epoch": 1.5552825552825553,
"grad_norm": 0.2763114315788065,
"learning_rate": 2.677595628415301e-05,
"loss": 0.5792,
"step": 633
},
{
"epoch": 1.5577395577395579,
"grad_norm": 0.28083914382759145,
"learning_rate": 2.6730418943533697e-05,
"loss": 0.5356,
"step": 634
},
{
"epoch": 1.5601965601965602,
"grad_norm": 0.2920422213997252,
"learning_rate": 2.668488160291439e-05,
"loss": 0.5251,
"step": 635
},
{
"epoch": 1.5626535626535627,
"grad_norm": 0.27262649363198743,
"learning_rate": 2.6639344262295087e-05,
"loss": 0.5402,
"step": 636
},
{
"epoch": 1.5651105651105652,
"grad_norm": 0.2910361795236517,
"learning_rate": 2.6593806921675773e-05,
"loss": 0.5268,
"step": 637
},
{
"epoch": 1.5675675675675675,
"grad_norm": 0.26654925026502435,
"learning_rate": 2.6548269581056467e-05,
"loss": 0.5328,
"step": 638
},
{
"epoch": 1.57002457002457,
"grad_norm": 0.2973118734804211,
"learning_rate": 2.650273224043716e-05,
"loss": 0.5803,
"step": 639
},
{
"epoch": 1.5724815724815726,
"grad_norm": 0.2971915311670167,
"learning_rate": 2.645719489981785e-05,
"loss": 0.5564,
"step": 640
},
{
"epoch": 1.574938574938575,
"grad_norm": 0.2969196104157182,
"learning_rate": 2.6411657559198543e-05,
"loss": 0.5677,
"step": 641
},
{
"epoch": 1.5773955773955772,
"grad_norm": 0.273834529796921,
"learning_rate": 2.6366120218579237e-05,
"loss": 0.4952,
"step": 642
},
{
"epoch": 1.57985257985258,
"grad_norm": 0.2743084458426201,
"learning_rate": 2.6320582877959927e-05,
"loss": 0.5289,
"step": 643
},
{
"epoch": 1.5823095823095823,
"grad_norm": 0.27010983622024526,
"learning_rate": 2.627504553734062e-05,
"loss": 0.5218,
"step": 644
},
{
"epoch": 1.5847665847665846,
"grad_norm": 0.33171449854749435,
"learning_rate": 2.6229508196721314e-05,
"loss": 0.5671,
"step": 645
},
{
"epoch": 1.5872235872235874,
"grad_norm": 0.30504425469503404,
"learning_rate": 2.6183970856102004e-05,
"loss": 0.5877,
"step": 646
},
{
"epoch": 1.5896805896805897,
"grad_norm": 0.278481170782536,
"learning_rate": 2.6138433515482697e-05,
"loss": 0.6005,
"step": 647
},
{
"epoch": 1.592137592137592,
"grad_norm": 0.29696142460818625,
"learning_rate": 2.609289617486339e-05,
"loss": 0.5305,
"step": 648
},
{
"epoch": 1.5945945945945947,
"grad_norm": 0.33350125578968326,
"learning_rate": 2.604735883424408e-05,
"loss": 0.5594,
"step": 649
},
{
"epoch": 1.597051597051597,
"grad_norm": 0.2895010135396355,
"learning_rate": 2.6001821493624774e-05,
"loss": 0.5711,
"step": 650
},
{
"epoch": 1.5995085995085994,
"grad_norm": 0.3218565904038471,
"learning_rate": 2.5956284153005467e-05,
"loss": 0.5958,
"step": 651
},
{
"epoch": 1.6019656019656021,
"grad_norm": 0.333555069936009,
"learning_rate": 2.5910746812386154e-05,
"loss": 0.5207,
"step": 652
},
{
"epoch": 1.6044226044226044,
"grad_norm": 0.24599267706082115,
"learning_rate": 2.586520947176685e-05,
"loss": 0.5302,
"step": 653
},
{
"epoch": 1.6068796068796067,
"grad_norm": 0.30331457642746157,
"learning_rate": 2.5819672131147544e-05,
"loss": 0.5827,
"step": 654
},
{
"epoch": 1.6093366093366095,
"grad_norm": 0.30665436759125925,
"learning_rate": 2.5774134790528237e-05,
"loss": 0.5576,
"step": 655
},
{
"epoch": 1.6117936117936118,
"grad_norm": 0.34092613831052127,
"learning_rate": 2.5728597449908924e-05,
"loss": 0.5054,
"step": 656
},
{
"epoch": 1.6142506142506141,
"grad_norm": 0.28798725261513564,
"learning_rate": 2.568306010928962e-05,
"loss": 0.5379,
"step": 657
},
{
"epoch": 1.6167076167076169,
"grad_norm": 0.3089417920348662,
"learning_rate": 2.5637522768670314e-05,
"loss": 0.5599,
"step": 658
},
{
"epoch": 1.6191646191646192,
"grad_norm": 0.30820249889026247,
"learning_rate": 2.5591985428051e-05,
"loss": 0.4972,
"step": 659
},
{
"epoch": 1.6216216216216215,
"grad_norm": 0.27710041943109465,
"learning_rate": 2.5546448087431697e-05,
"loss": 0.5631,
"step": 660
},
{
"epoch": 1.6240786240786242,
"grad_norm": 0.30117632251651183,
"learning_rate": 2.550091074681239e-05,
"loss": 0.6309,
"step": 661
},
{
"epoch": 1.6265356265356266,
"grad_norm": 0.3148919358125854,
"learning_rate": 2.5455373406193077e-05,
"loss": 0.6265,
"step": 662
},
{
"epoch": 1.6289926289926289,
"grad_norm": 0.2912887580005306,
"learning_rate": 2.540983606557377e-05,
"loss": 0.5901,
"step": 663
},
{
"epoch": 1.6314496314496314,
"grad_norm": 0.24051170702858976,
"learning_rate": 2.5364298724954468e-05,
"loss": 0.4714,
"step": 664
},
{
"epoch": 1.633906633906634,
"grad_norm": 0.3126823231242451,
"learning_rate": 2.5318761384335154e-05,
"loss": 0.5539,
"step": 665
},
{
"epoch": 1.6363636363636362,
"grad_norm": 0.32930595380736144,
"learning_rate": 2.5273224043715848e-05,
"loss": 0.5895,
"step": 666
},
{
"epoch": 1.6388206388206388,
"grad_norm": 0.2932745657575283,
"learning_rate": 2.522768670309654e-05,
"loss": 0.5388,
"step": 667
},
{
"epoch": 1.6412776412776413,
"grad_norm": 0.2787597154650259,
"learning_rate": 2.518214936247723e-05,
"loss": 0.5179,
"step": 668
},
{
"epoch": 1.6437346437346436,
"grad_norm": 0.2937841728580581,
"learning_rate": 2.5136612021857924e-05,
"loss": 0.5083,
"step": 669
},
{
"epoch": 1.6461916461916462,
"grad_norm": 0.32130959163428363,
"learning_rate": 2.5091074681238618e-05,
"loss": 0.5693,
"step": 670
},
{
"epoch": 1.6486486486486487,
"grad_norm": 0.2991851088280298,
"learning_rate": 2.5045537340619308e-05,
"loss": 0.4979,
"step": 671
},
{
"epoch": 1.651105651105651,
"grad_norm": 0.2776565257867249,
"learning_rate": 2.5e-05,
"loss": 0.5318,
"step": 672
},
{
"epoch": 1.6535626535626535,
"grad_norm": 0.36657164420346156,
"learning_rate": 2.495446265938069e-05,
"loss": 0.5043,
"step": 673
},
{
"epoch": 1.656019656019656,
"grad_norm": 0.3215204376240886,
"learning_rate": 2.4908925318761388e-05,
"loss": 0.5854,
"step": 674
},
{
"epoch": 1.6584766584766584,
"grad_norm": 0.2950829336587517,
"learning_rate": 2.4863387978142078e-05,
"loss": 0.6343,
"step": 675
},
{
"epoch": 1.660933660933661,
"grad_norm": 0.3113697420564982,
"learning_rate": 2.4817850637522768e-05,
"loss": 0.5147,
"step": 676
},
{
"epoch": 1.6633906633906634,
"grad_norm": 0.31302535665995557,
"learning_rate": 2.477231329690346e-05,
"loss": 0.49,
"step": 677
},
{
"epoch": 1.6658476658476657,
"grad_norm": 0.34035592954838445,
"learning_rate": 2.4726775956284155e-05,
"loss": 0.5816,
"step": 678
},
{
"epoch": 1.6683046683046683,
"grad_norm": 0.3015369953798126,
"learning_rate": 2.4681238615664845e-05,
"loss": 0.5775,
"step": 679
},
{
"epoch": 1.6707616707616708,
"grad_norm": 0.31787296914393265,
"learning_rate": 2.4635701275045538e-05,
"loss": 0.5725,
"step": 680
},
{
"epoch": 1.6732186732186731,
"grad_norm": 0.3379584502023648,
"learning_rate": 2.459016393442623e-05,
"loss": 0.5642,
"step": 681
},
{
"epoch": 1.6756756756756757,
"grad_norm": 0.27840602624421484,
"learning_rate": 2.4544626593806925e-05,
"loss": 0.615,
"step": 682
},
{
"epoch": 1.6781326781326782,
"grad_norm": 0.28097823151212464,
"learning_rate": 2.4499089253187615e-05,
"loss": 0.547,
"step": 683
},
{
"epoch": 1.6805896805896805,
"grad_norm": 0.2616920934490369,
"learning_rate": 2.4453551912568305e-05,
"loss": 0.4777,
"step": 684
},
{
"epoch": 1.683046683046683,
"grad_norm": 0.2787698127004213,
"learning_rate": 2.4408014571949e-05,
"loss": 0.5141,
"step": 685
},
{
"epoch": 1.6855036855036856,
"grad_norm": 0.27981914048115714,
"learning_rate": 2.436247723132969e-05,
"loss": 0.5716,
"step": 686
},
{
"epoch": 1.6879606879606879,
"grad_norm": 4.088562590331179,
"learning_rate": 2.431693989071038e-05,
"loss": 0.6609,
"step": 687
},
{
"epoch": 1.6904176904176904,
"grad_norm": 0.2956395589638685,
"learning_rate": 2.427140255009108e-05,
"loss": 0.5403,
"step": 688
},
{
"epoch": 1.692874692874693,
"grad_norm": 0.29438181310455147,
"learning_rate": 2.422586520947177e-05,
"loss": 0.5685,
"step": 689
},
{
"epoch": 1.6953316953316953,
"grad_norm": 0.2538193516711629,
"learning_rate": 2.418032786885246e-05,
"loss": 0.4785,
"step": 690
},
{
"epoch": 1.6977886977886978,
"grad_norm": 0.28796731794186514,
"learning_rate": 2.4134790528233152e-05,
"loss": 0.552,
"step": 691
},
{
"epoch": 1.7002457002457003,
"grad_norm": 0.27868424952411996,
"learning_rate": 2.4089253187613845e-05,
"loss": 0.5076,
"step": 692
},
{
"epoch": 1.7027027027027026,
"grad_norm": 0.25990529519697514,
"learning_rate": 2.4043715846994535e-05,
"loss": 0.4533,
"step": 693
},
{
"epoch": 1.7051597051597052,
"grad_norm": 0.2873379074909231,
"learning_rate": 2.399817850637523e-05,
"loss": 0.5913,
"step": 694
},
{
"epoch": 1.7076167076167077,
"grad_norm": 0.29088350023709175,
"learning_rate": 2.3952641165755922e-05,
"loss": 0.5959,
"step": 695
},
{
"epoch": 1.71007371007371,
"grad_norm": 0.2573364666726,
"learning_rate": 2.3907103825136612e-05,
"loss": 0.5062,
"step": 696
},
{
"epoch": 1.7125307125307125,
"grad_norm": 0.29392565603255266,
"learning_rate": 2.3861566484517305e-05,
"loss": 0.5687,
"step": 697
},
{
"epoch": 1.714987714987715,
"grad_norm": 0.2891193206942597,
"learning_rate": 2.3816029143898e-05,
"loss": 0.6049,
"step": 698
},
{
"epoch": 1.7174447174447174,
"grad_norm": 0.2840198076767787,
"learning_rate": 2.377049180327869e-05,
"loss": 0.507,
"step": 699
},
{
"epoch": 1.71990171990172,
"grad_norm": 0.29919110648065483,
"learning_rate": 2.3724954462659382e-05,
"loss": 0.5467,
"step": 700
},
{
"epoch": 1.7223587223587224,
"grad_norm": 0.3058328148398321,
"learning_rate": 2.3679417122040072e-05,
"loss": 0.5812,
"step": 701
},
{
"epoch": 1.7248157248157248,
"grad_norm": 0.30016823104589047,
"learning_rate": 2.363387978142077e-05,
"loss": 0.5755,
"step": 702
},
{
"epoch": 1.7272727272727273,
"grad_norm": 0.3499121631564174,
"learning_rate": 2.358834244080146e-05,
"loss": 0.6278,
"step": 703
},
{
"epoch": 1.7297297297297298,
"grad_norm": 0.7047977484338855,
"learning_rate": 2.354280510018215e-05,
"loss": 0.4715,
"step": 704
},
{
"epoch": 1.7321867321867321,
"grad_norm": 0.3627891702679587,
"learning_rate": 2.3497267759562842e-05,
"loss": 0.5641,
"step": 705
},
{
"epoch": 1.7346437346437347,
"grad_norm": 0.3269601035291933,
"learning_rate": 2.3451730418943536e-05,
"loss": 0.5644,
"step": 706
},
{
"epoch": 1.7371007371007372,
"grad_norm": 0.3229301454671492,
"learning_rate": 2.3406193078324226e-05,
"loss": 0.5623,
"step": 707
},
{
"epoch": 1.7395577395577395,
"grad_norm": 0.2621133990792928,
"learning_rate": 2.336065573770492e-05,
"loss": 0.4495,
"step": 708
},
{
"epoch": 1.742014742014742,
"grad_norm": 0.3105707711781621,
"learning_rate": 2.3315118397085612e-05,
"loss": 0.6677,
"step": 709
},
{
"epoch": 1.7444717444717446,
"grad_norm": 0.31405789889240876,
"learning_rate": 2.3269581056466302e-05,
"loss": 0.6109,
"step": 710
},
{
"epoch": 1.746928746928747,
"grad_norm": 0.27862780228274875,
"learning_rate": 2.3224043715846996e-05,
"loss": 0.4707,
"step": 711
},
{
"epoch": 1.7493857493857494,
"grad_norm": 0.2951135319765008,
"learning_rate": 2.317850637522769e-05,
"loss": 0.5152,
"step": 712
},
{
"epoch": 1.751842751842752,
"grad_norm": 0.23868591982715384,
"learning_rate": 2.313296903460838e-05,
"loss": 0.4805,
"step": 713
},
{
"epoch": 1.7542997542997543,
"grad_norm": 0.2809998521081784,
"learning_rate": 2.3087431693989073e-05,
"loss": 0.6071,
"step": 714
},
{
"epoch": 1.7567567567567568,
"grad_norm": 0.3062020782313415,
"learning_rate": 2.3041894353369763e-05,
"loss": 0.5542,
"step": 715
},
{
"epoch": 1.7592137592137593,
"grad_norm": 0.25160094772031133,
"learning_rate": 2.2996357012750456e-05,
"loss": 0.5262,
"step": 716
},
{
"epoch": 1.7616707616707616,
"grad_norm": 0.2648834976152306,
"learning_rate": 2.295081967213115e-05,
"loss": 0.4888,
"step": 717
},
{
"epoch": 1.7641277641277642,
"grad_norm": 0.2692198246802372,
"learning_rate": 2.290528233151184e-05,
"loss": 0.5413,
"step": 718
},
{
"epoch": 1.7665847665847667,
"grad_norm": 0.302608144563453,
"learning_rate": 2.2859744990892533e-05,
"loss": 0.5485,
"step": 719
},
{
"epoch": 1.769041769041769,
"grad_norm": 0.3024049619053678,
"learning_rate": 2.2814207650273226e-05,
"loss": 0.5612,
"step": 720
},
{
"epoch": 1.7714987714987716,
"grad_norm": 0.2686755624508314,
"learning_rate": 2.2768670309653916e-05,
"loss": 0.5532,
"step": 721
},
{
"epoch": 1.773955773955774,
"grad_norm": 0.31355914454819966,
"learning_rate": 2.272313296903461e-05,
"loss": 0.545,
"step": 722
},
{
"epoch": 1.7764127764127764,
"grad_norm": 0.2679523528547601,
"learning_rate": 2.2677595628415303e-05,
"loss": 0.5404,
"step": 723
},
{
"epoch": 1.7788697788697787,
"grad_norm": 0.34380417593496515,
"learning_rate": 2.2632058287795993e-05,
"loss": 0.6352,
"step": 724
},
{
"epoch": 1.7813267813267815,
"grad_norm": 0.29712724540471824,
"learning_rate": 2.2586520947176686e-05,
"loss": 0.5769,
"step": 725
},
{
"epoch": 1.7837837837837838,
"grad_norm": 0.29704261087468237,
"learning_rate": 2.254098360655738e-05,
"loss": 0.5088,
"step": 726
},
{
"epoch": 1.786240786240786,
"grad_norm": 0.2823609172880149,
"learning_rate": 2.249544626593807e-05,
"loss": 0.5047,
"step": 727
},
{
"epoch": 1.7886977886977888,
"grad_norm": 0.5140965758355988,
"learning_rate": 2.2449908925318763e-05,
"loss": 0.4842,
"step": 728
},
{
"epoch": 1.7911547911547911,
"grad_norm": 1.7892960579058013,
"learning_rate": 2.2404371584699453e-05,
"loss": 0.5787,
"step": 729
},
{
"epoch": 1.7936117936117935,
"grad_norm": 0.31920880587878125,
"learning_rate": 2.2358834244080147e-05,
"loss": 0.5224,
"step": 730
},
{
"epoch": 1.7960687960687962,
"grad_norm": 0.2842312803943501,
"learning_rate": 2.231329690346084e-05,
"loss": 0.5373,
"step": 731
},
{
"epoch": 1.7985257985257985,
"grad_norm": 0.3016887670720209,
"learning_rate": 2.226775956284153e-05,
"loss": 0.542,
"step": 732
},
{
"epoch": 1.8009828009828008,
"grad_norm": 0.3107473379471071,
"learning_rate": 2.2222222222222223e-05,
"loss": 0.555,
"step": 733
},
{
"epoch": 1.8034398034398036,
"grad_norm": 0.2955503666728652,
"learning_rate": 2.2176684881602917e-05,
"loss": 0.5613,
"step": 734
},
{
"epoch": 1.805896805896806,
"grad_norm": 0.30524826919317594,
"learning_rate": 2.2131147540983607e-05,
"loss": 0.5297,
"step": 735
},
{
"epoch": 1.8083538083538082,
"grad_norm": 0.27729189858459274,
"learning_rate": 2.20856102003643e-05,
"loss": 0.6061,
"step": 736
},
{
"epoch": 1.810810810810811,
"grad_norm": 0.2893287066030788,
"learning_rate": 2.2040072859744993e-05,
"loss": 0.4896,
"step": 737
},
{
"epoch": 1.8132678132678133,
"grad_norm": 0.2606901787727459,
"learning_rate": 2.1994535519125683e-05,
"loss": 0.5773,
"step": 738
},
{
"epoch": 1.8157248157248156,
"grad_norm": 0.25527945735655144,
"learning_rate": 2.1948998178506377e-05,
"loss": 0.561,
"step": 739
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.2669859467920838,
"learning_rate": 2.190346083788707e-05,
"loss": 0.5482,
"step": 740
},
{
"epoch": 1.8206388206388207,
"grad_norm": 0.2693248782403161,
"learning_rate": 2.185792349726776e-05,
"loss": 0.5919,
"step": 741
},
{
"epoch": 1.823095823095823,
"grad_norm": 0.2554414448707284,
"learning_rate": 2.1812386156648454e-05,
"loss": 0.5184,
"step": 742
},
{
"epoch": 1.8255528255528255,
"grad_norm": 0.2945019560024116,
"learning_rate": 2.1766848816029144e-05,
"loss": 0.5633,
"step": 743
},
{
"epoch": 1.828009828009828,
"grad_norm": 0.30074361548984935,
"learning_rate": 2.1721311475409837e-05,
"loss": 0.6057,
"step": 744
},
{
"epoch": 1.8304668304668303,
"grad_norm": 0.31888019740726103,
"learning_rate": 2.167577413479053e-05,
"loss": 0.6217,
"step": 745
},
{
"epoch": 1.8329238329238329,
"grad_norm": 0.6953990362734012,
"learning_rate": 2.163023679417122e-05,
"loss": 0.5032,
"step": 746
},
{
"epoch": 1.8353808353808354,
"grad_norm": 0.2638969682962359,
"learning_rate": 2.1584699453551914e-05,
"loss": 0.5814,
"step": 747
},
{
"epoch": 1.8378378378378377,
"grad_norm": 0.2673502531002082,
"learning_rate": 2.1539162112932607e-05,
"loss": 0.5353,
"step": 748
},
{
"epoch": 1.8402948402948403,
"grad_norm": 0.3052402733466028,
"learning_rate": 2.1493624772313297e-05,
"loss": 0.6075,
"step": 749
},
{
"epoch": 1.8427518427518428,
"grad_norm": 0.2590866717496352,
"learning_rate": 2.144808743169399e-05,
"loss": 0.548,
"step": 750
},
{
"epoch": 1.845208845208845,
"grad_norm": 0.2800576089744623,
"learning_rate": 2.1402550091074684e-05,
"loss": 0.5195,
"step": 751
},
{
"epoch": 1.8476658476658476,
"grad_norm": 0.3032887173826026,
"learning_rate": 2.1357012750455374e-05,
"loss": 0.482,
"step": 752
},
{
"epoch": 1.8501228501228502,
"grad_norm": 0.2780282487888591,
"learning_rate": 2.1311475409836064e-05,
"loss": 0.5939,
"step": 753
},
{
"epoch": 1.8525798525798525,
"grad_norm": 0.28853976340491677,
"learning_rate": 2.126593806921676e-05,
"loss": 0.5743,
"step": 754
},
{
"epoch": 1.855036855036855,
"grad_norm": 0.3327786395512487,
"learning_rate": 2.122040072859745e-05,
"loss": 0.4186,
"step": 755
},
{
"epoch": 1.8574938574938575,
"grad_norm": 0.2819311307178514,
"learning_rate": 2.1174863387978144e-05,
"loss": 0.5976,
"step": 756
},
{
"epoch": 1.8599508599508598,
"grad_norm": 0.27219577380560167,
"learning_rate": 2.1129326047358834e-05,
"loss": 0.481,
"step": 757
},
{
"epoch": 1.8624078624078624,
"grad_norm": 0.2888069852070096,
"learning_rate": 2.1083788706739527e-05,
"loss": 0.5633,
"step": 758
},
{
"epoch": 1.864864864864865,
"grad_norm": 0.283281648136038,
"learning_rate": 2.103825136612022e-05,
"loss": 0.4768,
"step": 759
},
{
"epoch": 1.8673218673218672,
"grad_norm": 0.2506456753016858,
"learning_rate": 2.099271402550091e-05,
"loss": 0.4806,
"step": 760
},
{
"epoch": 1.8697788697788698,
"grad_norm": 0.259014545169757,
"learning_rate": 2.0947176684881604e-05,
"loss": 0.5095,
"step": 761
},
{
"epoch": 1.8722358722358723,
"grad_norm": 0.28690420297792946,
"learning_rate": 2.0901639344262298e-05,
"loss": 0.5739,
"step": 762
},
{
"epoch": 1.8746928746928746,
"grad_norm": 0.3067416454066446,
"learning_rate": 2.0856102003642988e-05,
"loss": 0.5557,
"step": 763
},
{
"epoch": 1.8771498771498771,
"grad_norm": 0.2557249566617281,
"learning_rate": 2.081056466302368e-05,
"loss": 0.5136,
"step": 764
},
{
"epoch": 1.8796068796068797,
"grad_norm": 0.27522324877940546,
"learning_rate": 2.0765027322404374e-05,
"loss": 0.582,
"step": 765
},
{
"epoch": 1.882063882063882,
"grad_norm": 0.27502624886911736,
"learning_rate": 2.0719489981785064e-05,
"loss": 0.5115,
"step": 766
},
{
"epoch": 1.8845208845208845,
"grad_norm": 0.28524062704028064,
"learning_rate": 2.0673952641165754e-05,
"loss": 0.604,
"step": 767
},
{
"epoch": 1.886977886977887,
"grad_norm": 0.2896306842805975,
"learning_rate": 2.062841530054645e-05,
"loss": 0.5909,
"step": 768
},
{
"epoch": 1.8894348894348894,
"grad_norm": 0.37017739402403504,
"learning_rate": 2.058287795992714e-05,
"loss": 0.6113,
"step": 769
},
{
"epoch": 1.8918918918918919,
"grad_norm": 0.25659739921327007,
"learning_rate": 2.053734061930783e-05,
"loss": 0.533,
"step": 770
},
{
"epoch": 1.8943488943488944,
"grad_norm": 0.2631965030301113,
"learning_rate": 2.0491803278688525e-05,
"loss": 0.4888,
"step": 771
},
{
"epoch": 1.8968058968058967,
"grad_norm": 0.26335776548343703,
"learning_rate": 2.0446265938069218e-05,
"loss": 0.5527,
"step": 772
},
{
"epoch": 1.8992628992628993,
"grad_norm": 0.2754894594590262,
"learning_rate": 2.040072859744991e-05,
"loss": 0.5974,
"step": 773
},
{
"epoch": 1.9017199017199018,
"grad_norm": 0.25145939998899,
"learning_rate": 2.03551912568306e-05,
"loss": 0.4814,
"step": 774
},
{
"epoch": 1.904176904176904,
"grad_norm": 0.24080950527586228,
"learning_rate": 2.0309653916211295e-05,
"loss": 0.485,
"step": 775
},
{
"epoch": 1.9066339066339066,
"grad_norm": 0.26980458286482356,
"learning_rate": 2.0264116575591988e-05,
"loss": 0.5519,
"step": 776
},
{
"epoch": 1.9090909090909092,
"grad_norm": 0.26842981889888856,
"learning_rate": 2.0218579234972678e-05,
"loss": 0.5098,
"step": 777
},
{
"epoch": 1.9115479115479115,
"grad_norm": 0.2524704758947662,
"learning_rate": 2.017304189435337e-05,
"loss": 0.4943,
"step": 778
},
{
"epoch": 1.914004914004914,
"grad_norm": 0.2856037686590571,
"learning_rate": 2.0127504553734065e-05,
"loss": 0.5953,
"step": 779
},
{
"epoch": 1.9164619164619165,
"grad_norm": 0.30220124538301824,
"learning_rate": 2.0081967213114755e-05,
"loss": 0.6225,
"step": 780
},
{
"epoch": 1.9189189189189189,
"grad_norm": 0.2680433180647516,
"learning_rate": 2.0036429872495445e-05,
"loss": 0.5303,
"step": 781
},
{
"epoch": 1.9213759213759214,
"grad_norm": 0.2658779045053125,
"learning_rate": 1.999089253187614e-05,
"loss": 0.5525,
"step": 782
},
{
"epoch": 1.923832923832924,
"grad_norm": 0.2962893495403355,
"learning_rate": 1.994535519125683e-05,
"loss": 0.5522,
"step": 783
},
{
"epoch": 1.9262899262899262,
"grad_norm": 0.5328833977576326,
"learning_rate": 1.989981785063752e-05,
"loss": 0.5646,
"step": 784
},
{
"epoch": 1.9287469287469288,
"grad_norm": 0.3653824185491529,
"learning_rate": 1.9854280510018215e-05,
"loss": 0.6461,
"step": 785
},
{
"epoch": 1.9312039312039313,
"grad_norm": 0.342083225179309,
"learning_rate": 1.980874316939891e-05,
"loss": 0.5784,
"step": 786
},
{
"epoch": 1.9336609336609336,
"grad_norm": 0.2879236786453721,
"learning_rate": 1.97632058287796e-05,
"loss": 0.4979,
"step": 787
},
{
"epoch": 1.9361179361179361,
"grad_norm": 0.2837537460473016,
"learning_rate": 1.9717668488160292e-05,
"loss": 0.5029,
"step": 788
},
{
"epoch": 1.9385749385749387,
"grad_norm": 0.32014780260432985,
"learning_rate": 1.9672131147540985e-05,
"loss": 0.5573,
"step": 789
},
{
"epoch": 1.941031941031941,
"grad_norm": 0.32817327775242966,
"learning_rate": 1.9626593806921675e-05,
"loss": 0.4987,
"step": 790
},
{
"epoch": 1.9434889434889435,
"grad_norm": 0.28163717402922606,
"learning_rate": 1.958105646630237e-05,
"loss": 0.4783,
"step": 791
},
{
"epoch": 1.945945945945946,
"grad_norm": 0.2861392501523952,
"learning_rate": 1.9535519125683062e-05,
"loss": 0.4826,
"step": 792
},
{
"epoch": 1.9484029484029484,
"grad_norm": 0.3363642864939868,
"learning_rate": 1.9489981785063755e-05,
"loss": 0.5295,
"step": 793
},
{
"epoch": 1.950859950859951,
"grad_norm": 0.297653344647604,
"learning_rate": 1.9444444444444445e-05,
"loss": 0.5454,
"step": 794
},
{
"epoch": 1.9533169533169534,
"grad_norm": 0.3270453521765983,
"learning_rate": 1.9398907103825135e-05,
"loss": 0.6322,
"step": 795
},
{
"epoch": 1.9557739557739557,
"grad_norm": 0.26046016384211484,
"learning_rate": 1.9353369763205832e-05,
"loss": 0.5203,
"step": 796
},
{
"epoch": 1.9582309582309583,
"grad_norm": 0.2770362852807057,
"learning_rate": 1.9307832422586522e-05,
"loss": 0.5273,
"step": 797
},
{
"epoch": 1.9606879606879608,
"grad_norm": 0.29331896952461095,
"learning_rate": 1.9262295081967212e-05,
"loss": 0.6334,
"step": 798
},
{
"epoch": 1.9631449631449631,
"grad_norm": 0.8983263494767803,
"learning_rate": 1.9216757741347906e-05,
"loss": 0.5353,
"step": 799
},
{
"epoch": 1.9656019656019657,
"grad_norm": 0.354421880818103,
"learning_rate": 1.91712204007286e-05,
"loss": 0.5813,
"step": 800
},
{
"epoch": 1.9680589680589682,
"grad_norm": 0.2927169517127379,
"learning_rate": 1.912568306010929e-05,
"loss": 0.579,
"step": 801
},
{
"epoch": 1.9705159705159705,
"grad_norm": 0.32721857789898107,
"learning_rate": 1.9080145719489982e-05,
"loss": 0.5296,
"step": 802
},
{
"epoch": 1.972972972972973,
"grad_norm": 0.3808551126968657,
"learning_rate": 1.9034608378870676e-05,
"loss": 0.568,
"step": 803
},
{
"epoch": 1.9754299754299756,
"grad_norm": 0.29090526952275514,
"learning_rate": 1.8989071038251366e-05,
"loss": 0.5479,
"step": 804
},
{
"epoch": 1.9778869778869779,
"grad_norm": 0.31614970674893583,
"learning_rate": 1.894353369763206e-05,
"loss": 0.5712,
"step": 805
},
{
"epoch": 1.9803439803439802,
"grad_norm": 0.32886598817833446,
"learning_rate": 1.8897996357012752e-05,
"loss": 0.5644,
"step": 806
},
{
"epoch": 1.982800982800983,
"grad_norm": 0.29937607015091,
"learning_rate": 1.8852459016393442e-05,
"loss": 0.5381,
"step": 807
},
{
"epoch": 1.9852579852579852,
"grad_norm": 0.30106855434356256,
"learning_rate": 1.8806921675774136e-05,
"loss": 0.5981,
"step": 808
},
{
"epoch": 1.9877149877149876,
"grad_norm": 0.2800181256297378,
"learning_rate": 1.8761384335154826e-05,
"loss": 0.551,
"step": 809
},
{
"epoch": 1.9901719901719903,
"grad_norm": 0.28586036377645657,
"learning_rate": 1.8715846994535523e-05,
"loss": 0.5275,
"step": 810
},
{
"epoch": 1.9926289926289926,
"grad_norm": 0.31385447188308085,
"learning_rate": 1.8670309653916213e-05,
"loss": 0.5601,
"step": 811
},
{
"epoch": 1.995085995085995,
"grad_norm": 0.28851307782462143,
"learning_rate": 1.8624772313296903e-05,
"loss": 0.5781,
"step": 812
},
{
"epoch": 1.9975429975429977,
"grad_norm": 0.34464670180519497,
"learning_rate": 1.85792349726776e-05,
"loss": 0.6451,
"step": 813
},
{
"epoch": 2.0,
"grad_norm": 0.29338564297775654,
"learning_rate": 1.853369763205829e-05,
"loss": 0.5088,
"step": 814
},
{
"epoch": 2.0024570024570023,
"grad_norm": 0.41071893724186426,
"learning_rate": 1.848816029143898e-05,
"loss": 0.4005,
"step": 815
},
{
"epoch": 2.004914004914005,
"grad_norm": 0.35729203255082986,
"learning_rate": 1.8442622950819673e-05,
"loss": 0.5375,
"step": 816
},
{
"epoch": 2.0073710073710074,
"grad_norm": 0.35116777825965795,
"learning_rate": 1.8397085610200366e-05,
"loss": 0.3852,
"step": 817
},
{
"epoch": 2.0098280098280097,
"grad_norm": 0.4283969986464076,
"learning_rate": 1.8351548269581056e-05,
"loss": 0.3566,
"step": 818
},
{
"epoch": 2.0122850122850124,
"grad_norm": 0.35665717517061507,
"learning_rate": 1.830601092896175e-05,
"loss": 0.4491,
"step": 819
},
{
"epoch": 2.0147420147420148,
"grad_norm": 0.29607963350631866,
"learning_rate": 1.8260473588342443e-05,
"loss": 0.4156,
"step": 820
},
{
"epoch": 2.017199017199017,
"grad_norm": 0.40471159128724016,
"learning_rate": 1.8214936247723133e-05,
"loss": 0.4252,
"step": 821
},
{
"epoch": 2.01965601965602,
"grad_norm": 0.4023037095439346,
"learning_rate": 1.8169398907103826e-05,
"loss": 0.4303,
"step": 822
},
{
"epoch": 2.022113022113022,
"grad_norm": 0.31491067990723465,
"learning_rate": 1.8123861566484516e-05,
"loss": 0.4405,
"step": 823
},
{
"epoch": 2.0245700245700244,
"grad_norm": 0.32098076190832764,
"learning_rate": 1.807832422586521e-05,
"loss": 0.4037,
"step": 824
},
{
"epoch": 2.027027027027027,
"grad_norm": 0.4117176838641387,
"learning_rate": 1.8032786885245903e-05,
"loss": 0.4702,
"step": 825
},
{
"epoch": 2.0294840294840295,
"grad_norm": 0.36001699667740716,
"learning_rate": 1.7987249544626593e-05,
"loss": 0.4989,
"step": 826
},
{
"epoch": 2.031941031941032,
"grad_norm": 0.3396068846506985,
"learning_rate": 1.7941712204007287e-05,
"loss": 0.4954,
"step": 827
},
{
"epoch": 2.0343980343980346,
"grad_norm": 0.32243687243046554,
"learning_rate": 1.789617486338798e-05,
"loss": 0.4664,
"step": 828
},
{
"epoch": 2.036855036855037,
"grad_norm": 0.27969879077206583,
"learning_rate": 1.785063752276867e-05,
"loss": 0.4429,
"step": 829
},
{
"epoch": 2.039312039312039,
"grad_norm": 0.31574022998246704,
"learning_rate": 1.7805100182149363e-05,
"loss": 0.4584,
"step": 830
},
{
"epoch": 2.041769041769042,
"grad_norm": 0.2688789472016761,
"learning_rate": 1.7759562841530057e-05,
"loss": 0.3876,
"step": 831
},
{
"epoch": 2.0442260442260443,
"grad_norm": 0.2784054655160525,
"learning_rate": 1.7714025500910747e-05,
"loss": 0.3927,
"step": 832
},
{
"epoch": 2.0466830466830466,
"grad_norm": 0.2673023986899889,
"learning_rate": 1.766848816029144e-05,
"loss": 0.386,
"step": 833
},
{
"epoch": 2.0491400491400493,
"grad_norm": 0.2800382150316435,
"learning_rate": 1.7622950819672133e-05,
"loss": 0.4102,
"step": 834
},
{
"epoch": 2.0515970515970516,
"grad_norm": 0.2893871141752102,
"learning_rate": 1.7577413479052823e-05,
"loss": 0.4367,
"step": 835
},
{
"epoch": 2.054054054054054,
"grad_norm": 0.32085451551691285,
"learning_rate": 1.7531876138433517e-05,
"loss": 0.438,
"step": 836
},
{
"epoch": 2.0565110565110567,
"grad_norm": 0.26054641681204144,
"learning_rate": 1.7486338797814207e-05,
"loss": 0.4182,
"step": 837
},
{
"epoch": 2.058968058968059,
"grad_norm": 0.2876218424667253,
"learning_rate": 1.74408014571949e-05,
"loss": 0.4671,
"step": 838
},
{
"epoch": 2.0614250614250613,
"grad_norm": 0.32477671982156153,
"learning_rate": 1.7395264116575594e-05,
"loss": 0.5294,
"step": 839
},
{
"epoch": 2.063882063882064,
"grad_norm": 0.25297866530499685,
"learning_rate": 1.7349726775956284e-05,
"loss": 0.3794,
"step": 840
},
{
"epoch": 2.0663390663390664,
"grad_norm": 0.3220090607744593,
"learning_rate": 1.7304189435336977e-05,
"loss": 0.4592,
"step": 841
},
{
"epoch": 2.0687960687960687,
"grad_norm": 0.27031704077453683,
"learning_rate": 1.725865209471767e-05,
"loss": 0.3944,
"step": 842
},
{
"epoch": 2.0712530712530715,
"grad_norm": 0.27581141711381324,
"learning_rate": 1.721311475409836e-05,
"loss": 0.4197,
"step": 843
},
{
"epoch": 2.0737100737100738,
"grad_norm": 0.272567864438195,
"learning_rate": 1.7167577413479054e-05,
"loss": 0.4428,
"step": 844
},
{
"epoch": 2.076167076167076,
"grad_norm": 0.2968432656835,
"learning_rate": 1.7122040072859747e-05,
"loss": 0.4456,
"step": 845
},
{
"epoch": 2.078624078624079,
"grad_norm": 0.25441437304353254,
"learning_rate": 1.7076502732240437e-05,
"loss": 0.4155,
"step": 846
},
{
"epoch": 2.081081081081081,
"grad_norm": 0.5659771360772702,
"learning_rate": 1.703096539162113e-05,
"loss": 0.4397,
"step": 847
},
{
"epoch": 2.0835380835380835,
"grad_norm": 0.2472720529621334,
"learning_rate": 1.6985428051001824e-05,
"loss": 0.3765,
"step": 848
},
{
"epoch": 2.085995085995086,
"grad_norm": 0.2859826191779256,
"learning_rate": 1.6939890710382514e-05,
"loss": 0.4637,
"step": 849
},
{
"epoch": 2.0884520884520885,
"grad_norm": 0.2785861022601944,
"learning_rate": 1.6894353369763207e-05,
"loss": 0.4135,
"step": 850
},
{
"epoch": 2.090909090909091,
"grad_norm": 0.25864220657304277,
"learning_rate": 1.68488160291439e-05,
"loss": 0.4325,
"step": 851
},
{
"epoch": 2.093366093366093,
"grad_norm": 0.31600684812535784,
"learning_rate": 1.680327868852459e-05,
"loss": 0.4794,
"step": 852
},
{
"epoch": 2.095823095823096,
"grad_norm": 0.22407053363373733,
"learning_rate": 1.6757741347905284e-05,
"loss": 0.3725,
"step": 853
},
{
"epoch": 2.098280098280098,
"grad_norm": 0.2784163632616066,
"learning_rate": 1.6712204007285974e-05,
"loss": 0.4596,
"step": 854
},
{
"epoch": 2.100737100737101,
"grad_norm": 0.2615510772868201,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.4238,
"step": 855
},
{
"epoch": 2.1031941031941033,
"grad_norm": 0.23026759434611474,
"learning_rate": 1.662112932604736e-05,
"loss": 0.3978,
"step": 856
},
{
"epoch": 2.1056511056511056,
"grad_norm": 0.2676230826969792,
"learning_rate": 1.657559198542805e-05,
"loss": 0.3808,
"step": 857
},
{
"epoch": 2.108108108108108,
"grad_norm": 0.28732333343272015,
"learning_rate": 1.6530054644808744e-05,
"loss": 0.422,
"step": 858
},
{
"epoch": 2.1105651105651106,
"grad_norm": 0.2275060472400037,
"learning_rate": 1.6484517304189438e-05,
"loss": 0.3937,
"step": 859
},
{
"epoch": 2.113022113022113,
"grad_norm": 0.26578354958753214,
"learning_rate": 1.6438979963570128e-05,
"loss": 0.4314,
"step": 860
},
{
"epoch": 2.1154791154791153,
"grad_norm": 0.2738579827950538,
"learning_rate": 1.6393442622950818e-05,
"loss": 0.4774,
"step": 861
},
{
"epoch": 2.117936117936118,
"grad_norm": 0.26017321347892347,
"learning_rate": 1.6347905282331514e-05,
"loss": 0.4136,
"step": 862
},
{
"epoch": 2.1203931203931203,
"grad_norm": 0.25094754715043494,
"learning_rate": 1.6302367941712204e-05,
"loss": 0.3995,
"step": 863
},
{
"epoch": 2.1228501228501226,
"grad_norm": 0.24234839713703257,
"learning_rate": 1.6256830601092894e-05,
"loss": 0.4119,
"step": 864
},
{
"epoch": 2.1253071253071254,
"grad_norm": 0.25163386424602613,
"learning_rate": 1.621129326047359e-05,
"loss": 0.3784,
"step": 865
},
{
"epoch": 2.1277641277641277,
"grad_norm": 0.27936129219089983,
"learning_rate": 1.616575591985428e-05,
"loss": 0.4346,
"step": 866
},
{
"epoch": 2.13022113022113,
"grad_norm": 0.25895393324808647,
"learning_rate": 1.6120218579234975e-05,
"loss": 0.4425,
"step": 867
},
{
"epoch": 2.1326781326781328,
"grad_norm": 0.23027249451614298,
"learning_rate": 1.6074681238615665e-05,
"loss": 0.3955,
"step": 868
},
{
"epoch": 2.135135135135135,
"grad_norm": 0.24009714066156393,
"learning_rate": 1.6029143897996358e-05,
"loss": 0.4008,
"step": 869
},
{
"epoch": 2.1375921375921374,
"grad_norm": 0.2608536899762057,
"learning_rate": 1.598360655737705e-05,
"loss": 0.4489,
"step": 870
},
{
"epoch": 2.14004914004914,
"grad_norm": 0.27104838457856334,
"learning_rate": 1.593806921675774e-05,
"loss": 0.4567,
"step": 871
},
{
"epoch": 2.1425061425061425,
"grad_norm": 0.23710136887205738,
"learning_rate": 1.5892531876138435e-05,
"loss": 0.4177,
"step": 872
},
{
"epoch": 2.1449631449631448,
"grad_norm": 0.2614832850724873,
"learning_rate": 1.5846994535519128e-05,
"loss": 0.4159,
"step": 873
},
{
"epoch": 2.1474201474201475,
"grad_norm": 0.2493244212239117,
"learning_rate": 1.5801457194899818e-05,
"loss": 0.398,
"step": 874
},
{
"epoch": 2.14987714987715,
"grad_norm": 0.2336464998631902,
"learning_rate": 1.5755919854280508e-05,
"loss": 0.4269,
"step": 875
},
{
"epoch": 2.152334152334152,
"grad_norm": 0.2500529632050007,
"learning_rate": 1.5710382513661205e-05,
"loss": 0.4597,
"step": 876
},
{
"epoch": 2.154791154791155,
"grad_norm": 0.237091380401223,
"learning_rate": 1.5664845173041895e-05,
"loss": 0.3458,
"step": 877
},
{
"epoch": 2.157248157248157,
"grad_norm": 0.2654558367364787,
"learning_rate": 1.5619307832422585e-05,
"loss": 0.4016,
"step": 878
},
{
"epoch": 2.1597051597051595,
"grad_norm": 0.2378495764493538,
"learning_rate": 1.557377049180328e-05,
"loss": 0.3668,
"step": 879
},
{
"epoch": 2.1621621621621623,
"grad_norm": 0.24037789383707953,
"learning_rate": 1.552823315118397e-05,
"loss": 0.4122,
"step": 880
},
{
"epoch": 2.1646191646191646,
"grad_norm": 0.2774310296361704,
"learning_rate": 1.548269581056466e-05,
"loss": 0.4947,
"step": 881
},
{
"epoch": 2.167076167076167,
"grad_norm": 0.24152670773336085,
"learning_rate": 1.5437158469945355e-05,
"loss": 0.3944,
"step": 882
},
{
"epoch": 2.1695331695331697,
"grad_norm": 0.2598802823682941,
"learning_rate": 1.539162112932605e-05,
"loss": 0.452,
"step": 883
},
{
"epoch": 2.171990171990172,
"grad_norm": 4.841124303991015,
"learning_rate": 1.5346083788706742e-05,
"loss": 0.4445,
"step": 884
},
{
"epoch": 2.1744471744471743,
"grad_norm": 0.9349040267922862,
"learning_rate": 1.5300546448087432e-05,
"loss": 0.4991,
"step": 885
},
{
"epoch": 2.176904176904177,
"grad_norm": 0.27687057595557907,
"learning_rate": 1.5255009107468124e-05,
"loss": 0.4169,
"step": 886
},
{
"epoch": 2.1793611793611793,
"grad_norm": 0.29678484834818836,
"learning_rate": 1.5209471766848819e-05,
"loss": 0.4114,
"step": 887
},
{
"epoch": 2.1818181818181817,
"grad_norm": 0.2530513739903439,
"learning_rate": 1.5163934426229509e-05,
"loss": 0.4305,
"step": 888
},
{
"epoch": 2.1842751842751844,
"grad_norm": 0.2882228696525778,
"learning_rate": 1.51183970856102e-05,
"loss": 0.498,
"step": 889
},
{
"epoch": 2.1867321867321867,
"grad_norm": 0.2888696434121559,
"learning_rate": 1.5072859744990894e-05,
"loss": 0.4209,
"step": 890
},
{
"epoch": 2.189189189189189,
"grad_norm": 0.26095945742134186,
"learning_rate": 1.5027322404371585e-05,
"loss": 0.4539,
"step": 891
},
{
"epoch": 2.191646191646192,
"grad_norm": 0.25838164536911407,
"learning_rate": 1.4981785063752277e-05,
"loss": 0.4293,
"step": 892
},
{
"epoch": 2.194103194103194,
"grad_norm": 0.28809740633168196,
"learning_rate": 1.493624772313297e-05,
"loss": 0.4173,
"step": 893
},
{
"epoch": 2.1965601965601964,
"grad_norm": 0.2800744500661992,
"learning_rate": 1.4890710382513662e-05,
"loss": 0.4365,
"step": 894
},
{
"epoch": 2.199017199017199,
"grad_norm": 0.2502242180844428,
"learning_rate": 1.4845173041894352e-05,
"loss": 0.4286,
"step": 895
},
{
"epoch": 2.2014742014742015,
"grad_norm": 0.27065496051514343,
"learning_rate": 1.4799635701275047e-05,
"loss": 0.4674,
"step": 896
},
{
"epoch": 2.203931203931204,
"grad_norm": 0.2683214829245222,
"learning_rate": 1.4754098360655739e-05,
"loss": 0.4248,
"step": 897
},
{
"epoch": 2.2063882063882065,
"grad_norm": 0.2504102146869169,
"learning_rate": 1.4708561020036429e-05,
"loss": 0.4289,
"step": 898
},
{
"epoch": 2.208845208845209,
"grad_norm": 0.2577283264465404,
"learning_rate": 1.4663023679417124e-05,
"loss": 0.4386,
"step": 899
},
{
"epoch": 2.211302211302211,
"grad_norm": 0.26573181605705387,
"learning_rate": 1.4617486338797814e-05,
"loss": 0.4475,
"step": 900
},
{
"epoch": 2.213759213759214,
"grad_norm": 0.24365626007806804,
"learning_rate": 1.4571948998178509e-05,
"loss": 0.4331,
"step": 901
},
{
"epoch": 2.2162162162162162,
"grad_norm": 0.28062941136254116,
"learning_rate": 1.4526411657559199e-05,
"loss": 0.4328,
"step": 902
},
{
"epoch": 2.2186732186732185,
"grad_norm": 2.972065788354103,
"learning_rate": 1.448087431693989e-05,
"loss": 0.4546,
"step": 903
},
{
"epoch": 2.2211302211302213,
"grad_norm": 0.2534606607226359,
"learning_rate": 1.4435336976320584e-05,
"loss": 0.3829,
"step": 904
},
{
"epoch": 2.2235872235872236,
"grad_norm": 0.274929708261248,
"learning_rate": 1.4389799635701276e-05,
"loss": 0.4797,
"step": 905
},
{
"epoch": 2.226044226044226,
"grad_norm": 0.23105916503325502,
"learning_rate": 1.4344262295081968e-05,
"loss": 0.3747,
"step": 906
},
{
"epoch": 2.2285012285012287,
"grad_norm": 0.2868497407842456,
"learning_rate": 1.4298724954462661e-05,
"loss": 0.4016,
"step": 907
},
{
"epoch": 2.230958230958231,
"grad_norm": 0.27039212267154017,
"learning_rate": 1.4253187613843353e-05,
"loss": 0.4429,
"step": 908
},
{
"epoch": 2.2334152334152333,
"grad_norm": 0.2626645153376362,
"learning_rate": 1.4207650273224044e-05,
"loss": 0.3925,
"step": 909
},
{
"epoch": 2.235872235872236,
"grad_norm": 0.2857586938595,
"learning_rate": 1.4162112932604738e-05,
"loss": 0.4041,
"step": 910
},
{
"epoch": 2.2383292383292384,
"grad_norm": 0.24210750827605218,
"learning_rate": 1.411657559198543e-05,
"loss": 0.3824,
"step": 911
},
{
"epoch": 2.2407862407862407,
"grad_norm": 0.23567329374252635,
"learning_rate": 1.407103825136612e-05,
"loss": 0.4054,
"step": 912
},
{
"epoch": 2.2432432432432434,
"grad_norm": 0.2608042036778111,
"learning_rate": 1.4025500910746814e-05,
"loss": 0.4663,
"step": 913
},
{
"epoch": 2.2457002457002457,
"grad_norm": 0.2516718644133891,
"learning_rate": 1.3979963570127504e-05,
"loss": 0.3965,
"step": 914
},
{
"epoch": 2.248157248157248,
"grad_norm": 0.2547812213730915,
"learning_rate": 1.3934426229508196e-05,
"loss": 0.4178,
"step": 915
},
{
"epoch": 2.250614250614251,
"grad_norm": 0.24505968428973618,
"learning_rate": 1.388888888888889e-05,
"loss": 0.377,
"step": 916
},
{
"epoch": 2.253071253071253,
"grad_norm": 0.2726142469594438,
"learning_rate": 1.3843351548269581e-05,
"loss": 0.5143,
"step": 917
},
{
"epoch": 2.2555282555282554,
"grad_norm": 0.2349026070277474,
"learning_rate": 1.3797814207650273e-05,
"loss": 0.3882,
"step": 918
},
{
"epoch": 2.257985257985258,
"grad_norm": 0.22293153420044365,
"learning_rate": 1.3752276867030966e-05,
"loss": 0.3853,
"step": 919
},
{
"epoch": 2.2604422604422605,
"grad_norm": 0.25743572034407713,
"learning_rate": 1.3706739526411658e-05,
"loss": 0.4365,
"step": 920
},
{
"epoch": 2.262899262899263,
"grad_norm": 0.2579052038508534,
"learning_rate": 1.3661202185792351e-05,
"loss": 0.4398,
"step": 921
},
{
"epoch": 2.2653562653562656,
"grad_norm": 0.2440626223986384,
"learning_rate": 1.3615664845173043e-05,
"loss": 0.4438,
"step": 922
},
{
"epoch": 2.267813267813268,
"grad_norm": 0.24281445788603354,
"learning_rate": 1.3570127504553735e-05,
"loss": 0.4001,
"step": 923
},
{
"epoch": 2.27027027027027,
"grad_norm": 0.2552660754956044,
"learning_rate": 1.3524590163934428e-05,
"loss": 0.4111,
"step": 924
},
{
"epoch": 2.2727272727272725,
"grad_norm": 0.2457448775461868,
"learning_rate": 1.347905282331512e-05,
"loss": 0.4873,
"step": 925
},
{
"epoch": 2.2751842751842752,
"grad_norm": 0.24684307678902845,
"learning_rate": 1.343351548269581e-05,
"loss": 0.4106,
"step": 926
},
{
"epoch": 2.2776412776412776,
"grad_norm": 0.26175150698727817,
"learning_rate": 1.3387978142076505e-05,
"loss": 0.3655,
"step": 927
},
{
"epoch": 2.2800982800982803,
"grad_norm": 0.22280106083223208,
"learning_rate": 1.3342440801457195e-05,
"loss": 0.386,
"step": 928
},
{
"epoch": 2.2825552825552826,
"grad_norm": 0.2615527014422742,
"learning_rate": 1.3296903460837887e-05,
"loss": 0.4426,
"step": 929
},
{
"epoch": 2.285012285012285,
"grad_norm": 0.27081169659666804,
"learning_rate": 1.325136612021858e-05,
"loss": 0.4449,
"step": 930
},
{
"epoch": 2.2874692874692872,
"grad_norm": 0.2664738790934516,
"learning_rate": 1.3205828779599272e-05,
"loss": 0.3878,
"step": 931
},
{
"epoch": 2.28992628992629,
"grad_norm": 0.25492227378588317,
"learning_rate": 1.3160291438979963e-05,
"loss": 0.4008,
"step": 932
},
{
"epoch": 2.2923832923832923,
"grad_norm": 0.26063202213635495,
"learning_rate": 1.3114754098360657e-05,
"loss": 0.4289,
"step": 933
},
{
"epoch": 2.294840294840295,
"grad_norm": 0.2910730570058918,
"learning_rate": 1.3069216757741349e-05,
"loss": 0.5056,
"step": 934
},
{
"epoch": 2.2972972972972974,
"grad_norm": 0.25949619296926774,
"learning_rate": 1.302367941712204e-05,
"loss": 0.4352,
"step": 935
},
{
"epoch": 2.2997542997542997,
"grad_norm": 0.27651669705889614,
"learning_rate": 1.2978142076502734e-05,
"loss": 0.4198,
"step": 936
},
{
"epoch": 2.302211302211302,
"grad_norm": 0.24886536308925195,
"learning_rate": 1.2932604735883425e-05,
"loss": 0.4334,
"step": 937
},
{
"epoch": 2.3046683046683047,
"grad_norm": 0.2354118767882423,
"learning_rate": 1.2887067395264119e-05,
"loss": 0.4045,
"step": 938
},
{
"epoch": 2.307125307125307,
"grad_norm": 0.25840133517937125,
"learning_rate": 1.284153005464481e-05,
"loss": 0.4153,
"step": 939
},
{
"epoch": 2.30958230958231,
"grad_norm": 0.24820967640391792,
"learning_rate": 1.27959927140255e-05,
"loss": 0.4295,
"step": 940
},
{
"epoch": 2.312039312039312,
"grad_norm": 0.2682235670401214,
"learning_rate": 1.2750455373406195e-05,
"loss": 0.3975,
"step": 941
},
{
"epoch": 2.3144963144963144,
"grad_norm": 0.24936889462306489,
"learning_rate": 1.2704918032786885e-05,
"loss": 0.4019,
"step": 942
},
{
"epoch": 2.3169533169533167,
"grad_norm": 0.25313590598342434,
"learning_rate": 1.2659380692167577e-05,
"loss": 0.4121,
"step": 943
},
{
"epoch": 2.3194103194103195,
"grad_norm": 0.2661342959579272,
"learning_rate": 1.261384335154827e-05,
"loss": 0.4755,
"step": 944
},
{
"epoch": 2.321867321867322,
"grad_norm": 0.24872736039438376,
"learning_rate": 1.2568306010928962e-05,
"loss": 0.4223,
"step": 945
},
{
"epoch": 2.3243243243243246,
"grad_norm": 0.2687245129503849,
"learning_rate": 1.2522768670309654e-05,
"loss": 0.4267,
"step": 946
},
{
"epoch": 2.326781326781327,
"grad_norm": 0.22779727771546104,
"learning_rate": 1.2477231329690346e-05,
"loss": 0.3892,
"step": 947
},
{
"epoch": 2.329238329238329,
"grad_norm": 0.24571522522484693,
"learning_rate": 1.2431693989071039e-05,
"loss": 0.4346,
"step": 948
},
{
"epoch": 2.3316953316953315,
"grad_norm": 0.25561261326404666,
"learning_rate": 1.238615664845173e-05,
"loss": 0.4389,
"step": 949
},
{
"epoch": 2.3341523341523343,
"grad_norm": 0.22939877730223857,
"learning_rate": 1.2340619307832422e-05,
"loss": 0.4432,
"step": 950
},
{
"epoch": 2.3366093366093366,
"grad_norm": 0.2357383557445305,
"learning_rate": 1.2295081967213116e-05,
"loss": 0.4173,
"step": 951
},
{
"epoch": 2.339066339066339,
"grad_norm": 0.7364831086181507,
"learning_rate": 1.2249544626593807e-05,
"loss": 0.4907,
"step": 952
},
{
"epoch": 2.3415233415233416,
"grad_norm": 0.44572692986041285,
"learning_rate": 1.22040072859745e-05,
"loss": 0.4205,
"step": 953
},
{
"epoch": 2.343980343980344,
"grad_norm": 3.567246948098372,
"learning_rate": 1.215846994535519e-05,
"loss": 0.4788,
"step": 954
},
{
"epoch": 2.3464373464373462,
"grad_norm": 0.30309787203713356,
"learning_rate": 1.2112932604735884e-05,
"loss": 0.4275,
"step": 955
},
{
"epoch": 2.348894348894349,
"grad_norm": 0.24996522048036657,
"learning_rate": 1.2067395264116576e-05,
"loss": 0.4782,
"step": 956
},
{
"epoch": 2.3513513513513513,
"grad_norm": 0.24778131630911673,
"learning_rate": 1.2021857923497268e-05,
"loss": 0.4645,
"step": 957
},
{
"epoch": 2.3538083538083536,
"grad_norm": 0.29025219100431315,
"learning_rate": 1.1976320582877961e-05,
"loss": 0.4398,
"step": 958
},
{
"epoch": 2.3562653562653564,
"grad_norm": 0.27470165797569124,
"learning_rate": 1.1930783242258653e-05,
"loss": 0.4196,
"step": 959
},
{
"epoch": 2.3587223587223587,
"grad_norm": 0.24980686026607077,
"learning_rate": 1.1885245901639344e-05,
"loss": 0.4048,
"step": 960
},
{
"epoch": 2.361179361179361,
"grad_norm": 0.23877293904046448,
"learning_rate": 1.1839708561020036e-05,
"loss": 0.4014,
"step": 961
},
{
"epoch": 2.3636363636363638,
"grad_norm": 0.2727423118228381,
"learning_rate": 1.179417122040073e-05,
"loss": 0.4454,
"step": 962
},
{
"epoch": 2.366093366093366,
"grad_norm": 1.7198763345457913,
"learning_rate": 1.1748633879781421e-05,
"loss": 0.4657,
"step": 963
},
{
"epoch": 2.3685503685503684,
"grad_norm": 0.22671067308180004,
"learning_rate": 1.1703096539162113e-05,
"loss": 0.3986,
"step": 964
},
{
"epoch": 2.371007371007371,
"grad_norm": 0.26385523396246985,
"learning_rate": 1.1657559198542806e-05,
"loss": 0.4469,
"step": 965
},
{
"epoch": 2.3734643734643734,
"grad_norm": 0.5084136843839772,
"learning_rate": 1.1612021857923498e-05,
"loss": 0.3711,
"step": 966
},
{
"epoch": 2.3759213759213758,
"grad_norm": 0.27655806457414583,
"learning_rate": 1.156648451730419e-05,
"loss": 0.4551,
"step": 967
},
{
"epoch": 2.3783783783783785,
"grad_norm": 0.2410876717300484,
"learning_rate": 1.1520947176684881e-05,
"loss": 0.4496,
"step": 968
},
{
"epoch": 2.380835380835381,
"grad_norm": 0.24309059276058065,
"learning_rate": 1.1475409836065575e-05,
"loss": 0.3511,
"step": 969
},
{
"epoch": 2.383292383292383,
"grad_norm": 0.30117433755614303,
"learning_rate": 1.1429872495446266e-05,
"loss": 0.4416,
"step": 970
},
{
"epoch": 2.385749385749386,
"grad_norm": 0.25582765879217223,
"learning_rate": 1.1384335154826958e-05,
"loss": 0.4337,
"step": 971
},
{
"epoch": 2.388206388206388,
"grad_norm": 0.22539920788225698,
"learning_rate": 1.1338797814207651e-05,
"loss": 0.3714,
"step": 972
},
{
"epoch": 2.3906633906633905,
"grad_norm": 0.29931134171407514,
"learning_rate": 1.1293260473588343e-05,
"loss": 0.5106,
"step": 973
},
{
"epoch": 2.3931203931203933,
"grad_norm": 0.24423619232997276,
"learning_rate": 1.1247723132969035e-05,
"loss": 0.4172,
"step": 974
},
{
"epoch": 2.3955773955773956,
"grad_norm": 0.24339190265600077,
"learning_rate": 1.1202185792349727e-05,
"loss": 0.4833,
"step": 975
},
{
"epoch": 2.398034398034398,
"grad_norm": 0.24812658878123112,
"learning_rate": 1.115664845173042e-05,
"loss": 0.4091,
"step": 976
},
{
"epoch": 2.4004914004914006,
"grad_norm": 0.28759159231823356,
"learning_rate": 1.1111111111111112e-05,
"loss": 0.4221,
"step": 977
},
{
"epoch": 2.402948402948403,
"grad_norm": 0.23293079513866843,
"learning_rate": 1.1065573770491803e-05,
"loss": 0.4131,
"step": 978
},
{
"epoch": 2.4054054054054053,
"grad_norm": 7.749543886893285,
"learning_rate": 1.1020036429872497e-05,
"loss": 0.7646,
"step": 979
},
{
"epoch": 2.407862407862408,
"grad_norm": 0.23764368732650262,
"learning_rate": 1.0974499089253188e-05,
"loss": 0.4017,
"step": 980
},
{
"epoch": 2.4103194103194103,
"grad_norm": 0.2849297299623391,
"learning_rate": 1.092896174863388e-05,
"loss": 0.3994,
"step": 981
},
{
"epoch": 2.4127764127764126,
"grad_norm": 0.25594559931712746,
"learning_rate": 1.0883424408014572e-05,
"loss": 0.3598,
"step": 982
},
{
"epoch": 2.4152334152334154,
"grad_norm": 0.24347214875726386,
"learning_rate": 1.0837887067395265e-05,
"loss": 0.4782,
"step": 983
},
{
"epoch": 2.4176904176904177,
"grad_norm": 0.2667475881035064,
"learning_rate": 1.0792349726775957e-05,
"loss": 0.478,
"step": 984
},
{
"epoch": 2.42014742014742,
"grad_norm": 0.25874841731246545,
"learning_rate": 1.0746812386156649e-05,
"loss": 0.4026,
"step": 985
},
{
"epoch": 2.4226044226044228,
"grad_norm": 0.25268556183613805,
"learning_rate": 1.0701275045537342e-05,
"loss": 0.4032,
"step": 986
},
{
"epoch": 2.425061425061425,
"grad_norm": 0.2395640061506075,
"learning_rate": 1.0655737704918032e-05,
"loss": 0.4052,
"step": 987
},
{
"epoch": 2.4275184275184274,
"grad_norm": 0.2750057820112185,
"learning_rate": 1.0610200364298725e-05,
"loss": 0.5181,
"step": 988
},
{
"epoch": 2.42997542997543,
"grad_norm": 0.23956660835432816,
"learning_rate": 1.0564663023679417e-05,
"loss": 0.4087,
"step": 989
},
{
"epoch": 2.4324324324324325,
"grad_norm": 0.2682821537572547,
"learning_rate": 1.051912568306011e-05,
"loss": 0.3539,
"step": 990
},
{
"epoch": 2.4348894348894348,
"grad_norm": 0.253635569161534,
"learning_rate": 1.0473588342440802e-05,
"loss": 0.4304,
"step": 991
},
{
"epoch": 2.4373464373464375,
"grad_norm": 0.25725469397538964,
"learning_rate": 1.0428051001821494e-05,
"loss": 0.4665,
"step": 992
},
{
"epoch": 2.43980343980344,
"grad_norm": 0.21869469055564097,
"learning_rate": 1.0382513661202187e-05,
"loss": 0.3391,
"step": 993
},
{
"epoch": 2.442260442260442,
"grad_norm": 0.23932164929486857,
"learning_rate": 1.0336976320582877e-05,
"loss": 0.4214,
"step": 994
},
{
"epoch": 2.444717444717445,
"grad_norm": 0.249887483386558,
"learning_rate": 1.029143897996357e-05,
"loss": 0.4299,
"step": 995
},
{
"epoch": 2.447174447174447,
"grad_norm": 0.2448604037219602,
"learning_rate": 1.0245901639344262e-05,
"loss": 0.3619,
"step": 996
},
{
"epoch": 2.4496314496314495,
"grad_norm": 0.23238074536535608,
"learning_rate": 1.0200364298724956e-05,
"loss": 0.4014,
"step": 997
},
{
"epoch": 2.4520884520884523,
"grad_norm": 0.6771008652415798,
"learning_rate": 1.0154826958105647e-05,
"loss": 0.4356,
"step": 998
},
{
"epoch": 2.4545454545454546,
"grad_norm": 0.2590635077299185,
"learning_rate": 1.0109289617486339e-05,
"loss": 0.4019,
"step": 999
},
{
"epoch": 2.457002457002457,
"grad_norm": 0.2546310175980953,
"learning_rate": 1.0063752276867032e-05,
"loss": 0.4573,
"step": 1000
},
{
"epoch": 2.4594594594594597,
"grad_norm": 0.22827326397492897,
"learning_rate": 1.0018214936247722e-05,
"loss": 0.3811,
"step": 1001
},
{
"epoch": 2.461916461916462,
"grad_norm": 0.2433313320364577,
"learning_rate": 9.972677595628416e-06,
"loss": 0.429,
"step": 1002
},
{
"epoch": 2.4643734643734643,
"grad_norm": 0.26498928945986583,
"learning_rate": 9.927140255009108e-06,
"loss": 0.4492,
"step": 1003
},
{
"epoch": 2.4668304668304666,
"grad_norm": 0.2375299896617999,
"learning_rate": 9.8816029143898e-06,
"loss": 0.4213,
"step": 1004
},
{
"epoch": 2.4692874692874693,
"grad_norm": 0.2395572196206859,
"learning_rate": 9.836065573770493e-06,
"loss": 0.4685,
"step": 1005
},
{
"epoch": 2.4717444717444716,
"grad_norm": 0.2537090566068248,
"learning_rate": 9.790528233151184e-06,
"loss": 0.4412,
"step": 1006
},
{
"epoch": 2.4742014742014744,
"grad_norm": 0.25208141661171674,
"learning_rate": 9.744990892531878e-06,
"loss": 0.3854,
"step": 1007
},
{
"epoch": 2.4766584766584767,
"grad_norm": 0.2642583302690173,
"learning_rate": 9.699453551912568e-06,
"loss": 0.4179,
"step": 1008
},
{
"epoch": 2.479115479115479,
"grad_norm": 0.23767421601073707,
"learning_rate": 9.653916211293261e-06,
"loss": 0.4385,
"step": 1009
},
{
"epoch": 2.4815724815724813,
"grad_norm": 0.24744117798052945,
"learning_rate": 9.608378870673953e-06,
"loss": 0.4231,
"step": 1010
},
{
"epoch": 2.484029484029484,
"grad_norm": 0.23898600337582362,
"learning_rate": 9.562841530054644e-06,
"loss": 0.4217,
"step": 1011
},
{
"epoch": 2.4864864864864864,
"grad_norm": 0.25968529590110145,
"learning_rate": 9.517304189435338e-06,
"loss": 0.4322,
"step": 1012
},
{
"epoch": 2.488943488943489,
"grad_norm": 0.2552860811626078,
"learning_rate": 9.47176684881603e-06,
"loss": 0.4059,
"step": 1013
},
{
"epoch": 2.4914004914004915,
"grad_norm": 0.2535129474406969,
"learning_rate": 9.426229508196721e-06,
"loss": 0.4199,
"step": 1014
},
{
"epoch": 2.493857493857494,
"grad_norm": 0.23391837558421832,
"learning_rate": 9.380692167577413e-06,
"loss": 0.4875,
"step": 1015
},
{
"epoch": 2.496314496314496,
"grad_norm": 0.24298822208561335,
"learning_rate": 9.335154826958106e-06,
"loss": 0.4601,
"step": 1016
},
{
"epoch": 2.498771498771499,
"grad_norm": 0.9278988295393633,
"learning_rate": 9.2896174863388e-06,
"loss": 0.4622,
"step": 1017
},
{
"epoch": 2.501228501228501,
"grad_norm": 0.2392256552257423,
"learning_rate": 9.24408014571949e-06,
"loss": 0.4355,
"step": 1018
},
{
"epoch": 2.503685503685504,
"grad_norm": 0.2330893968890112,
"learning_rate": 9.198542805100183e-06,
"loss": 0.3879,
"step": 1019
},
{
"epoch": 2.506142506142506,
"grad_norm": 0.25975387161037466,
"learning_rate": 9.153005464480875e-06,
"loss": 0.5102,
"step": 1020
},
{
"epoch": 2.5085995085995085,
"grad_norm": 0.23725040470847344,
"learning_rate": 9.107468123861566e-06,
"loss": 0.3736,
"step": 1021
},
{
"epoch": 2.511056511056511,
"grad_norm": 0.2313618576568214,
"learning_rate": 9.061930783242258e-06,
"loss": 0.4254,
"step": 1022
},
{
"epoch": 2.5135135135135136,
"grad_norm": 0.24699951638595727,
"learning_rate": 9.016393442622952e-06,
"loss": 0.4583,
"step": 1023
},
{
"epoch": 2.515970515970516,
"grad_norm": 0.25544750210043377,
"learning_rate": 8.970856102003643e-06,
"loss": 0.3913,
"step": 1024
},
{
"epoch": 2.5184275184275187,
"grad_norm": 0.255332626493528,
"learning_rate": 8.925318761384335e-06,
"loss": 0.443,
"step": 1025
},
{
"epoch": 2.520884520884521,
"grad_norm": 0.24789597999286547,
"learning_rate": 8.879781420765028e-06,
"loss": 0.3952,
"step": 1026
},
{
"epoch": 2.5233415233415233,
"grad_norm": 0.23241695995795617,
"learning_rate": 8.83424408014572e-06,
"loss": 0.3795,
"step": 1027
},
{
"epoch": 2.5257985257985256,
"grad_norm": 0.22579417060539125,
"learning_rate": 8.788706739526412e-06,
"loss": 0.3723,
"step": 1028
},
{
"epoch": 2.5282555282555284,
"grad_norm": 0.22283238385668375,
"learning_rate": 8.743169398907103e-06,
"loss": 0.391,
"step": 1029
},
{
"epoch": 2.5307125307125307,
"grad_norm": 0.2246271370216966,
"learning_rate": 8.697632058287797e-06,
"loss": 0.3482,
"step": 1030
},
{
"epoch": 2.5331695331695334,
"grad_norm": 0.24663076613217097,
"learning_rate": 8.652094717668488e-06,
"loss": 0.4507,
"step": 1031
},
{
"epoch": 2.5356265356265357,
"grad_norm": 0.26050722671516013,
"learning_rate": 8.60655737704918e-06,
"loss": 0.3843,
"step": 1032
},
{
"epoch": 2.538083538083538,
"grad_norm": 0.2331322639700471,
"learning_rate": 8.561020036429874e-06,
"loss": 0.3914,
"step": 1033
},
{
"epoch": 2.5405405405405403,
"grad_norm": 0.24551537000672188,
"learning_rate": 8.515482695810565e-06,
"loss": 0.4184,
"step": 1034
},
{
"epoch": 2.542997542997543,
"grad_norm": 0.24176204248205907,
"learning_rate": 8.469945355191257e-06,
"loss": 0.3952,
"step": 1035
},
{
"epoch": 2.5454545454545454,
"grad_norm": 0.23506445053772151,
"learning_rate": 8.42440801457195e-06,
"loss": 0.4188,
"step": 1036
},
{
"epoch": 2.547911547911548,
"grad_norm": 0.23936543990096357,
"learning_rate": 8.378870673952642e-06,
"loss": 0.4283,
"step": 1037
},
{
"epoch": 2.5503685503685505,
"grad_norm": 0.2291008508057063,
"learning_rate": 8.333333333333334e-06,
"loss": 0.3352,
"step": 1038
},
{
"epoch": 2.552825552825553,
"grad_norm": 0.24326786660676292,
"learning_rate": 8.287795992714025e-06,
"loss": 0.447,
"step": 1039
},
{
"epoch": 2.555282555282555,
"grad_norm": 0.2275126614890594,
"learning_rate": 8.242258652094719e-06,
"loss": 0.4059,
"step": 1040
},
{
"epoch": 2.557739557739558,
"grad_norm": 0.23651265321324066,
"learning_rate": 8.196721311475409e-06,
"loss": 0.4521,
"step": 1041
},
{
"epoch": 2.56019656019656,
"grad_norm": 0.26505811816438285,
"learning_rate": 8.151183970856102e-06,
"loss": 0.4815,
"step": 1042
},
{
"epoch": 2.562653562653563,
"grad_norm": 0.2568061760157405,
"learning_rate": 8.105646630236796e-06,
"loss": 0.4375,
"step": 1043
},
{
"epoch": 2.5651105651105652,
"grad_norm": 0.22708969537757387,
"learning_rate": 8.060109289617487e-06,
"loss": 0.4164,
"step": 1044
},
{
"epoch": 2.5675675675675675,
"grad_norm": 0.22323849393344242,
"learning_rate": 8.014571948998179e-06,
"loss": 0.4076,
"step": 1045
},
{
"epoch": 2.57002457002457,
"grad_norm": 0.25111824108906006,
"learning_rate": 7.96903460837887e-06,
"loss": 0.4767,
"step": 1046
},
{
"epoch": 2.5724815724815726,
"grad_norm": 0.260989307316713,
"learning_rate": 7.923497267759564e-06,
"loss": 0.433,
"step": 1047
},
{
"epoch": 2.574938574938575,
"grad_norm": 0.26052793667495083,
"learning_rate": 7.877959927140254e-06,
"loss": 0.5054,
"step": 1048
},
{
"epoch": 2.5773955773955772,
"grad_norm": 0.24915581597371805,
"learning_rate": 7.832422586520947e-06,
"loss": 0.4814,
"step": 1049
},
{
"epoch": 2.57985257985258,
"grad_norm": 0.2543219048137912,
"learning_rate": 7.78688524590164e-06,
"loss": 0.4697,
"step": 1050
},
{
"epoch": 2.5823095823095823,
"grad_norm": 0.24030777009135726,
"learning_rate": 7.74134790528233e-06,
"loss": 0.4742,
"step": 1051
},
{
"epoch": 2.5847665847665846,
"grad_norm": 0.2231426133500323,
"learning_rate": 7.695810564663024e-06,
"loss": 0.3994,
"step": 1052
},
{
"epoch": 2.5872235872235874,
"grad_norm": 0.22308991337870562,
"learning_rate": 7.650273224043716e-06,
"loss": 0.3573,
"step": 1053
},
{
"epoch": 2.5896805896805897,
"grad_norm": 0.23216474618491506,
"learning_rate": 7.604735883424409e-06,
"loss": 0.4204,
"step": 1054
},
{
"epoch": 2.592137592137592,
"grad_norm": 0.2673680272419057,
"learning_rate": 7.5591985428051e-06,
"loss": 0.4019,
"step": 1055
},
{
"epoch": 2.5945945945945947,
"grad_norm": 0.2427622295285736,
"learning_rate": 7.513661202185793e-06,
"loss": 0.4387,
"step": 1056
},
{
"epoch": 2.597051597051597,
"grad_norm": 0.22860426639394132,
"learning_rate": 7.468123861566485e-06,
"loss": 0.3722,
"step": 1057
},
{
"epoch": 2.5995085995085994,
"grad_norm": 0.22665737637936162,
"learning_rate": 7.422586520947176e-06,
"loss": 0.3897,
"step": 1058
},
{
"epoch": 2.601965601965602,
"grad_norm": 0.23303711982308306,
"learning_rate": 7.3770491803278695e-06,
"loss": 0.4285,
"step": 1059
},
{
"epoch": 2.6044226044226044,
"grad_norm": 0.26128393033135633,
"learning_rate": 7.331511839708562e-06,
"loss": 0.4375,
"step": 1060
},
{
"epoch": 2.6068796068796067,
"grad_norm": 0.25239192353421025,
"learning_rate": 7.2859744990892545e-06,
"loss": 0.4289,
"step": 1061
},
{
"epoch": 2.6093366093366095,
"grad_norm": 0.2528160456529098,
"learning_rate": 7.240437158469945e-06,
"loss": 0.4413,
"step": 1062
},
{
"epoch": 2.611793611793612,
"grad_norm": 0.23646099966620737,
"learning_rate": 7.194899817850638e-06,
"loss": 0.47,
"step": 1063
},
{
"epoch": 2.614250614250614,
"grad_norm": 0.24423364418249202,
"learning_rate": 7.1493624772313305e-06,
"loss": 0.3873,
"step": 1064
},
{
"epoch": 2.616707616707617,
"grad_norm": 0.39790290853859617,
"learning_rate": 7.103825136612022e-06,
"loss": 0.4088,
"step": 1065
},
{
"epoch": 2.619164619164619,
"grad_norm": 0.24950104367468617,
"learning_rate": 7.058287795992715e-06,
"loss": 0.4047,
"step": 1066
},
{
"epoch": 2.6216216216216215,
"grad_norm": 0.2130145601569131,
"learning_rate": 7.012750455373407e-06,
"loss": 0.3621,
"step": 1067
},
{
"epoch": 2.6240786240786242,
"grad_norm": 0.24418610790883985,
"learning_rate": 6.967213114754098e-06,
"loss": 0.3692,
"step": 1068
},
{
"epoch": 2.6265356265356266,
"grad_norm": 0.24162683820897035,
"learning_rate": 6.921675774134791e-06,
"loss": 0.4028,
"step": 1069
},
{
"epoch": 2.628992628992629,
"grad_norm": 0.23928139013451663,
"learning_rate": 6.876138433515483e-06,
"loss": 0.3879,
"step": 1070
},
{
"epoch": 2.631449631449631,
"grad_norm": 0.23045710828373894,
"learning_rate": 6.830601092896176e-06,
"loss": 0.4,
"step": 1071
},
{
"epoch": 2.633906633906634,
"grad_norm": 0.2697298886110676,
"learning_rate": 6.785063752276867e-06,
"loss": 0.4548,
"step": 1072
},
{
"epoch": 2.6363636363636362,
"grad_norm": 0.24532148513902288,
"learning_rate": 6.73952641165756e-06,
"loss": 0.3913,
"step": 1073
},
{
"epoch": 2.638820638820639,
"grad_norm": 0.2337130540549015,
"learning_rate": 6.6939890710382525e-06,
"loss": 0.4059,
"step": 1074
},
{
"epoch": 2.6412776412776413,
"grad_norm": 0.2528405150657174,
"learning_rate": 6.648451730418943e-06,
"loss": 0.3687,
"step": 1075
},
{
"epoch": 2.6437346437346436,
"grad_norm": 0.23392129641022022,
"learning_rate": 6.602914389799636e-06,
"loss": 0.4117,
"step": 1076
},
{
"epoch": 2.646191646191646,
"grad_norm": 0.2462017004070107,
"learning_rate": 6.557377049180328e-06,
"loss": 0.4551,
"step": 1077
},
{
"epoch": 2.6486486486486487,
"grad_norm": 0.23207850038278316,
"learning_rate": 6.51183970856102e-06,
"loss": 0.4533,
"step": 1078
},
{
"epoch": 2.651105651105651,
"grad_norm": 0.22221002852774863,
"learning_rate": 6.466302367941713e-06,
"loss": 0.3899,
"step": 1079
},
{
"epoch": 2.6535626535626538,
"grad_norm": 0.22345355290268304,
"learning_rate": 6.420765027322405e-06,
"loss": 0.4201,
"step": 1080
},
{
"epoch": 2.656019656019656,
"grad_norm": 0.24769684733774203,
"learning_rate": 6.375227686703098e-06,
"loss": 0.4161,
"step": 1081
},
{
"epoch": 2.6584766584766584,
"grad_norm": 0.2407677406182194,
"learning_rate": 6.3296903460837886e-06,
"loss": 0.4225,
"step": 1082
},
{
"epoch": 2.6609336609336607,
"grad_norm": 0.2688663002431461,
"learning_rate": 6.284153005464481e-06,
"loss": 0.4576,
"step": 1083
},
{
"epoch": 2.6633906633906634,
"grad_norm": 0.21352788135666395,
"learning_rate": 6.238615664845173e-06,
"loss": 0.3745,
"step": 1084
},
{
"epoch": 2.6658476658476657,
"grad_norm": 0.2343987025317479,
"learning_rate": 6.193078324225865e-06,
"loss": 0.437,
"step": 1085
},
{
"epoch": 2.6683046683046685,
"grad_norm": 0.23634741722118774,
"learning_rate": 6.147540983606558e-06,
"loss": 0.4755,
"step": 1086
},
{
"epoch": 2.670761670761671,
"grad_norm": 0.2333977046249411,
"learning_rate": 6.10200364298725e-06,
"loss": 0.4226,
"step": 1087
},
{
"epoch": 2.673218673218673,
"grad_norm": 0.24140034792380946,
"learning_rate": 6.056466302367942e-06,
"loss": 0.4381,
"step": 1088
},
{
"epoch": 2.6756756756756754,
"grad_norm": 0.24610102385252078,
"learning_rate": 6.010928961748634e-06,
"loss": 0.4788,
"step": 1089
},
{
"epoch": 2.678132678132678,
"grad_norm": 0.21651298490313028,
"learning_rate": 5.965391621129326e-06,
"loss": 0.379,
"step": 1090
},
{
"epoch": 2.6805896805896805,
"grad_norm": 0.24002364672689916,
"learning_rate": 5.919854280510018e-06,
"loss": 0.4639,
"step": 1091
},
{
"epoch": 2.6830466830466833,
"grad_norm": 0.2401559730905027,
"learning_rate": 5.874316939890711e-06,
"loss": 0.4222,
"step": 1092
},
{
"epoch": 2.6855036855036856,
"grad_norm": 0.2296738324030033,
"learning_rate": 5.828779599271403e-06,
"loss": 0.4562,
"step": 1093
},
{
"epoch": 2.687960687960688,
"grad_norm": 0.24627104393871396,
"learning_rate": 5.783242258652095e-06,
"loss": 0.443,
"step": 1094
},
{
"epoch": 2.69041769041769,
"grad_norm": 0.23495018354076735,
"learning_rate": 5.737704918032787e-06,
"loss": 0.4446,
"step": 1095
},
{
"epoch": 2.692874692874693,
"grad_norm": 0.2515349565358722,
"learning_rate": 5.692167577413479e-06,
"loss": 0.4446,
"step": 1096
},
{
"epoch": 2.6953316953316953,
"grad_norm": 0.23661569080996545,
"learning_rate": 5.646630236794172e-06,
"loss": 0.4218,
"step": 1097
},
{
"epoch": 2.697788697788698,
"grad_norm": 0.25053626642778104,
"learning_rate": 5.601092896174863e-06,
"loss": 0.4072,
"step": 1098
},
{
"epoch": 2.7002457002457003,
"grad_norm": 0.23587139645082844,
"learning_rate": 5.555555555555556e-06,
"loss": 0.4173,
"step": 1099
},
{
"epoch": 2.7027027027027026,
"grad_norm": 0.2273539299051293,
"learning_rate": 5.510018214936248e-06,
"loss": 0.378,
"step": 1100
},
{
"epoch": 2.705159705159705,
"grad_norm": 0.24970174549990606,
"learning_rate": 5.46448087431694e-06,
"loss": 0.4712,
"step": 1101
},
{
"epoch": 2.7076167076167077,
"grad_norm": 0.22200832971393533,
"learning_rate": 5.418943533697633e-06,
"loss": 0.4103,
"step": 1102
},
{
"epoch": 2.71007371007371,
"grad_norm": 0.2269123817589866,
"learning_rate": 5.373406193078324e-06,
"loss": 0.4032,
"step": 1103
},
{
"epoch": 2.7125307125307128,
"grad_norm": 0.25690335857720653,
"learning_rate": 5.327868852459016e-06,
"loss": 0.4299,
"step": 1104
},
{
"epoch": 2.714987714987715,
"grad_norm": 0.24544690097434016,
"learning_rate": 5.2823315118397085e-06,
"loss": 0.4116,
"step": 1105
},
{
"epoch": 2.7174447174447174,
"grad_norm": 0.2319686270048615,
"learning_rate": 5.236794171220401e-06,
"loss": 0.3995,
"step": 1106
},
{
"epoch": 2.7199017199017197,
"grad_norm": 0.23710945966143346,
"learning_rate": 5.191256830601094e-06,
"loss": 0.4631,
"step": 1107
},
{
"epoch": 2.7223587223587224,
"grad_norm": 0.2259046385681155,
"learning_rate": 5.145719489981785e-06,
"loss": 0.4243,
"step": 1108
},
{
"epoch": 2.7248157248157248,
"grad_norm": 0.2257040623568255,
"learning_rate": 5.100182149362478e-06,
"loss": 0.4126,
"step": 1109
},
{
"epoch": 2.7272727272727275,
"grad_norm": 0.31168912185145126,
"learning_rate": 5.0546448087431695e-06,
"loss": 0.4209,
"step": 1110
},
{
"epoch": 2.72972972972973,
"grad_norm": 0.22549576040927052,
"learning_rate": 5.009107468123861e-06,
"loss": 0.3731,
"step": 1111
},
{
"epoch": 2.732186732186732,
"grad_norm": 0.24143398730766474,
"learning_rate": 4.963570127504554e-06,
"loss": 0.3649,
"step": 1112
},
{
"epoch": 2.7346437346437344,
"grad_norm": 0.22716863260793815,
"learning_rate": 4.918032786885246e-06,
"loss": 0.4307,
"step": 1113
},
{
"epoch": 2.737100737100737,
"grad_norm": 0.22591051220728886,
"learning_rate": 4.872495446265939e-06,
"loss": 0.3708,
"step": 1114
},
{
"epoch": 2.7395577395577395,
"grad_norm": 0.24622339640633006,
"learning_rate": 4.8269581056466305e-06,
"loss": 0.4399,
"step": 1115
},
{
"epoch": 2.7420147420147423,
"grad_norm": 0.2822872905637631,
"learning_rate": 4.781420765027322e-06,
"loss": 0.4346,
"step": 1116
},
{
"epoch": 2.7444717444717446,
"grad_norm": 0.22801344793802245,
"learning_rate": 4.735883424408015e-06,
"loss": 0.3956,
"step": 1117
},
{
"epoch": 2.746928746928747,
"grad_norm": 0.22155930655884537,
"learning_rate": 4.6903460837887065e-06,
"loss": 0.4215,
"step": 1118
},
{
"epoch": 2.749385749385749,
"grad_norm": 0.25029715906542904,
"learning_rate": 4.6448087431694e-06,
"loss": 0.488,
"step": 1119
},
{
"epoch": 2.751842751842752,
"grad_norm": 0.24274141839591315,
"learning_rate": 4.5992714025500915e-06,
"loss": 0.4195,
"step": 1120
},
{
"epoch": 2.7542997542997543,
"grad_norm": 0.23028544702814976,
"learning_rate": 4.553734061930783e-06,
"loss": 0.3788,
"step": 1121
},
{
"epoch": 2.756756756756757,
"grad_norm": 0.24916297952456937,
"learning_rate": 4.508196721311476e-06,
"loss": 0.4244,
"step": 1122
},
{
"epoch": 2.7592137592137593,
"grad_norm": 0.5209218145104284,
"learning_rate": 4.4626593806921675e-06,
"loss": 0.4771,
"step": 1123
},
{
"epoch": 2.7616707616707616,
"grad_norm": 0.2149644103008232,
"learning_rate": 4.41712204007286e-06,
"loss": 0.38,
"step": 1124
},
{
"epoch": 2.764127764127764,
"grad_norm": 3.4124629297354665,
"learning_rate": 4.371584699453552e-06,
"loss": 0.5629,
"step": 1125
},
{
"epoch": 2.7665847665847667,
"grad_norm": 0.2530450553088245,
"learning_rate": 4.326047358834244e-06,
"loss": 0.458,
"step": 1126
},
{
"epoch": 2.769041769041769,
"grad_norm": 0.24743507038028553,
"learning_rate": 4.280510018214937e-06,
"loss": 0.4901,
"step": 1127
},
{
"epoch": 2.7714987714987718,
"grad_norm": 0.24238514475994,
"learning_rate": 4.2349726775956285e-06,
"loss": 0.4958,
"step": 1128
},
{
"epoch": 2.773955773955774,
"grad_norm": 0.22170693726990862,
"learning_rate": 4.189435336976321e-06,
"loss": 0.3619,
"step": 1129
},
{
"epoch": 2.7764127764127764,
"grad_norm": 0.22200899039595443,
"learning_rate": 4.143897996357013e-06,
"loss": 0.3722,
"step": 1130
},
{
"epoch": 2.7788697788697787,
"grad_norm": 0.21557815228436708,
"learning_rate": 4.098360655737704e-06,
"loss": 0.4045,
"step": 1131
},
{
"epoch": 2.7813267813267815,
"grad_norm": 0.23101269211409964,
"learning_rate": 4.052823315118398e-06,
"loss": 0.3993,
"step": 1132
},
{
"epoch": 2.7837837837837838,
"grad_norm": 0.2506113219628401,
"learning_rate": 4.0072859744990895e-06,
"loss": 0.4394,
"step": 1133
},
{
"epoch": 2.786240786240786,
"grad_norm": 0.25829653944630426,
"learning_rate": 3.961748633879782e-06,
"loss": 0.4781,
"step": 1134
},
{
"epoch": 2.788697788697789,
"grad_norm": 0.2283437552109554,
"learning_rate": 3.916211293260474e-06,
"loss": 0.4199,
"step": 1135
},
{
"epoch": 2.791154791154791,
"grad_norm": 0.23261934599464665,
"learning_rate": 3.870673952641165e-06,
"loss": 0.4376,
"step": 1136
},
{
"epoch": 2.7936117936117935,
"grad_norm": 0.22397178290472075,
"learning_rate": 3.825136612021858e-06,
"loss": 0.3982,
"step": 1137
},
{
"epoch": 2.796068796068796,
"grad_norm": 0.2398679252310125,
"learning_rate": 3.77959927140255e-06,
"loss": 0.4303,
"step": 1138
},
{
"epoch": 2.7985257985257985,
"grad_norm": 0.2521861139284355,
"learning_rate": 3.7340619307832426e-06,
"loss": 0.3795,
"step": 1139
},
{
"epoch": 2.800982800982801,
"grad_norm": 0.22383712405363193,
"learning_rate": 3.6885245901639347e-06,
"loss": 0.4348,
"step": 1140
},
{
"epoch": 2.8034398034398036,
"grad_norm": 0.22946747083819977,
"learning_rate": 3.6429872495446273e-06,
"loss": 0.4206,
"step": 1141
},
{
"epoch": 2.805896805896806,
"grad_norm": 0.21677708187128097,
"learning_rate": 3.597449908925319e-06,
"loss": 0.4147,
"step": 1142
},
{
"epoch": 2.808353808353808,
"grad_norm": 0.21734633915684778,
"learning_rate": 3.551912568306011e-06,
"loss": 0.4093,
"step": 1143
},
{
"epoch": 2.810810810810811,
"grad_norm": 0.24321384485885375,
"learning_rate": 3.5063752276867036e-06,
"loss": 0.3783,
"step": 1144
},
{
"epoch": 2.8132678132678133,
"grad_norm": 0.24297335222493194,
"learning_rate": 3.4608378870673953e-06,
"loss": 0.5014,
"step": 1145
},
{
"epoch": 2.8157248157248156,
"grad_norm": 0.23892842770463382,
"learning_rate": 3.415300546448088e-06,
"loss": 0.4653,
"step": 1146
},
{
"epoch": 2.8181818181818183,
"grad_norm": 0.23160806679913898,
"learning_rate": 3.36976320582878e-06,
"loss": 0.422,
"step": 1147
},
{
"epoch": 2.8206388206388207,
"grad_norm": 0.22304524872709652,
"learning_rate": 3.3242258652094717e-06,
"loss": 0.3961,
"step": 1148
},
{
"epoch": 2.823095823095823,
"grad_norm": 0.2139600997939772,
"learning_rate": 3.278688524590164e-06,
"loss": 0.398,
"step": 1149
},
{
"epoch": 2.8255528255528253,
"grad_norm": 0.22956564153660272,
"learning_rate": 3.2331511839708563e-06,
"loss": 0.4151,
"step": 1150
},
{
"epoch": 2.828009828009828,
"grad_norm": 0.21826847683156242,
"learning_rate": 3.187613843351549e-06,
"loss": 0.3886,
"step": 1151
},
{
"epoch": 2.8304668304668303,
"grad_norm": 0.21141003742321426,
"learning_rate": 3.1420765027322406e-06,
"loss": 0.3906,
"step": 1152
},
{
"epoch": 2.832923832923833,
"grad_norm": 0.2357603696949525,
"learning_rate": 3.0965391621129327e-06,
"loss": 0.451,
"step": 1153
},
{
"epoch": 2.8353808353808354,
"grad_norm": 0.225812066393447,
"learning_rate": 3.051001821493625e-06,
"loss": 0.4064,
"step": 1154
},
{
"epoch": 2.8378378378378377,
"grad_norm": 0.21406448287602542,
"learning_rate": 3.005464480874317e-06,
"loss": 0.3674,
"step": 1155
},
{
"epoch": 2.84029484029484,
"grad_norm": 0.23290227939520636,
"learning_rate": 2.959927140255009e-06,
"loss": 0.4354,
"step": 1156
},
{
"epoch": 2.842751842751843,
"grad_norm": 0.2242096154461336,
"learning_rate": 2.9143897996357016e-06,
"loss": 0.4045,
"step": 1157
},
{
"epoch": 2.845208845208845,
"grad_norm": 0.23016845169620148,
"learning_rate": 2.8688524590163937e-06,
"loss": 0.4196,
"step": 1158
},
{
"epoch": 2.847665847665848,
"grad_norm": 0.21852096936661997,
"learning_rate": 2.823315118397086e-06,
"loss": 0.3982,
"step": 1159
},
{
"epoch": 2.85012285012285,
"grad_norm": 0.2108274165792627,
"learning_rate": 2.777777777777778e-06,
"loss": 0.3749,
"step": 1160
},
{
"epoch": 2.8525798525798525,
"grad_norm": 0.24160370753779067,
"learning_rate": 2.73224043715847e-06,
"loss": 0.4542,
"step": 1161
},
{
"epoch": 2.855036855036855,
"grad_norm": 1.6244239610624458,
"learning_rate": 2.686703096539162e-06,
"loss": 0.4585,
"step": 1162
},
{
"epoch": 2.8574938574938575,
"grad_norm": 0.22148267445895137,
"learning_rate": 2.6411657559198543e-06,
"loss": 0.3932,
"step": 1163
},
{
"epoch": 2.85995085995086,
"grad_norm": 0.23686699702884864,
"learning_rate": 2.595628415300547e-06,
"loss": 0.4043,
"step": 1164
},
{
"epoch": 2.8624078624078626,
"grad_norm": 0.2253190526682749,
"learning_rate": 2.550091074681239e-06,
"loss": 0.3865,
"step": 1165
},
{
"epoch": 2.864864864864865,
"grad_norm": 0.23283682010046694,
"learning_rate": 2.5045537340619306e-06,
"loss": 0.4241,
"step": 1166
},
{
"epoch": 2.8673218673218672,
"grad_norm": 0.22637501007112718,
"learning_rate": 2.459016393442623e-06,
"loss": 0.4535,
"step": 1167
},
{
"epoch": 2.8697788697788695,
"grad_norm": 0.23528102347160695,
"learning_rate": 2.4134790528233153e-06,
"loss": 0.485,
"step": 1168
},
{
"epoch": 2.8722358722358723,
"grad_norm": 0.23425484374934466,
"learning_rate": 2.3679417122040074e-06,
"loss": 0.4475,
"step": 1169
},
{
"epoch": 2.8746928746928746,
"grad_norm": 0.22149092071411625,
"learning_rate": 2.3224043715847e-06,
"loss": 0.4022,
"step": 1170
},
{
"epoch": 2.8771498771498774,
"grad_norm": 0.23427790871326182,
"learning_rate": 2.2768670309653916e-06,
"loss": 0.4638,
"step": 1171
},
{
"epoch": 2.8796068796068797,
"grad_norm": 0.23231854684157077,
"learning_rate": 2.2313296903460837e-06,
"loss": 0.4394,
"step": 1172
},
{
"epoch": 2.882063882063882,
"grad_norm": 0.2342789974677895,
"learning_rate": 2.185792349726776e-06,
"loss": 0.4825,
"step": 1173
},
{
"epoch": 2.8845208845208843,
"grad_norm": 0.22158002172153052,
"learning_rate": 2.1402550091074684e-06,
"loss": 0.386,
"step": 1174
},
{
"epoch": 2.886977886977887,
"grad_norm": 0.20696142757418035,
"learning_rate": 2.0947176684881605e-06,
"loss": 0.374,
"step": 1175
},
{
"epoch": 2.8894348894348894,
"grad_norm": 0.23895640881238192,
"learning_rate": 2.049180327868852e-06,
"loss": 0.4433,
"step": 1176
},
{
"epoch": 2.891891891891892,
"grad_norm": 0.21999586387865822,
"learning_rate": 2.0036429872495447e-06,
"loss": 0.3954,
"step": 1177
},
{
"epoch": 2.8943488943488944,
"grad_norm": 7.714103219123681,
"learning_rate": 1.958105646630237e-06,
"loss": 0.5093,
"step": 1178
},
{
"epoch": 2.8968058968058967,
"grad_norm": 0.20872227311945366,
"learning_rate": 1.912568306010929e-06,
"loss": 0.3708,
"step": 1179
},
{
"epoch": 2.899262899262899,
"grad_norm": 0.23835713585529297,
"learning_rate": 1.8670309653916213e-06,
"loss": 0.4284,
"step": 1180
},
{
"epoch": 2.901719901719902,
"grad_norm": 0.22864449909911705,
"learning_rate": 1.8214936247723136e-06,
"loss": 0.428,
"step": 1181
},
{
"epoch": 2.904176904176904,
"grad_norm": 0.2406324576550951,
"learning_rate": 1.7759562841530055e-06,
"loss": 0.4624,
"step": 1182
},
{
"epoch": 2.906633906633907,
"grad_norm": 0.23431139400422057,
"learning_rate": 1.7304189435336977e-06,
"loss": 0.4387,
"step": 1183
},
{
"epoch": 2.909090909090909,
"grad_norm": 0.21843455420917768,
"learning_rate": 1.68488160291439e-06,
"loss": 0.4249,
"step": 1184
},
{
"epoch": 2.9115479115479115,
"grad_norm": 0.2095922735664185,
"learning_rate": 1.639344262295082e-06,
"loss": 0.395,
"step": 1185
},
{
"epoch": 2.914004914004914,
"grad_norm": 0.2269069829992154,
"learning_rate": 1.5938069216757744e-06,
"loss": 0.4245,
"step": 1186
},
{
"epoch": 2.9164619164619165,
"grad_norm": 0.2119797460187829,
"learning_rate": 1.5482695810564663e-06,
"loss": 0.3898,
"step": 1187
},
{
"epoch": 2.918918918918919,
"grad_norm": 0.23711678102007225,
"learning_rate": 1.5027322404371585e-06,
"loss": 0.4812,
"step": 1188
},
{
"epoch": 2.9213759213759216,
"grad_norm": 0.2191981863306329,
"learning_rate": 1.4571948998178508e-06,
"loss": 0.4322,
"step": 1189
},
{
"epoch": 2.923832923832924,
"grad_norm": 0.22589359686916735,
"learning_rate": 1.411657559198543e-06,
"loss": 0.3886,
"step": 1190
},
{
"epoch": 2.9262899262899262,
"grad_norm": 0.24232246496079973,
"learning_rate": 1.366120218579235e-06,
"loss": 0.4787,
"step": 1191
},
{
"epoch": 2.9287469287469285,
"grad_norm": 0.23337563262261982,
"learning_rate": 1.3205828779599271e-06,
"loss": 0.4604,
"step": 1192
},
{
"epoch": 2.9312039312039313,
"grad_norm": 0.22211870990294277,
"learning_rate": 1.2750455373406195e-06,
"loss": 0.4262,
"step": 1193
},
{
"epoch": 2.9336609336609336,
"grad_norm": 0.2299266125727697,
"learning_rate": 1.2295081967213116e-06,
"loss": 0.4976,
"step": 1194
},
{
"epoch": 2.9361179361179364,
"grad_norm": 0.2213619140432279,
"learning_rate": 1.1839708561020037e-06,
"loss": 0.4219,
"step": 1195
},
{
"epoch": 2.9385749385749387,
"grad_norm": 0.21746767504739525,
"learning_rate": 1.1384335154826958e-06,
"loss": 0.4351,
"step": 1196
},
{
"epoch": 2.941031941031941,
"grad_norm": 0.23322041860850679,
"learning_rate": 1.092896174863388e-06,
"loss": 0.4407,
"step": 1197
},
{
"epoch": 2.9434889434889433,
"grad_norm": 0.21436428057177767,
"learning_rate": 1.0473588342440803e-06,
"loss": 0.3869,
"step": 1198
},
{
"epoch": 2.945945945945946,
"grad_norm": 0.23891934387649086,
"learning_rate": 1.0018214936247724e-06,
"loss": 0.4645,
"step": 1199
},
{
"epoch": 2.9484029484029484,
"grad_norm": 0.2278055983373202,
"learning_rate": 9.562841530054645e-07,
"loss": 0.4638,
"step": 1200
},
{
"epoch": 2.950859950859951,
"grad_norm": 0.3776834168448288,
"learning_rate": 9.107468123861568e-07,
"loss": 0.3994,
"step": 1201
},
{
"epoch": 2.9533169533169534,
"grad_norm": 0.22934397142430324,
"learning_rate": 8.652094717668488e-07,
"loss": 0.4639,
"step": 1202
},
{
"epoch": 2.9557739557739557,
"grad_norm": 0.2063303716692425,
"learning_rate": 8.19672131147541e-07,
"loss": 0.3707,
"step": 1203
},
{
"epoch": 2.958230958230958,
"grad_norm": 0.2351672804832617,
"learning_rate": 7.741347905282332e-07,
"loss": 0.4168,
"step": 1204
},
{
"epoch": 2.960687960687961,
"grad_norm": 0.22161703857940737,
"learning_rate": 7.285974499089254e-07,
"loss": 0.4014,
"step": 1205
},
{
"epoch": 2.963144963144963,
"grad_norm": 0.3343193912872951,
"learning_rate": 6.830601092896175e-07,
"loss": 0.3978,
"step": 1206
},
{
"epoch": 2.965601965601966,
"grad_norm": 0.22304950387987088,
"learning_rate": 6.375227686703097e-07,
"loss": 0.4135,
"step": 1207
},
{
"epoch": 2.968058968058968,
"grad_norm": 0.20953255552793454,
"learning_rate": 5.919854280510018e-07,
"loss": 0.3811,
"step": 1208
},
{
"epoch": 2.9705159705159705,
"grad_norm": 0.2329344600488229,
"learning_rate": 5.46448087431694e-07,
"loss": 0.4343,
"step": 1209
},
{
"epoch": 2.972972972972973,
"grad_norm": 0.21299980955076325,
"learning_rate": 5.009107468123862e-07,
"loss": 0.3967,
"step": 1210
},
{
"epoch": 2.9754299754299756,
"grad_norm": 0.22514110450209485,
"learning_rate": 4.553734061930784e-07,
"loss": 0.4594,
"step": 1211
},
{
"epoch": 2.977886977886978,
"grad_norm": 0.22706086722772673,
"learning_rate": 4.098360655737705e-07,
"loss": 0.4452,
"step": 1212
},
{
"epoch": 2.98034398034398,
"grad_norm": 0.22420225612233982,
"learning_rate": 3.642987249544627e-07,
"loss": 0.451,
"step": 1213
},
{
"epoch": 2.982800982800983,
"grad_norm": 0.2278750274266437,
"learning_rate": 3.1876138433515486e-07,
"loss": 0.4298,
"step": 1214
},
{
"epoch": 2.9852579852579852,
"grad_norm": 0.22450421754608937,
"learning_rate": 2.73224043715847e-07,
"loss": 0.4549,
"step": 1215
},
{
"epoch": 2.9877149877149876,
"grad_norm": 0.24947496879816358,
"learning_rate": 2.276867030965392e-07,
"loss": 0.4612,
"step": 1216
},
{
"epoch": 2.9901719901719903,
"grad_norm": 0.22458707836271088,
"learning_rate": 1.8214936247723135e-07,
"loss": 0.4051,
"step": 1217
},
{
"epoch": 2.9926289926289926,
"grad_norm": 0.21227609068441602,
"learning_rate": 1.366120218579235e-07,
"loss": 0.3807,
"step": 1218
},
{
"epoch": 2.995085995085995,
"grad_norm": 0.22465026975771382,
"learning_rate": 9.107468123861567e-08,
"loss": 0.4377,
"step": 1219
},
{
"epoch": 2.9975429975429977,
"grad_norm": 0.22388402187392278,
"learning_rate": 4.553734061930784e-08,
"loss": 0.4133,
"step": 1220
},
{
"epoch": 3.0,
"grad_norm": 0.2335653945488078,
"learning_rate": 0.0,
"loss": 0.4107,
"step": 1221
},
{
"epoch": 3.0,
"step": 1221,
"total_flos": 1.0279209431224812e+18,
"train_loss": 0.6874593345968573,
"train_runtime": 70837.5294,
"train_samples_per_second": 0.275,
"train_steps_per_second": 0.017
}
],
"logging_steps": 1,
"max_steps": 1221,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0279209431224812e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}