diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8589 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1221, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002457002457002457, + "grad_norm": 51.00658947196253, + "learning_rate": 4.0650406504065046e-07, + "loss": 11.5201, + "step": 1 + }, + { + "epoch": 0.004914004914004914, + "grad_norm": 41.32182940742031, + "learning_rate": 8.130081300813009e-07, + "loss": 11.7707, + "step": 2 + }, + { + "epoch": 0.007371007371007371, + "grad_norm": 44.33185040770822, + "learning_rate": 1.2195121951219514e-06, + "loss": 11.6448, + "step": 3 + }, + { + "epoch": 0.009828009828009828, + "grad_norm": 42.42836066927598, + "learning_rate": 1.6260162601626018e-06, + "loss": 11.8086, + "step": 4 + }, + { + "epoch": 0.012285012285012284, + "grad_norm": 53.775532456381285, + "learning_rate": 2.0325203252032523e-06, + "loss": 11.3357, + "step": 5 + }, + { + "epoch": 0.014742014742014743, + "grad_norm": 47.62249981599493, + "learning_rate": 2.4390243902439027e-06, + "loss": 11.3818, + "step": 6 + }, + { + "epoch": 0.0171990171990172, + "grad_norm": 55.13732030606171, + "learning_rate": 2.8455284552845528e-06, + "loss": 11.142, + "step": 7 + }, + { + "epoch": 0.019656019656019656, + "grad_norm": 62.52065331239275, + "learning_rate": 3.2520325203252037e-06, + "loss": 10.5585, + "step": 8 + }, + { + "epoch": 0.022113022113022112, + "grad_norm": 69.61760079081881, + "learning_rate": 3.6585365853658537e-06, + "loss": 10.4944, + "step": 9 + }, + { + "epoch": 0.02457002457002457, + "grad_norm": 101.43566538305599, + "learning_rate": 4.0650406504065046e-06, + "loss": 9.3135, + "step": 10 + }, + { + "epoch": 0.02702702702702703, + "grad_norm": 78.43100915045316, + "learning_rate": 4.471544715447155e-06, + "loss": 5.2457, + "step": 11 + }, + { + "epoch": 0.029484029484029485, + "grad_norm": 73.16500005409208, + "learning_rate": 4.8780487804878055e-06, + "loss": 5.3303, + "step": 12 + }, + { + "epoch": 0.03194103194103194, + "grad_norm": 47.86656696315455, + "learning_rate": 5.2845528455284555e-06, + "loss": 3.6113, + "step": 13 + }, + { + "epoch": 0.0343980343980344, + "grad_norm": 8.67227796195133, + "learning_rate": 5.6910569105691056e-06, + "loss": 2.1041, + "step": 14 + }, + { + "epoch": 0.036855036855036855, + "grad_norm": 7.283413327700143, + "learning_rate": 6.0975609756097564e-06, + "loss": 2.0566, + "step": 15 + }, + { + "epoch": 0.03931203931203931, + "grad_norm": 5.08768956210286, + "learning_rate": 6.504065040650407e-06, + "loss": 1.7846, + "step": 16 + }, + { + "epoch": 0.04176904176904177, + "grad_norm": 3.7989995223107624, + "learning_rate": 6.910569105691057e-06, + "loss": 1.6511, + "step": 17 + }, + { + "epoch": 0.044226044226044224, + "grad_norm": 3.5721481727371764, + "learning_rate": 7.317073170731707e-06, + "loss": 1.9222, + "step": 18 + }, + { + "epoch": 0.04668304668304668, + "grad_norm": 2.2512893668476988, + "learning_rate": 7.723577235772358e-06, + "loss": 1.6941, + "step": 19 + }, + { + "epoch": 0.04914004914004914, + "grad_norm": 2.274570626749542, + "learning_rate": 8.130081300813009e-06, + "loss": 1.3336, + "step": 20 + }, + { + "epoch": 0.051597051597051594, + "grad_norm": 1.759146954439502, + "learning_rate": 8.53658536585366e-06, + "loss": 1.6479, + "step": 21 + }, + { + "epoch": 0.05405405405405406, + "grad_norm": 1.5309831654707053, + "learning_rate": 8.94308943089431e-06, + "loss": 1.6839, + "step": 22 + }, + { + "epoch": 0.056511056511056514, + "grad_norm": 1.7232978302647235, + "learning_rate": 9.34959349593496e-06, + "loss": 1.4097, + "step": 23 + }, + { + "epoch": 0.05896805896805897, + "grad_norm": 0.9050344472252703, + "learning_rate": 9.756097560975611e-06, + "loss": 1.3058, + "step": 24 + }, + { + "epoch": 0.06142506142506143, + "grad_norm": 1.011046912711339, + "learning_rate": 1.016260162601626e-05, + "loss": 1.3016, + "step": 25 + }, + { + "epoch": 0.06388206388206388, + "grad_norm": 0.7633443815628498, + "learning_rate": 1.0569105691056911e-05, + "loss": 1.0767, + "step": 26 + }, + { + "epoch": 0.06633906633906633, + "grad_norm": 3.711382173921332, + "learning_rate": 1.0975609756097562e-05, + "loss": 1.2445, + "step": 27 + }, + { + "epoch": 0.0687960687960688, + "grad_norm": 0.9084685938028465, + "learning_rate": 1.1382113821138211e-05, + "loss": 1.3219, + "step": 28 + }, + { + "epoch": 0.07125307125307126, + "grad_norm": 0.7278631490873225, + "learning_rate": 1.1788617886178862e-05, + "loss": 1.344, + "step": 29 + }, + { + "epoch": 0.07371007371007371, + "grad_norm": 0.8118157029372023, + "learning_rate": 1.2195121951219513e-05, + "loss": 1.2438, + "step": 30 + }, + { + "epoch": 0.07616707616707617, + "grad_norm": 0.7833625240021413, + "learning_rate": 1.2601626016260162e-05, + "loss": 1.3219, + "step": 31 + }, + { + "epoch": 0.07862407862407862, + "grad_norm": 0.5647517543959654, + "learning_rate": 1.3008130081300815e-05, + "loss": 0.9937, + "step": 32 + }, + { + "epoch": 0.08108108108108109, + "grad_norm": 0.6618891423388924, + "learning_rate": 1.3414634146341466e-05, + "loss": 0.9841, + "step": 33 + }, + { + "epoch": 0.08353808353808354, + "grad_norm": 0.8111611221708586, + "learning_rate": 1.3821138211382115e-05, + "loss": 1.1497, + "step": 34 + }, + { + "epoch": 0.085995085995086, + "grad_norm": 0.5989650541769594, + "learning_rate": 1.4227642276422764e-05, + "loss": 1.0802, + "step": 35 + }, + { + "epoch": 0.08845208845208845, + "grad_norm": 0.5601453621258067, + "learning_rate": 1.4634146341463415e-05, + "loss": 0.9953, + "step": 36 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 0.546088498086313, + "learning_rate": 1.5040650406504067e-05, + "loss": 0.9963, + "step": 37 + }, + { + "epoch": 0.09336609336609336, + "grad_norm": 0.5184238539089115, + "learning_rate": 1.5447154471544717e-05, + "loss": 0.9913, + "step": 38 + }, + { + "epoch": 0.09582309582309582, + "grad_norm": 0.4859364925634494, + "learning_rate": 1.5853658536585366e-05, + "loss": 0.9131, + "step": 39 + }, + { + "epoch": 0.09828009828009827, + "grad_norm": 0.5263041709899442, + "learning_rate": 1.6260162601626018e-05, + "loss": 1.2145, + "step": 40 + }, + { + "epoch": 0.10073710073710074, + "grad_norm": 0.48323570525096055, + "learning_rate": 1.6666666666666667e-05, + "loss": 1.093, + "step": 41 + }, + { + "epoch": 0.10319410319410319, + "grad_norm": 0.39786588976887655, + "learning_rate": 1.707317073170732e-05, + "loss": 1.0228, + "step": 42 + }, + { + "epoch": 0.10565110565110565, + "grad_norm": 0.4353003519119437, + "learning_rate": 1.747967479674797e-05, + "loss": 0.8313, + "step": 43 + }, + { + "epoch": 0.10810810810810811, + "grad_norm": 0.5099701425209221, + "learning_rate": 1.788617886178862e-05, + "loss": 1.0385, + "step": 44 + }, + { + "epoch": 0.11056511056511056, + "grad_norm": 0.4247114077933769, + "learning_rate": 1.8292682926829268e-05, + "loss": 1.0362, + "step": 45 + }, + { + "epoch": 0.11302211302211303, + "grad_norm": 0.4278915097042231, + "learning_rate": 1.869918699186992e-05, + "loss": 1.0663, + "step": 46 + }, + { + "epoch": 0.11547911547911548, + "grad_norm": 0.42504196406677935, + "learning_rate": 1.9105691056910573e-05, + "loss": 1.065, + "step": 47 + }, + { + "epoch": 0.11793611793611794, + "grad_norm": 0.39322409819280146, + "learning_rate": 1.9512195121951222e-05, + "loss": 0.831, + "step": 48 + }, + { + "epoch": 0.12039312039312039, + "grad_norm": 0.38178816332973403, + "learning_rate": 1.991869918699187e-05, + "loss": 1.024, + "step": 49 + }, + { + "epoch": 0.12285012285012285, + "grad_norm": 0.32488634343203454, + "learning_rate": 2.032520325203252e-05, + "loss": 0.8349, + "step": 50 + }, + { + "epoch": 0.12530712530712532, + "grad_norm": 0.3782484182668685, + "learning_rate": 2.073170731707317e-05, + "loss": 0.9342, + "step": 51 + }, + { + "epoch": 0.12776412776412777, + "grad_norm": 0.36030842714472017, + "learning_rate": 2.1138211382113822e-05, + "loss": 1.0332, + "step": 52 + }, + { + "epoch": 0.13022113022113022, + "grad_norm": 0.3504763804177174, + "learning_rate": 2.1544715447154475e-05, + "loss": 1.0438, + "step": 53 + }, + { + "epoch": 0.13267813267813267, + "grad_norm": 0.3121087782309304, + "learning_rate": 2.1951219512195124e-05, + "loss": 0.8683, + "step": 54 + }, + { + "epoch": 0.13513513513513514, + "grad_norm": 0.4132956337094442, + "learning_rate": 2.2357723577235773e-05, + "loss": 1.0546, + "step": 55 + }, + { + "epoch": 0.1375921375921376, + "grad_norm": 0.3356523934810729, + "learning_rate": 2.2764227642276422e-05, + "loss": 0.8938, + "step": 56 + }, + { + "epoch": 0.14004914004914004, + "grad_norm": 0.3394592355583364, + "learning_rate": 2.3170731707317075e-05, + "loss": 0.8029, + "step": 57 + }, + { + "epoch": 0.14250614250614252, + "grad_norm": 0.9025202002420913, + "learning_rate": 2.3577235772357724e-05, + "loss": 0.836, + "step": 58 + }, + { + "epoch": 0.14496314496314497, + "grad_norm": 0.33122567373181955, + "learning_rate": 2.3983739837398377e-05, + "loss": 0.9265, + "step": 59 + }, + { + "epoch": 0.14742014742014742, + "grad_norm": 0.386487486127247, + "learning_rate": 2.4390243902439026e-05, + "loss": 1.0522, + "step": 60 + }, + { + "epoch": 0.14987714987714987, + "grad_norm": 0.3180551010846452, + "learning_rate": 2.4796747967479675e-05, + "loss": 0.8465, + "step": 61 + }, + { + "epoch": 0.15233415233415235, + "grad_norm": 0.3886943015332388, + "learning_rate": 2.5203252032520324e-05, + "loss": 0.8106, + "step": 62 + }, + { + "epoch": 0.1547911547911548, + "grad_norm": 0.36483367577896464, + "learning_rate": 2.5609756097560977e-05, + "loss": 0.9219, + "step": 63 + }, + { + "epoch": 0.15724815724815724, + "grad_norm": 0.3476022996526318, + "learning_rate": 2.601626016260163e-05, + "loss": 0.7888, + "step": 64 + }, + { + "epoch": 0.1597051597051597, + "grad_norm": 0.33332604952333145, + "learning_rate": 2.642276422764228e-05, + "loss": 0.8336, + "step": 65 + }, + { + "epoch": 0.16216216216216217, + "grad_norm": 0.3210922545885254, + "learning_rate": 2.682926829268293e-05, + "loss": 0.8325, + "step": 66 + }, + { + "epoch": 0.16461916461916462, + "grad_norm": 0.34166973327455336, + "learning_rate": 2.7235772357723577e-05, + "loss": 0.8286, + "step": 67 + }, + { + "epoch": 0.16707616707616707, + "grad_norm": 0.2772568838407044, + "learning_rate": 2.764227642276423e-05, + "loss": 0.6857, + "step": 68 + }, + { + "epoch": 0.16953316953316952, + "grad_norm": 1.0755515057999228, + "learning_rate": 2.8048780487804882e-05, + "loss": 0.8337, + "step": 69 + }, + { + "epoch": 0.171990171990172, + "grad_norm": 0.774515155788574, + "learning_rate": 2.8455284552845528e-05, + "loss": 0.9026, + "step": 70 + }, + { + "epoch": 0.17444717444717445, + "grad_norm": 0.343996443532602, + "learning_rate": 2.886178861788618e-05, + "loss": 0.858, + "step": 71 + }, + { + "epoch": 0.1769041769041769, + "grad_norm": 0.37464578169776397, + "learning_rate": 2.926829268292683e-05, + "loss": 0.8868, + "step": 72 + }, + { + "epoch": 0.17936117936117937, + "grad_norm": 0.30780292194750675, + "learning_rate": 2.9674796747967482e-05, + "loss": 0.8738, + "step": 73 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 0.7080920118302183, + "learning_rate": 3.0081300813008135e-05, + "loss": 1.0803, + "step": 74 + }, + { + "epoch": 0.18427518427518427, + "grad_norm": 0.3195310398410445, + "learning_rate": 3.048780487804878e-05, + "loss": 0.7552, + "step": 75 + }, + { + "epoch": 0.18673218673218672, + "grad_norm": 0.3398724677379115, + "learning_rate": 3.089430894308943e-05, + "loss": 0.915, + "step": 76 + }, + { + "epoch": 0.1891891891891892, + "grad_norm": 1.409643650018611, + "learning_rate": 3.130081300813008e-05, + "loss": 0.8522, + "step": 77 + }, + { + "epoch": 0.19164619164619165, + "grad_norm": 0.9400426414745835, + "learning_rate": 3.170731707317073e-05, + "loss": 0.9234, + "step": 78 + }, + { + "epoch": 0.1941031941031941, + "grad_norm": 0.44549756510252503, + "learning_rate": 3.2113821138211384e-05, + "loss": 0.8354, + "step": 79 + }, + { + "epoch": 0.19656019656019655, + "grad_norm": 0.31409628217862606, + "learning_rate": 3.2520325203252037e-05, + "loss": 0.8491, + "step": 80 + }, + { + "epoch": 0.19901719901719903, + "grad_norm": 0.4537000801486613, + "learning_rate": 3.292682926829269e-05, + "loss": 0.85, + "step": 81 + }, + { + "epoch": 0.20147420147420148, + "grad_norm": 0.42406673549654195, + "learning_rate": 3.3333333333333335e-05, + "loss": 1.053, + "step": 82 + }, + { + "epoch": 0.20393120393120392, + "grad_norm": 0.3789788855142771, + "learning_rate": 3.373983739837399e-05, + "loss": 0.8627, + "step": 83 + }, + { + "epoch": 0.20638820638820637, + "grad_norm": 0.408375242705326, + "learning_rate": 3.414634146341464e-05, + "loss": 0.9088, + "step": 84 + }, + { + "epoch": 0.20884520884520885, + "grad_norm": 0.4269743612762991, + "learning_rate": 3.4552845528455286e-05, + "loss": 0.9017, + "step": 85 + }, + { + "epoch": 0.2113022113022113, + "grad_norm": 0.3983104483895218, + "learning_rate": 3.495934959349594e-05, + "loss": 0.8781, + "step": 86 + }, + { + "epoch": 0.21375921375921375, + "grad_norm": 0.4289837220182789, + "learning_rate": 3.5365853658536584e-05, + "loss": 0.7913, + "step": 87 + }, + { + "epoch": 0.21621621621621623, + "grad_norm": 0.4383253801829447, + "learning_rate": 3.577235772357724e-05, + "loss": 0.8579, + "step": 88 + }, + { + "epoch": 0.21867321867321868, + "grad_norm": 0.3815114297981113, + "learning_rate": 3.617886178861789e-05, + "loss": 0.7926, + "step": 89 + }, + { + "epoch": 0.22113022113022113, + "grad_norm": 0.4460874540522612, + "learning_rate": 3.6585365853658535e-05, + "loss": 0.8682, + "step": 90 + }, + { + "epoch": 0.22358722358722358, + "grad_norm": 0.4242618487534378, + "learning_rate": 3.699186991869919e-05, + "loss": 0.8574, + "step": 91 + }, + { + "epoch": 0.22604422604422605, + "grad_norm": 0.3784544099868278, + "learning_rate": 3.739837398373984e-05, + "loss": 0.7585, + "step": 92 + }, + { + "epoch": 0.2285012285012285, + "grad_norm": 0.4216052185506308, + "learning_rate": 3.780487804878049e-05, + "loss": 0.7668, + "step": 93 + }, + { + "epoch": 0.23095823095823095, + "grad_norm": 0.43197147956134363, + "learning_rate": 3.8211382113821145e-05, + "loss": 0.9439, + "step": 94 + }, + { + "epoch": 0.2334152334152334, + "grad_norm": 0.35661007106689985, + "learning_rate": 3.861788617886179e-05, + "loss": 0.745, + "step": 95 + }, + { + "epoch": 0.23587223587223588, + "grad_norm": 0.35038751371475896, + "learning_rate": 3.9024390243902444e-05, + "loss": 0.8148, + "step": 96 + }, + { + "epoch": 0.23832923832923833, + "grad_norm": 0.3269434336747683, + "learning_rate": 3.943089430894309e-05, + "loss": 0.7587, + "step": 97 + }, + { + "epoch": 0.24078624078624078, + "grad_norm": 0.3927980260401744, + "learning_rate": 3.983739837398374e-05, + "loss": 0.8683, + "step": 98 + }, + { + "epoch": 0.24324324324324326, + "grad_norm": 0.3655921622998464, + "learning_rate": 4.0243902439024395e-05, + "loss": 0.7903, + "step": 99 + }, + { + "epoch": 0.2457002457002457, + "grad_norm": 7.434933759364144, + "learning_rate": 4.065040650406504e-05, + "loss": 0.9234, + "step": 100 + }, + { + "epoch": 0.24815724815724816, + "grad_norm": 0.5057132697370877, + "learning_rate": 4.105691056910569e-05, + "loss": 0.8459, + "step": 101 + }, + { + "epoch": 0.25061425061425063, + "grad_norm": 0.35608438610939613, + "learning_rate": 4.146341463414634e-05, + "loss": 0.9301, + "step": 102 + }, + { + "epoch": 0.25307125307125306, + "grad_norm": 0.4378277298361604, + "learning_rate": 4.186991869918699e-05, + "loss": 0.751, + "step": 103 + }, + { + "epoch": 0.25552825552825553, + "grad_norm": 0.39957854645534735, + "learning_rate": 4.2276422764227644e-05, + "loss": 0.6775, + "step": 104 + }, + { + "epoch": 0.257985257985258, + "grad_norm": 0.4425372497170904, + "learning_rate": 4.26829268292683e-05, + "loss": 0.7637, + "step": 105 + }, + { + "epoch": 0.26044226044226043, + "grad_norm": 0.5055020698531032, + "learning_rate": 4.308943089430895e-05, + "loss": 0.9547, + "step": 106 + }, + { + "epoch": 0.2628992628992629, + "grad_norm": 0.48084566592201927, + "learning_rate": 4.3495934959349595e-05, + "loss": 0.7968, + "step": 107 + }, + { + "epoch": 0.26535626535626533, + "grad_norm": 0.44969395374862164, + "learning_rate": 4.390243902439025e-05, + "loss": 0.9182, + "step": 108 + }, + { + "epoch": 0.2678132678132678, + "grad_norm": 0.5044687667724931, + "learning_rate": 4.43089430894309e-05, + "loss": 0.8516, + "step": 109 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 0.46505802585107076, + "learning_rate": 4.4715447154471546e-05, + "loss": 0.8654, + "step": 110 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 0.5806796709997633, + "learning_rate": 4.51219512195122e-05, + "loss": 0.906, + "step": 111 + }, + { + "epoch": 0.2751842751842752, + "grad_norm": 0.4210793238854805, + "learning_rate": 4.5528455284552844e-05, + "loss": 0.7159, + "step": 112 + }, + { + "epoch": 0.27764127764127766, + "grad_norm": 0.45861184290594337, + "learning_rate": 4.59349593495935e-05, + "loss": 0.8742, + "step": 113 + }, + { + "epoch": 0.2800982800982801, + "grad_norm": 0.474280060915593, + "learning_rate": 4.634146341463415e-05, + "loss": 0.9078, + "step": 114 + }, + { + "epoch": 0.28255528255528256, + "grad_norm": 3.4497188666996608, + "learning_rate": 4.6747967479674795e-05, + "loss": 0.7941, + "step": 115 + }, + { + "epoch": 0.28501228501228504, + "grad_norm": 0.6859838903958281, + "learning_rate": 4.715447154471545e-05, + "loss": 0.829, + "step": 116 + }, + { + "epoch": 0.28746928746928746, + "grad_norm": 0.4257855688775576, + "learning_rate": 4.75609756097561e-05, + "loss": 0.684, + "step": 117 + }, + { + "epoch": 0.28992628992628994, + "grad_norm": 0.7209470061472436, + "learning_rate": 4.796747967479675e-05, + "loss": 0.8426, + "step": 118 + }, + { + "epoch": 0.29238329238329236, + "grad_norm": 0.4348904611702599, + "learning_rate": 4.8373983739837406e-05, + "loss": 0.8974, + "step": 119 + }, + { + "epoch": 0.29484029484029484, + "grad_norm": 0.6022119693773859, + "learning_rate": 4.878048780487805e-05, + "loss": 0.7381, + "step": 120 + }, + { + "epoch": 0.2972972972972973, + "grad_norm": 0.49356434831001184, + "learning_rate": 4.9186991869918704e-05, + "loss": 0.8491, + "step": 121 + }, + { + "epoch": 0.29975429975429974, + "grad_norm": 0.5199694383515181, + "learning_rate": 4.959349593495935e-05, + "loss": 0.9368, + "step": 122 + }, + { + "epoch": 0.3022113022113022, + "grad_norm": 0.6287882015300568, + "learning_rate": 5e-05, + "loss": 0.7733, + "step": 123 + }, + { + "epoch": 0.3046683046683047, + "grad_norm": 0.47882310869561157, + "learning_rate": 4.99544626593807e-05, + "loss": 0.828, + "step": 124 + }, + { + "epoch": 0.3071253071253071, + "grad_norm": 0.47321265037200055, + "learning_rate": 4.990892531876138e-05, + "loss": 0.7783, + "step": 125 + }, + { + "epoch": 0.3095823095823096, + "grad_norm": 5.521364106361822, + "learning_rate": 4.986338797814208e-05, + "loss": 0.8306, + "step": 126 + }, + { + "epoch": 0.31203931203931207, + "grad_norm": 0.6130054000501296, + "learning_rate": 4.9817850637522776e-05, + "loss": 0.7226, + "step": 127 + }, + { + "epoch": 0.3144963144963145, + "grad_norm": 0.5002677223225025, + "learning_rate": 4.977231329690346e-05, + "loss": 0.842, + "step": 128 + }, + { + "epoch": 0.31695331695331697, + "grad_norm": 0.5097826151334071, + "learning_rate": 4.9726775956284156e-05, + "loss": 0.8517, + "step": 129 + }, + { + "epoch": 0.3194103194103194, + "grad_norm": 0.4823779081669877, + "learning_rate": 4.9681238615664846e-05, + "loss": 0.6543, + "step": 130 + }, + { + "epoch": 0.32186732186732187, + "grad_norm": 0.6212379815651925, + "learning_rate": 4.9635701275045536e-05, + "loss": 0.8558, + "step": 131 + }, + { + "epoch": 0.32432432432432434, + "grad_norm": 0.46984602438360945, + "learning_rate": 4.959016393442623e-05, + "loss": 0.9246, + "step": 132 + }, + { + "epoch": 0.32678132678132676, + "grad_norm": 0.4983418640829827, + "learning_rate": 4.954462659380692e-05, + "loss": 0.7805, + "step": 133 + }, + { + "epoch": 0.32923832923832924, + "grad_norm": 0.6115973164236492, + "learning_rate": 4.949908925318761e-05, + "loss": 0.8777, + "step": 134 + }, + { + "epoch": 0.3316953316953317, + "grad_norm": 0.425339912596782, + "learning_rate": 4.945355191256831e-05, + "loss": 0.8364, + "step": 135 + }, + { + "epoch": 0.33415233415233414, + "grad_norm": 0.5081656348291814, + "learning_rate": 4.9408014571949e-05, + "loss": 0.8182, + "step": 136 + }, + { + "epoch": 0.3366093366093366, + "grad_norm": 0.4507877289201634, + "learning_rate": 4.936247723132969e-05, + "loss": 0.7059, + "step": 137 + }, + { + "epoch": 0.33906633906633904, + "grad_norm": 0.4259156527505649, + "learning_rate": 4.9316939890710386e-05, + "loss": 0.7337, + "step": 138 + }, + { + "epoch": 0.3415233415233415, + "grad_norm": 0.4870288608531628, + "learning_rate": 4.9271402550091076e-05, + "loss": 0.6361, + "step": 139 + }, + { + "epoch": 0.343980343980344, + "grad_norm": 0.3911986963597503, + "learning_rate": 4.9225865209471766e-05, + "loss": 0.7603, + "step": 140 + }, + { + "epoch": 0.3464373464373464, + "grad_norm": 0.7264714511948328, + "learning_rate": 4.918032786885246e-05, + "loss": 0.9062, + "step": 141 + }, + { + "epoch": 0.3488943488943489, + "grad_norm": 0.47242198367565236, + "learning_rate": 4.913479052823315e-05, + "loss": 0.7663, + "step": 142 + }, + { + "epoch": 0.35135135135135137, + "grad_norm": 0.47328040906145535, + "learning_rate": 4.908925318761385e-05, + "loss": 0.6299, + "step": 143 + }, + { + "epoch": 0.3538083538083538, + "grad_norm": 0.5189395807696658, + "learning_rate": 4.904371584699454e-05, + "loss": 0.7318, + "step": 144 + }, + { + "epoch": 0.35626535626535627, + "grad_norm": 0.45277914852216605, + "learning_rate": 4.899817850637523e-05, + "loss": 0.7552, + "step": 145 + }, + { + "epoch": 0.35872235872235875, + "grad_norm": 0.4495050775600071, + "learning_rate": 4.8952641165755927e-05, + "loss": 0.7726, + "step": 146 + }, + { + "epoch": 0.36117936117936117, + "grad_norm": 0.5562436967325994, + "learning_rate": 4.890710382513661e-05, + "loss": 0.7093, + "step": 147 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 0.4021268067280976, + "learning_rate": 4.8861566484517307e-05, + "loss": 0.7734, + "step": 148 + }, + { + "epoch": 0.36609336609336607, + "grad_norm": 0.5900140052185344, + "learning_rate": 4.8816029143898e-05, + "loss": 0.898, + "step": 149 + }, + { + "epoch": 0.36855036855036855, + "grad_norm": 0.40531749565353, + "learning_rate": 4.8770491803278687e-05, + "loss": 0.8673, + "step": 150 + }, + { + "epoch": 0.371007371007371, + "grad_norm": 0.5257544978960317, + "learning_rate": 4.872495446265938e-05, + "loss": 0.8524, + "step": 151 + }, + { + "epoch": 0.37346437346437344, + "grad_norm": 0.3709941280583937, + "learning_rate": 4.867941712204008e-05, + "loss": 0.7762, + "step": 152 + }, + { + "epoch": 0.3759213759213759, + "grad_norm": 0.6733613092717959, + "learning_rate": 4.863387978142076e-05, + "loss": 0.7244, + "step": 153 + }, + { + "epoch": 0.3783783783783784, + "grad_norm": 0.38207770249074585, + "learning_rate": 4.858834244080146e-05, + "loss": 0.7583, + "step": 154 + }, + { + "epoch": 0.3808353808353808, + "grad_norm": 0.6562455849116627, + "learning_rate": 4.854280510018216e-05, + "loss": 0.8073, + "step": 155 + }, + { + "epoch": 0.3832923832923833, + "grad_norm": 0.39748554973014666, + "learning_rate": 4.849726775956284e-05, + "loss": 0.7228, + "step": 156 + }, + { + "epoch": 0.3857493857493858, + "grad_norm": 0.3661785544020982, + "learning_rate": 4.845173041894354e-05, + "loss": 0.6666, + "step": 157 + }, + { + "epoch": 0.3882063882063882, + "grad_norm": 0.5268622303781758, + "learning_rate": 4.840619307832423e-05, + "loss": 0.7831, + "step": 158 + }, + { + "epoch": 0.3906633906633907, + "grad_norm": 0.49812730319416026, + "learning_rate": 4.836065573770492e-05, + "loss": 0.7754, + "step": 159 + }, + { + "epoch": 0.3931203931203931, + "grad_norm": 0.4157898663987463, + "learning_rate": 4.8315118397085614e-05, + "loss": 0.7964, + "step": 160 + }, + { + "epoch": 0.3955773955773956, + "grad_norm": 0.47307293289652125, + "learning_rate": 4.8269581056466304e-05, + "loss": 0.8665, + "step": 161 + }, + { + "epoch": 0.39803439803439805, + "grad_norm": 0.5347821089983137, + "learning_rate": 4.8224043715846994e-05, + "loss": 0.9031, + "step": 162 + }, + { + "epoch": 0.4004914004914005, + "grad_norm": 0.4416804543349193, + "learning_rate": 4.817850637522769e-05, + "loss": 0.8018, + "step": 163 + }, + { + "epoch": 0.40294840294840295, + "grad_norm": 0.38242798734043876, + "learning_rate": 4.813296903460838e-05, + "loss": 0.8083, + "step": 164 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 0.46628473043304297, + "learning_rate": 4.808743169398907e-05, + "loss": 0.7786, + "step": 165 + }, + { + "epoch": 0.40786240786240785, + "grad_norm": 0.8957454928357931, + "learning_rate": 4.804189435336977e-05, + "loss": 0.7697, + "step": 166 + }, + { + "epoch": 0.4103194103194103, + "grad_norm": 0.7840441840091149, + "learning_rate": 4.799635701275046e-05, + "loss": 0.9073, + "step": 167 + }, + { + "epoch": 0.41277641277641275, + "grad_norm": 0.7288094214817507, + "learning_rate": 4.795081967213115e-05, + "loss": 0.7594, + "step": 168 + }, + { + "epoch": 0.4152334152334152, + "grad_norm": 0.6041127252970878, + "learning_rate": 4.7905282331511844e-05, + "loss": 0.7313, + "step": 169 + }, + { + "epoch": 0.4176904176904177, + "grad_norm": 0.8145997847617484, + "learning_rate": 4.7859744990892534e-05, + "loss": 0.6774, + "step": 170 + }, + { + "epoch": 0.4201474201474201, + "grad_norm": 0.5528790249808274, + "learning_rate": 4.7814207650273224e-05, + "loss": 0.8161, + "step": 171 + }, + { + "epoch": 0.4226044226044226, + "grad_norm": 0.5818892388787992, + "learning_rate": 4.776867030965392e-05, + "loss": 0.6353, + "step": 172 + }, + { + "epoch": 0.4250614250614251, + "grad_norm": 0.5028845858663835, + "learning_rate": 4.772313296903461e-05, + "loss": 0.7811, + "step": 173 + }, + { + "epoch": 0.4275184275184275, + "grad_norm": 0.5094764920597807, + "learning_rate": 4.76775956284153e-05, + "loss": 0.77, + "step": 174 + }, + { + "epoch": 0.42997542997543, + "grad_norm": 0.40339340341267327, + "learning_rate": 4.7632058287796e-05, + "loss": 0.6796, + "step": 175 + }, + { + "epoch": 0.43243243243243246, + "grad_norm": 0.47577207705852176, + "learning_rate": 4.758652094717669e-05, + "loss": 0.7034, + "step": 176 + }, + { + "epoch": 0.4348894348894349, + "grad_norm": 0.39888134217182175, + "learning_rate": 4.754098360655738e-05, + "loss": 0.6607, + "step": 177 + }, + { + "epoch": 0.43734643734643736, + "grad_norm": 0.3965895014017134, + "learning_rate": 4.749544626593807e-05, + "loss": 0.7624, + "step": 178 + }, + { + "epoch": 0.4398034398034398, + "grad_norm": 0.4709202993164332, + "learning_rate": 4.7449908925318764e-05, + "loss": 0.8225, + "step": 179 + }, + { + "epoch": 0.44226044226044225, + "grad_norm": 0.382474212228653, + "learning_rate": 4.740437158469946e-05, + "loss": 0.8546, + "step": 180 + }, + { + "epoch": 0.44471744471744473, + "grad_norm": 0.4231565796785838, + "learning_rate": 4.7358834244080144e-05, + "loss": 0.771, + "step": 181 + }, + { + "epoch": 0.44717444717444715, + "grad_norm": 0.38054832898962976, + "learning_rate": 4.731329690346084e-05, + "loss": 0.6595, + "step": 182 + }, + { + "epoch": 0.44963144963144963, + "grad_norm": 0.3547946010093686, + "learning_rate": 4.726775956284154e-05, + "loss": 0.6817, + "step": 183 + }, + { + "epoch": 0.4520884520884521, + "grad_norm": 0.3945726785571152, + "learning_rate": 4.722222222222222e-05, + "loss": 0.7525, + "step": 184 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.3928424227592678, + "learning_rate": 4.717668488160292e-05, + "loss": 0.7477, + "step": 185 + }, + { + "epoch": 0.457002457002457, + "grad_norm": 0.4426304351649171, + "learning_rate": 4.713114754098361e-05, + "loss": 0.7423, + "step": 186 + }, + { + "epoch": 0.4594594594594595, + "grad_norm": 1.6332435201318054, + "learning_rate": 4.70856102003643e-05, + "loss": 0.7035, + "step": 187 + }, + { + "epoch": 0.4619164619164619, + "grad_norm": 0.4178581553378913, + "learning_rate": 4.7040072859744995e-05, + "loss": 0.7806, + "step": 188 + }, + { + "epoch": 0.4643734643734644, + "grad_norm": 0.37923597472442744, + "learning_rate": 4.6994535519125685e-05, + "loss": 0.7407, + "step": 189 + }, + { + "epoch": 0.4668304668304668, + "grad_norm": 0.4449909952237191, + "learning_rate": 4.6948998178506375e-05, + "loss": 0.7735, + "step": 190 + }, + { + "epoch": 0.4692874692874693, + "grad_norm": 1.3250256301620615, + "learning_rate": 4.690346083788707e-05, + "loss": 0.7303, + "step": 191 + }, + { + "epoch": 0.47174447174447176, + "grad_norm": 0.9645765967219847, + "learning_rate": 4.685792349726776e-05, + "loss": 0.7097, + "step": 192 + }, + { + "epoch": 0.4742014742014742, + "grad_norm": 0.6361558765678473, + "learning_rate": 4.681238615664845e-05, + "loss": 0.7363, + "step": 193 + }, + { + "epoch": 0.47665847665847666, + "grad_norm": 0.45389595024787915, + "learning_rate": 4.676684881602915e-05, + "loss": 0.7055, + "step": 194 + }, + { + "epoch": 0.47911547911547914, + "grad_norm": 0.6258698325325335, + "learning_rate": 4.672131147540984e-05, + "loss": 0.6585, + "step": 195 + }, + { + "epoch": 0.48157248157248156, + "grad_norm": 1.0118318676213243, + "learning_rate": 4.667577413479053e-05, + "loss": 0.7367, + "step": 196 + }, + { + "epoch": 0.48402948402948404, + "grad_norm": 0.4198144007946843, + "learning_rate": 4.6630236794171225e-05, + "loss": 0.6616, + "step": 197 + }, + { + "epoch": 0.4864864864864865, + "grad_norm": 0.5913511667595013, + "learning_rate": 4.6584699453551915e-05, + "loss": 0.7845, + "step": 198 + }, + { + "epoch": 0.48894348894348894, + "grad_norm": 0.33611220980462847, + "learning_rate": 4.6539162112932605e-05, + "loss": 0.7603, + "step": 199 + }, + { + "epoch": 0.4914004914004914, + "grad_norm": 0.5711542431602112, + "learning_rate": 4.64936247723133e-05, + "loss": 0.7322, + "step": 200 + }, + { + "epoch": 0.49385749385749383, + "grad_norm": 0.3189868792840459, + "learning_rate": 4.644808743169399e-05, + "loss": 0.6661, + "step": 201 + }, + { + "epoch": 0.4963144963144963, + "grad_norm": 0.4826389836871673, + "learning_rate": 4.640255009107468e-05, + "loss": 0.6749, + "step": 202 + }, + { + "epoch": 0.4987714987714988, + "grad_norm": 0.40951703225470715, + "learning_rate": 4.635701275045538e-05, + "loss": 0.7387, + "step": 203 + }, + { + "epoch": 0.5012285012285013, + "grad_norm": 0.38375829145246065, + "learning_rate": 4.631147540983607e-05, + "loss": 0.7027, + "step": 204 + }, + { + "epoch": 0.5036855036855037, + "grad_norm": 0.4733049578471896, + "learning_rate": 4.626593806921676e-05, + "loss": 0.7509, + "step": 205 + }, + { + "epoch": 0.5061425061425061, + "grad_norm": 0.3474159718643396, + "learning_rate": 4.622040072859745e-05, + "loss": 0.7367, + "step": 206 + }, + { + "epoch": 0.5085995085995086, + "grad_norm": 0.48857281066114416, + "learning_rate": 4.6174863387978145e-05, + "loss": 0.8525, + "step": 207 + }, + { + "epoch": 0.5110565110565111, + "grad_norm": 0.38214808096990427, + "learning_rate": 4.6129326047358835e-05, + "loss": 0.7906, + "step": 208 + }, + { + "epoch": 0.5135135135135135, + "grad_norm": 0.33815932263073856, + "learning_rate": 4.6083788706739525e-05, + "loss": 0.7306, + "step": 209 + }, + { + "epoch": 0.515970515970516, + "grad_norm": 0.4339469943504887, + "learning_rate": 4.603825136612022e-05, + "loss": 0.8031, + "step": 210 + }, + { + "epoch": 0.5184275184275184, + "grad_norm": 0.3911806777997916, + "learning_rate": 4.599271402550091e-05, + "loss": 0.6236, + "step": 211 + }, + { + "epoch": 0.5208845208845209, + "grad_norm": 0.4169040746627703, + "learning_rate": 4.59471766848816e-05, + "loss": 0.6954, + "step": 212 + }, + { + "epoch": 0.5233415233415234, + "grad_norm": 0.409930981249451, + "learning_rate": 4.59016393442623e-05, + "loss": 0.6972, + "step": 213 + }, + { + "epoch": 0.5257985257985258, + "grad_norm": 0.3662077296397301, + "learning_rate": 4.585610200364299e-05, + "loss": 0.7205, + "step": 214 + }, + { + "epoch": 0.5282555282555282, + "grad_norm": 0.3999793098185867, + "learning_rate": 4.581056466302368e-05, + "loss": 0.7142, + "step": 215 + }, + { + "epoch": 0.5307125307125307, + "grad_norm": 0.33426678861834175, + "learning_rate": 4.5765027322404376e-05, + "loss": 0.7806, + "step": 216 + }, + { + "epoch": 0.5331695331695332, + "grad_norm": 0.2920950465438566, + "learning_rate": 4.5719489981785066e-05, + "loss": 0.5735, + "step": 217 + }, + { + "epoch": 0.5356265356265356, + "grad_norm": 0.4387714217174655, + "learning_rate": 4.5673952641165756e-05, + "loss": 0.7661, + "step": 218 + }, + { + "epoch": 0.538083538083538, + "grad_norm": 0.40724721414199005, + "learning_rate": 4.562841530054645e-05, + "loss": 0.7578, + "step": 219 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 0.4361008788632283, + "learning_rate": 4.558287795992714e-05, + "loss": 0.6755, + "step": 220 + }, + { + "epoch": 0.542997542997543, + "grad_norm": 0.4246249810597821, + "learning_rate": 4.553734061930783e-05, + "loss": 0.7546, + "step": 221 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 0.36118319320850206, + "learning_rate": 4.549180327868853e-05, + "loss": 0.7669, + "step": 222 + }, + { + "epoch": 0.547911547911548, + "grad_norm": 0.908289119148723, + "learning_rate": 4.544626593806922e-05, + "loss": 0.7135, + "step": 223 + }, + { + "epoch": 0.5503685503685504, + "grad_norm": 0.39602734595220085, + "learning_rate": 4.540072859744991e-05, + "loss": 0.749, + "step": 224 + }, + { + "epoch": 0.5528255528255528, + "grad_norm": 0.5078448020996696, + "learning_rate": 4.5355191256830606e-05, + "loss": 0.6208, + "step": 225 + }, + { + "epoch": 0.5552825552825553, + "grad_norm": 0.3443372372601607, + "learning_rate": 4.5309653916211296e-05, + "loss": 0.7046, + "step": 226 + }, + { + "epoch": 0.5577395577395577, + "grad_norm": 0.4525893747493054, + "learning_rate": 4.5264116575591986e-05, + "loss": 0.7592, + "step": 227 + }, + { + "epoch": 0.5601965601965602, + "grad_norm": 0.40243874841518706, + "learning_rate": 4.521857923497268e-05, + "loss": 0.8445, + "step": 228 + }, + { + "epoch": 0.5626535626535627, + "grad_norm": 0.35161294551869515, + "learning_rate": 4.517304189435337e-05, + "loss": 0.677, + "step": 229 + }, + { + "epoch": 0.5651105651105651, + "grad_norm": 0.41535550493065193, + "learning_rate": 4.512750455373406e-05, + "loss": 0.7478, + "step": 230 + }, + { + "epoch": 0.5675675675675675, + "grad_norm": 0.4226366849862933, + "learning_rate": 4.508196721311476e-05, + "loss": 0.745, + "step": 231 + }, + { + "epoch": 0.5700245700245701, + "grad_norm": 0.3673983419967179, + "learning_rate": 4.503642987249545e-05, + "loss": 0.7015, + "step": 232 + }, + { + "epoch": 0.5724815724815725, + "grad_norm": 0.38024111457034476, + "learning_rate": 4.499089253187614e-05, + "loss": 0.7877, + "step": 233 + }, + { + "epoch": 0.5749385749385749, + "grad_norm": 0.38382167053979005, + "learning_rate": 4.494535519125683e-05, + "loss": 0.6943, + "step": 234 + }, + { + "epoch": 0.5773955773955773, + "grad_norm": 0.3773460766513446, + "learning_rate": 4.4899817850637526e-05, + "loss": 0.7944, + "step": 235 + }, + { + "epoch": 0.5798525798525799, + "grad_norm": 0.4206436428227826, + "learning_rate": 4.4854280510018216e-05, + "loss": 0.6814, + "step": 236 + }, + { + "epoch": 0.5823095823095823, + "grad_norm": 6.225234570790709, + "learning_rate": 4.4808743169398906e-05, + "loss": 0.7907, + "step": 237 + }, + { + "epoch": 0.5847665847665847, + "grad_norm": 0.4921907401337786, + "learning_rate": 4.47632058287796e-05, + "loss": 0.6665, + "step": 238 + }, + { + "epoch": 0.5872235872235873, + "grad_norm": 0.48327648449237093, + "learning_rate": 4.471766848816029e-05, + "loss": 0.7715, + "step": 239 + }, + { + "epoch": 0.5896805896805897, + "grad_norm": 0.4791973859907425, + "learning_rate": 4.467213114754098e-05, + "loss": 0.6644, + "step": 240 + }, + { + "epoch": 0.5921375921375921, + "grad_norm": 0.5219036090133962, + "learning_rate": 4.462659380692168e-05, + "loss": 0.8049, + "step": 241 + }, + { + "epoch": 0.5945945945945946, + "grad_norm": 0.5456422166867602, + "learning_rate": 4.458105646630237e-05, + "loss": 0.7501, + "step": 242 + }, + { + "epoch": 0.597051597051597, + "grad_norm": 0.42200513727398753, + "learning_rate": 4.453551912568306e-05, + "loss": 0.887, + "step": 243 + }, + { + "epoch": 0.5995085995085995, + "grad_norm": 0.4322560276672431, + "learning_rate": 4.4489981785063757e-05, + "loss": 0.7695, + "step": 244 + }, + { + "epoch": 0.601965601965602, + "grad_norm": 0.6813701089189296, + "learning_rate": 4.4444444444444447e-05, + "loss": 0.8039, + "step": 245 + }, + { + "epoch": 0.6044226044226044, + "grad_norm": 0.34727875514808987, + "learning_rate": 4.4398907103825137e-05, + "loss": 0.6736, + "step": 246 + }, + { + "epoch": 0.6068796068796068, + "grad_norm": 0.5097357043993563, + "learning_rate": 4.435336976320583e-05, + "loss": 0.7733, + "step": 247 + }, + { + "epoch": 0.6093366093366094, + "grad_norm": 0.3917452125453462, + "learning_rate": 4.430783242258652e-05, + "loss": 0.6345, + "step": 248 + }, + { + "epoch": 0.6117936117936118, + "grad_norm": 0.3886971533793202, + "learning_rate": 4.426229508196721e-05, + "loss": 0.6465, + "step": 249 + }, + { + "epoch": 0.6142506142506142, + "grad_norm": 0.42563955199502573, + "learning_rate": 4.421675774134791e-05, + "loss": 0.6499, + "step": 250 + }, + { + "epoch": 0.6167076167076168, + "grad_norm": 0.34063379000466826, + "learning_rate": 4.41712204007286e-05, + "loss": 0.6963, + "step": 251 + }, + { + "epoch": 0.6191646191646192, + "grad_norm": 0.4724839536346018, + "learning_rate": 4.412568306010929e-05, + "loss": 0.7917, + "step": 252 + }, + { + "epoch": 0.6216216216216216, + "grad_norm": 0.43466297048497554, + "learning_rate": 4.408014571948999e-05, + "loss": 0.7627, + "step": 253 + }, + { + "epoch": 0.6240786240786241, + "grad_norm": 0.35263290647277007, + "learning_rate": 4.403460837887068e-05, + "loss": 0.624, + "step": 254 + }, + { + "epoch": 0.6265356265356266, + "grad_norm": 0.41771099490666685, + "learning_rate": 4.398907103825137e-05, + "loss": 0.6774, + "step": 255 + }, + { + "epoch": 0.628992628992629, + "grad_norm": 0.45045654101278304, + "learning_rate": 4.3943533697632064e-05, + "loss": 0.6706, + "step": 256 + }, + { + "epoch": 0.6314496314496314, + "grad_norm": 0.4054524028616639, + "learning_rate": 4.3897996357012754e-05, + "loss": 0.6856, + "step": 257 + }, + { + "epoch": 0.6339066339066339, + "grad_norm": 0.4199071567113292, + "learning_rate": 4.3852459016393444e-05, + "loss": 0.7385, + "step": 258 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 0.4359170619851533, + "learning_rate": 4.380692167577414e-05, + "loss": 0.7095, + "step": 259 + }, + { + "epoch": 0.6388206388206388, + "grad_norm": 0.3850739753964197, + "learning_rate": 4.376138433515483e-05, + "loss": 0.6958, + "step": 260 + }, + { + "epoch": 0.6412776412776413, + "grad_norm": 0.4890138604791565, + "learning_rate": 4.371584699453552e-05, + "loss": 0.7211, + "step": 261 + }, + { + "epoch": 0.6437346437346437, + "grad_norm": 0.38398720286811694, + "learning_rate": 4.367030965391621e-05, + "loss": 0.8539, + "step": 262 + }, + { + "epoch": 0.6461916461916462, + "grad_norm": 0.5242499237496944, + "learning_rate": 4.362477231329691e-05, + "loss": 0.7239, + "step": 263 + }, + { + "epoch": 0.6486486486486487, + "grad_norm": 0.6576624559407754, + "learning_rate": 4.35792349726776e-05, + "loss": 0.6224, + "step": 264 + }, + { + "epoch": 0.6511056511056511, + "grad_norm": 0.48964094334247854, + "learning_rate": 4.353369763205829e-05, + "loss": 0.7645, + "step": 265 + }, + { + "epoch": 0.6535626535626535, + "grad_norm": 0.4674980129473235, + "learning_rate": 4.3488160291438984e-05, + "loss": 0.768, + "step": 266 + }, + { + "epoch": 0.6560196560196561, + "grad_norm": 0.4434022776784131, + "learning_rate": 4.3442622950819674e-05, + "loss": 0.7459, + "step": 267 + }, + { + "epoch": 0.6584766584766585, + "grad_norm": 0.538941168132682, + "learning_rate": 4.3397085610200364e-05, + "loss": 0.6968, + "step": 268 + }, + { + "epoch": 0.6609336609336609, + "grad_norm": 0.3624467815465402, + "learning_rate": 4.335154826958106e-05, + "loss": 0.6096, + "step": 269 + }, + { + "epoch": 0.6633906633906634, + "grad_norm": 0.5599889013533942, + "learning_rate": 4.330601092896175e-05, + "loss": 0.7658, + "step": 270 + }, + { + "epoch": 0.6658476658476659, + "grad_norm": 0.690440401509493, + "learning_rate": 4.326047358834244e-05, + "loss": 0.7877, + "step": 271 + }, + { + "epoch": 0.6683046683046683, + "grad_norm": 0.3686357695682294, + "learning_rate": 4.321493624772314e-05, + "loss": 0.6895, + "step": 272 + }, + { + "epoch": 0.6707616707616708, + "grad_norm": 0.545620565235858, + "learning_rate": 4.316939890710383e-05, + "loss": 0.69, + "step": 273 + }, + { + "epoch": 0.6732186732186732, + "grad_norm": 0.4204580863650939, + "learning_rate": 4.312386156648452e-05, + "loss": 0.6768, + "step": 274 + }, + { + "epoch": 0.6756756756756757, + "grad_norm": 0.47301510227399846, + "learning_rate": 4.3078324225865214e-05, + "loss": 0.6414, + "step": 275 + }, + { + "epoch": 0.6781326781326781, + "grad_norm": 0.39120871861762363, + "learning_rate": 4.3032786885245904e-05, + "loss": 0.6393, + "step": 276 + }, + { + "epoch": 0.6805896805896806, + "grad_norm": 0.5663194594331895, + "learning_rate": 4.2987249544626594e-05, + "loss": 0.7721, + "step": 277 + }, + { + "epoch": 0.683046683046683, + "grad_norm": 0.5578558026406056, + "learning_rate": 4.294171220400729e-05, + "loss": 0.679, + "step": 278 + }, + { + "epoch": 0.6855036855036855, + "grad_norm": 0.4785935193977311, + "learning_rate": 4.289617486338798e-05, + "loss": 0.8548, + "step": 279 + }, + { + "epoch": 0.687960687960688, + "grad_norm": 0.7344196795158664, + "learning_rate": 4.285063752276867e-05, + "loss": 0.7421, + "step": 280 + }, + { + "epoch": 0.6904176904176904, + "grad_norm": 0.8908899764975586, + "learning_rate": 4.280510018214937e-05, + "loss": 0.7894, + "step": 281 + }, + { + "epoch": 0.6928746928746928, + "grad_norm": 0.6287419956030045, + "learning_rate": 4.275956284153005e-05, + "loss": 0.6785, + "step": 282 + }, + { + "epoch": 0.6953316953316954, + "grad_norm": 0.5149422483348357, + "learning_rate": 4.271402550091075e-05, + "loss": 0.7382, + "step": 283 + }, + { + "epoch": 0.6977886977886978, + "grad_norm": 0.5454860373961983, + "learning_rate": 4.2668488160291445e-05, + "loss": 0.7274, + "step": 284 + }, + { + "epoch": 0.7002457002457002, + "grad_norm": 0.5477624009062736, + "learning_rate": 4.262295081967213e-05, + "loss": 0.7058, + "step": 285 + }, + { + "epoch": 0.7027027027027027, + "grad_norm": 0.5596039899044134, + "learning_rate": 4.2577413479052825e-05, + "loss": 0.8178, + "step": 286 + }, + { + "epoch": 0.7051597051597052, + "grad_norm": 0.5919332487502931, + "learning_rate": 4.253187613843352e-05, + "loss": 0.69, + "step": 287 + }, + { + "epoch": 0.7076167076167076, + "grad_norm": 0.5283900337631473, + "learning_rate": 4.248633879781421e-05, + "loss": 0.8171, + "step": 288 + }, + { + "epoch": 0.7100737100737101, + "grad_norm": 0.7692525624223621, + "learning_rate": 4.24408014571949e-05, + "loss": 0.7239, + "step": 289 + }, + { + "epoch": 0.7125307125307125, + "grad_norm": 0.3863360498506725, + "learning_rate": 4.23952641165756e-05, + "loss": 0.5576, + "step": 290 + }, + { + "epoch": 0.714987714987715, + "grad_norm": 0.7223883296775482, + "learning_rate": 4.234972677595629e-05, + "loss": 0.6975, + "step": 291 + }, + { + "epoch": 0.7174447174447175, + "grad_norm": 1.7771798036626734, + "learning_rate": 4.230418943533698e-05, + "loss": 0.6565, + "step": 292 + }, + { + "epoch": 0.7199017199017199, + "grad_norm": 0.6430310979475962, + "learning_rate": 4.225865209471767e-05, + "loss": 0.6675, + "step": 293 + }, + { + "epoch": 0.7223587223587223, + "grad_norm": 0.3794537639280509, + "learning_rate": 4.2213114754098365e-05, + "loss": 0.7642, + "step": 294 + }, + { + "epoch": 0.7248157248157249, + "grad_norm": 0.60943195656342, + "learning_rate": 4.2167577413479055e-05, + "loss": 0.7247, + "step": 295 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 0.37289390653274224, + "learning_rate": 4.2122040072859745e-05, + "loss": 0.6255, + "step": 296 + }, + { + "epoch": 0.7297297297297297, + "grad_norm": 0.4532436953171903, + "learning_rate": 4.207650273224044e-05, + "loss": 0.7069, + "step": 297 + }, + { + "epoch": 0.7321867321867321, + "grad_norm": 0.40650887131809266, + "learning_rate": 4.203096539162113e-05, + "loss": 0.7403, + "step": 298 + }, + { + "epoch": 0.7346437346437347, + "grad_norm": 0.5965618573882557, + "learning_rate": 4.198542805100182e-05, + "loss": 0.7413, + "step": 299 + }, + { + "epoch": 0.7371007371007371, + "grad_norm": 0.35937646145739954, + "learning_rate": 4.193989071038252e-05, + "loss": 0.7104, + "step": 300 + }, + { + "epoch": 0.7395577395577395, + "grad_norm": 0.45967984584408983, + "learning_rate": 4.189435336976321e-05, + "loss": 0.8102, + "step": 301 + }, + { + "epoch": 0.742014742014742, + "grad_norm": 0.4885635149330037, + "learning_rate": 4.18488160291439e-05, + "loss": 0.7302, + "step": 302 + }, + { + "epoch": 0.7444717444717445, + "grad_norm": 0.3152058972635706, + "learning_rate": 4.1803278688524595e-05, + "loss": 0.6891, + "step": 303 + }, + { + "epoch": 0.7469287469287469, + "grad_norm": 0.4161834589482244, + "learning_rate": 4.1757741347905285e-05, + "loss": 0.6623, + "step": 304 + }, + { + "epoch": 0.7493857493857494, + "grad_norm": 0.36473148815614853, + "learning_rate": 4.1712204007285975e-05, + "loss": 0.7902, + "step": 305 + }, + { + "epoch": 0.7518427518427518, + "grad_norm": 0.4147403697677368, + "learning_rate": 4.166666666666667e-05, + "loss": 0.7875, + "step": 306 + }, + { + "epoch": 0.7542997542997543, + "grad_norm": 0.4077917564117238, + "learning_rate": 4.162112932604736e-05, + "loss": 0.7275, + "step": 307 + }, + { + "epoch": 0.7567567567567568, + "grad_norm": 0.4060094467217255, + "learning_rate": 4.157559198542805e-05, + "loss": 0.7783, + "step": 308 + }, + { + "epoch": 0.7592137592137592, + "grad_norm": 0.4130103975738772, + "learning_rate": 4.153005464480875e-05, + "loss": 0.6847, + "step": 309 + }, + { + "epoch": 0.7616707616707616, + "grad_norm": 0.3681636230585068, + "learning_rate": 4.148451730418943e-05, + "loss": 0.7531, + "step": 310 + }, + { + "epoch": 0.7641277641277642, + "grad_norm": 0.3827065341158274, + "learning_rate": 4.143897996357013e-05, + "loss": 0.7141, + "step": 311 + }, + { + "epoch": 0.7665847665847666, + "grad_norm": 0.29238085362688543, + "learning_rate": 4.1393442622950826e-05, + "loss": 0.6273, + "step": 312 + }, + { + "epoch": 0.769041769041769, + "grad_norm": 0.33937884647496835, + "learning_rate": 4.134790528233151e-05, + "loss": 0.6489, + "step": 313 + }, + { + "epoch": 0.7714987714987716, + "grad_norm": 0.3015348898927694, + "learning_rate": 4.1302367941712206e-05, + "loss": 0.5207, + "step": 314 + }, + { + "epoch": 0.773955773955774, + "grad_norm": 0.35134100703007254, + "learning_rate": 4.12568306010929e-05, + "loss": 0.7576, + "step": 315 + }, + { + "epoch": 0.7764127764127764, + "grad_norm": 0.31798902115911587, + "learning_rate": 4.1211293260473586e-05, + "loss": 0.6203, + "step": 316 + }, + { + "epoch": 0.7788697788697788, + "grad_norm": 0.35299888238401994, + "learning_rate": 4.116575591985428e-05, + "loss": 0.6875, + "step": 317 + }, + { + "epoch": 0.7813267813267813, + "grad_norm": 0.3525914582079822, + "learning_rate": 4.112021857923498e-05, + "loss": 0.6804, + "step": 318 + }, + { + "epoch": 0.7837837837837838, + "grad_norm": 0.3006720346358963, + "learning_rate": 4.107468123861566e-05, + "loss": 0.5346, + "step": 319 + }, + { + "epoch": 0.7862407862407862, + "grad_norm": 1.5252533561825474, + "learning_rate": 4.102914389799636e-05, + "loss": 0.7018, + "step": 320 + }, + { + "epoch": 0.7886977886977887, + "grad_norm": 0.32274770353739635, + "learning_rate": 4.098360655737705e-05, + "loss": 0.614, + "step": 321 + }, + { + "epoch": 0.7911547911547911, + "grad_norm": 0.32985165709996966, + "learning_rate": 4.093806921675774e-05, + "loss": 0.6607, + "step": 322 + }, + { + "epoch": 0.7936117936117936, + "grad_norm": 0.30025432983818734, + "learning_rate": 4.0892531876138436e-05, + "loss": 0.6357, + "step": 323 + }, + { + "epoch": 0.7960687960687961, + "grad_norm": 0.3049594116455463, + "learning_rate": 4.0846994535519126e-05, + "loss": 0.5822, + "step": 324 + }, + { + "epoch": 0.7985257985257985, + "grad_norm": 0.3629904661955952, + "learning_rate": 4.080145719489982e-05, + "loss": 0.6978, + "step": 325 + }, + { + "epoch": 0.800982800982801, + "grad_norm": 0.9634279527349047, + "learning_rate": 4.075591985428051e-05, + "loss": 0.8865, + "step": 326 + }, + { + "epoch": 0.8034398034398035, + "grad_norm": 0.45193045970841783, + "learning_rate": 4.07103825136612e-05, + "loss": 0.8065, + "step": 327 + }, + { + "epoch": 0.8058968058968059, + "grad_norm": 0.3177464973567778, + "learning_rate": 4.06648451730419e-05, + "loss": 0.711, + "step": 328 + }, + { + "epoch": 0.8083538083538083, + "grad_norm": 0.5049266007665172, + "learning_rate": 4.061930783242259e-05, + "loss": 0.788, + "step": 329 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 0.44936451115710957, + "learning_rate": 4.057377049180328e-05, + "loss": 0.6603, + "step": 330 + }, + { + "epoch": 0.8132678132678133, + "grad_norm": 0.40221025853337433, + "learning_rate": 4.0528233151183976e-05, + "loss": 0.6261, + "step": 331 + }, + { + "epoch": 0.8157248157248157, + "grad_norm": 0.38900176002138404, + "learning_rate": 4.0482695810564666e-05, + "loss": 0.6544, + "step": 332 + }, + { + "epoch": 0.8181818181818182, + "grad_norm": 0.453208732932394, + "learning_rate": 4.0437158469945356e-05, + "loss": 0.6353, + "step": 333 + }, + { + "epoch": 0.8206388206388207, + "grad_norm": 0.3681796156494085, + "learning_rate": 4.039162112932605e-05, + "loss": 0.6836, + "step": 334 + }, + { + "epoch": 0.8230958230958231, + "grad_norm": 0.468685040057859, + "learning_rate": 4.034608378870674e-05, + "loss": 0.7046, + "step": 335 + }, + { + "epoch": 0.8255528255528255, + "grad_norm": 0.43444130480919046, + "learning_rate": 4.030054644808743e-05, + "loss": 0.6659, + "step": 336 + }, + { + "epoch": 0.828009828009828, + "grad_norm": 0.3619248405794401, + "learning_rate": 4.025500910746813e-05, + "loss": 0.6417, + "step": 337 + }, + { + "epoch": 0.8304668304668305, + "grad_norm": 0.410561658075711, + "learning_rate": 4.020947176684881e-05, + "loss": 0.659, + "step": 338 + }, + { + "epoch": 0.8329238329238329, + "grad_norm": 0.349661211154494, + "learning_rate": 4.016393442622951e-05, + "loss": 0.7009, + "step": 339 + }, + { + "epoch": 0.8353808353808354, + "grad_norm": 0.45025633913904883, + "learning_rate": 4.0118397085610207e-05, + "loss": 0.7118, + "step": 340 + }, + { + "epoch": 0.8378378378378378, + "grad_norm": 0.3491439279038829, + "learning_rate": 4.007285974499089e-05, + "loss": 0.7326, + "step": 341 + }, + { + "epoch": 0.8402948402948403, + "grad_norm": 0.37516636206626935, + "learning_rate": 4.0027322404371587e-05, + "loss": 0.7005, + "step": 342 + }, + { + "epoch": 0.8427518427518428, + "grad_norm": 0.3135717435105698, + "learning_rate": 3.998178506375228e-05, + "loss": 0.6751, + "step": 343 + }, + { + "epoch": 0.8452088452088452, + "grad_norm": 0.45748071875834095, + "learning_rate": 3.9936247723132967e-05, + "loss": 0.7238, + "step": 344 + }, + { + "epoch": 0.8476658476658476, + "grad_norm": 0.43936046038898285, + "learning_rate": 3.989071038251366e-05, + "loss": 0.7568, + "step": 345 + }, + { + "epoch": 0.8501228501228502, + "grad_norm": 0.38829296038456096, + "learning_rate": 3.984517304189436e-05, + "loss": 0.6835, + "step": 346 + }, + { + "epoch": 0.8525798525798526, + "grad_norm": 0.45261007109171814, + "learning_rate": 3.979963570127504e-05, + "loss": 0.7626, + "step": 347 + }, + { + "epoch": 0.855036855036855, + "grad_norm": 0.3469325577394658, + "learning_rate": 3.975409836065574e-05, + "loss": 0.7497, + "step": 348 + }, + { + "epoch": 0.8574938574938575, + "grad_norm": 0.5400301615988978, + "learning_rate": 3.970856102003643e-05, + "loss": 0.8051, + "step": 349 + }, + { + "epoch": 0.85995085995086, + "grad_norm": 0.4001992360407668, + "learning_rate": 3.966302367941712e-05, + "loss": 0.7536, + "step": 350 + }, + { + "epoch": 0.8624078624078624, + "grad_norm": 0.3724180671895729, + "learning_rate": 3.961748633879782e-05, + "loss": 0.6238, + "step": 351 + }, + { + "epoch": 0.8648648648648649, + "grad_norm": 0.386974931071893, + "learning_rate": 3.957194899817851e-05, + "loss": 0.6876, + "step": 352 + }, + { + "epoch": 0.8673218673218673, + "grad_norm": 0.372863116265662, + "learning_rate": 3.95264116575592e-05, + "loss": 0.5849, + "step": 353 + }, + { + "epoch": 0.8697788697788698, + "grad_norm": 0.33795820672046467, + "learning_rate": 3.9480874316939894e-05, + "loss": 0.5205, + "step": 354 + }, + { + "epoch": 0.8722358722358723, + "grad_norm": 0.40729933902725135, + "learning_rate": 3.9435336976320584e-05, + "loss": 0.7655, + "step": 355 + }, + { + "epoch": 0.8746928746928747, + "grad_norm": 0.30755968744467366, + "learning_rate": 3.9389799635701274e-05, + "loss": 0.6263, + "step": 356 + }, + { + "epoch": 0.8771498771498771, + "grad_norm": 0.37093708872360476, + "learning_rate": 3.934426229508197e-05, + "loss": 0.7129, + "step": 357 + }, + { + "epoch": 0.8796068796068796, + "grad_norm": 0.37633511734635255, + "learning_rate": 3.929872495446266e-05, + "loss": 0.5872, + "step": 358 + }, + { + "epoch": 0.8820638820638821, + "grad_norm": 0.9614590556739387, + "learning_rate": 3.925318761384335e-05, + "loss": 0.7089, + "step": 359 + }, + { + "epoch": 0.8845208845208845, + "grad_norm": 0.36669325077055215, + "learning_rate": 3.920765027322405e-05, + "loss": 0.5196, + "step": 360 + }, + { + "epoch": 0.8869778869778869, + "grad_norm": 0.36063038368340206, + "learning_rate": 3.916211293260474e-05, + "loss": 0.7037, + "step": 361 + }, + { + "epoch": 0.8894348894348895, + "grad_norm": 0.3844550677877335, + "learning_rate": 3.9116575591985434e-05, + "loss": 0.6472, + "step": 362 + }, + { + "epoch": 0.8918918918918919, + "grad_norm": 0.36208926990085244, + "learning_rate": 3.9071038251366124e-05, + "loss": 0.6393, + "step": 363 + }, + { + "epoch": 0.8943488943488943, + "grad_norm": 0.36998305778442386, + "learning_rate": 3.9025500910746814e-05, + "loss": 0.7667, + "step": 364 + }, + { + "epoch": 0.8968058968058968, + "grad_norm": 0.3447294134836953, + "learning_rate": 3.897996357012751e-05, + "loss": 0.605, + "step": 365 + }, + { + "epoch": 0.8992628992628993, + "grad_norm": 0.36709184015795876, + "learning_rate": 3.89344262295082e-05, + "loss": 0.6642, + "step": 366 + }, + { + "epoch": 0.9017199017199017, + "grad_norm": 0.3486298961479053, + "learning_rate": 3.888888888888889e-05, + "loss": 0.6621, + "step": 367 + }, + { + "epoch": 0.9041769041769042, + "grad_norm": 0.4328843991656747, + "learning_rate": 3.884335154826959e-05, + "loss": 0.6797, + "step": 368 + }, + { + "epoch": 0.9066339066339066, + "grad_norm": 0.35617103914532294, + "learning_rate": 3.879781420765027e-05, + "loss": 0.6853, + "step": 369 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.48563000772634657, + "learning_rate": 3.875227686703097e-05, + "loss": 0.6981, + "step": 370 + }, + { + "epoch": 0.9115479115479116, + "grad_norm": 0.3887375137301516, + "learning_rate": 3.8706739526411664e-05, + "loss": 0.603, + "step": 371 + }, + { + "epoch": 0.914004914004914, + "grad_norm": 0.439470097514328, + "learning_rate": 3.866120218579235e-05, + "loss": 0.7077, + "step": 372 + }, + { + "epoch": 0.9164619164619164, + "grad_norm": 0.3403160171473462, + "learning_rate": 3.8615664845173044e-05, + "loss": 0.586, + "step": 373 + }, + { + "epoch": 0.918918918918919, + "grad_norm": 0.42760829158750546, + "learning_rate": 3.857012750455374e-05, + "loss": 0.7303, + "step": 374 + }, + { + "epoch": 0.9213759213759214, + "grad_norm": 0.36489243280535705, + "learning_rate": 3.8524590163934424e-05, + "loss": 0.6256, + "step": 375 + }, + { + "epoch": 0.9238329238329238, + "grad_norm": 0.3808217161262314, + "learning_rate": 3.847905282331512e-05, + "loss": 0.7059, + "step": 376 + }, + { + "epoch": 0.9262899262899262, + "grad_norm": 0.34013903969336157, + "learning_rate": 3.843351548269581e-05, + "loss": 0.7301, + "step": 377 + }, + { + "epoch": 0.9287469287469288, + "grad_norm": 1.1463809470744701, + "learning_rate": 3.83879781420765e-05, + "loss": 0.6969, + "step": 378 + }, + { + "epoch": 0.9312039312039312, + "grad_norm": 0.4235667833129601, + "learning_rate": 3.83424408014572e-05, + "loss": 0.6473, + "step": 379 + }, + { + "epoch": 0.9336609336609336, + "grad_norm": 0.79876765490425, + "learning_rate": 3.829690346083789e-05, + "loss": 0.6183, + "step": 380 + }, + { + "epoch": 0.9361179361179361, + "grad_norm": 0.49555963725341723, + "learning_rate": 3.825136612021858e-05, + "loss": 0.8044, + "step": 381 + }, + { + "epoch": 0.9385749385749386, + "grad_norm": 0.3428503165110703, + "learning_rate": 3.8205828779599275e-05, + "loss": 0.5995, + "step": 382 + }, + { + "epoch": 0.941031941031941, + "grad_norm": 0.44593307884321404, + "learning_rate": 3.8160291438979965e-05, + "loss": 0.7151, + "step": 383 + }, + { + "epoch": 0.9434889434889435, + "grad_norm": 0.37468176709006323, + "learning_rate": 3.8114754098360655e-05, + "loss": 0.7905, + "step": 384 + }, + { + "epoch": 0.9459459459459459, + "grad_norm": 0.5722646888774676, + "learning_rate": 3.806921675774135e-05, + "loss": 0.731, + "step": 385 + }, + { + "epoch": 0.9484029484029484, + "grad_norm": 1.8799684973155986, + "learning_rate": 3.802367941712204e-05, + "loss": 0.7714, + "step": 386 + }, + { + "epoch": 0.9508599508599509, + "grad_norm": 0.5172547101235551, + "learning_rate": 3.797814207650273e-05, + "loss": 0.6399, + "step": 387 + }, + { + "epoch": 0.9533169533169533, + "grad_norm": 0.4418711377815284, + "learning_rate": 3.793260473588343e-05, + "loss": 0.6997, + "step": 388 + }, + { + "epoch": 0.9557739557739557, + "grad_norm": 0.5285652919128196, + "learning_rate": 3.788706739526412e-05, + "loss": 0.704, + "step": 389 + }, + { + "epoch": 0.9582309582309583, + "grad_norm": 0.45024081362204066, + "learning_rate": 3.784153005464481e-05, + "loss": 0.7121, + "step": 390 + }, + { + "epoch": 0.9606879606879607, + "grad_norm": 0.4069199989712789, + "learning_rate": 3.7795992714025505e-05, + "loss": 0.6408, + "step": 391 + }, + { + "epoch": 0.9631449631449631, + "grad_norm": 0.4856083258958585, + "learning_rate": 3.7750455373406195e-05, + "loss": 0.6723, + "step": 392 + }, + { + "epoch": 0.9656019656019657, + "grad_norm": 0.3584054750131388, + "learning_rate": 3.7704918032786885e-05, + "loss": 0.619, + "step": 393 + }, + { + "epoch": 0.9680589680589681, + "grad_norm": 0.46503131404325265, + "learning_rate": 3.765938069216758e-05, + "loss": 0.7499, + "step": 394 + }, + { + "epoch": 0.9705159705159705, + "grad_norm": 0.3568325978396338, + "learning_rate": 3.761384335154827e-05, + "loss": 0.6153, + "step": 395 + }, + { + "epoch": 0.972972972972973, + "grad_norm": 0.35548746505907636, + "learning_rate": 3.756830601092896e-05, + "loss": 0.6644, + "step": 396 + }, + { + "epoch": 0.9754299754299754, + "grad_norm": 3.732099146967768, + "learning_rate": 3.752276867030965e-05, + "loss": 0.7677, + "step": 397 + }, + { + "epoch": 0.9778869778869779, + "grad_norm": 0.6417926585769745, + "learning_rate": 3.747723132969035e-05, + "loss": 0.713, + "step": 398 + }, + { + "epoch": 0.9803439803439803, + "grad_norm": 0.3428338885926231, + "learning_rate": 3.7431693989071045e-05, + "loss": 0.7057, + "step": 399 + }, + { + "epoch": 0.9828009828009828, + "grad_norm": 0.44136452162974704, + "learning_rate": 3.738615664845173e-05, + "loss": 0.5293, + "step": 400 + }, + { + "epoch": 0.9852579852579852, + "grad_norm": 0.4267208521085863, + "learning_rate": 3.7340619307832425e-05, + "loss": 0.6846, + "step": 401 + }, + { + "epoch": 0.9877149877149877, + "grad_norm": 0.36579693412461944, + "learning_rate": 3.729508196721312e-05, + "loss": 0.7243, + "step": 402 + }, + { + "epoch": 0.9901719901719902, + "grad_norm": 0.46204688211658324, + "learning_rate": 3.7249544626593805e-05, + "loss": 0.68, + "step": 403 + }, + { + "epoch": 0.9926289926289926, + "grad_norm": 0.37956013971155556, + "learning_rate": 3.72040072859745e-05, + "loss": 0.7447, + "step": 404 + }, + { + "epoch": 0.995085995085995, + "grad_norm": 0.3910625026439214, + "learning_rate": 3.71584699453552e-05, + "loss": 0.6197, + "step": 405 + }, + { + "epoch": 0.9975429975429976, + "grad_norm": 0.41783305217284267, + "learning_rate": 3.711293260473588e-05, + "loss": 0.6719, + "step": 406 + }, + { + "epoch": 1.0, + "grad_norm": 0.3611623758486256, + "learning_rate": 3.706739526411658e-05, + "loss": 0.5977, + "step": 407 + }, + { + "epoch": 1.0024570024570025, + "grad_norm": 0.4712316660517998, + "learning_rate": 3.702185792349727e-05, + "loss": 0.6267, + "step": 408 + }, + { + "epoch": 1.0049140049140048, + "grad_norm": 0.44510865147589923, + "learning_rate": 3.697632058287796e-05, + "loss": 0.5723, + "step": 409 + }, + { + "epoch": 1.0073710073710074, + "grad_norm": 0.4897737184802636, + "learning_rate": 3.6930783242258656e-05, + "loss": 0.6133, + "step": 410 + }, + { + "epoch": 1.00982800982801, + "grad_norm": 0.4710019531923247, + "learning_rate": 3.6885245901639346e-05, + "loss": 0.6601, + "step": 411 + }, + { + "epoch": 1.0122850122850122, + "grad_norm": 0.4127476864637772, + "learning_rate": 3.6839708561020036e-05, + "loss": 0.5831, + "step": 412 + }, + { + "epoch": 1.0147420147420148, + "grad_norm": 0.3852466347026918, + "learning_rate": 3.679417122040073e-05, + "loss": 0.6171, + "step": 413 + }, + { + "epoch": 1.0171990171990173, + "grad_norm": 0.35722854453354774, + "learning_rate": 3.674863387978142e-05, + "loss": 0.4941, + "step": 414 + }, + { + "epoch": 1.0196560196560196, + "grad_norm": 0.3477409452059263, + "learning_rate": 3.670309653916211e-05, + "loss": 0.6014, + "step": 415 + }, + { + "epoch": 1.0221130221130221, + "grad_norm": 0.38573394146966594, + "learning_rate": 3.665755919854281e-05, + "loss": 0.5435, + "step": 416 + }, + { + "epoch": 1.0245700245700247, + "grad_norm": 0.3152965022867117, + "learning_rate": 3.66120218579235e-05, + "loss": 0.5363, + "step": 417 + }, + { + "epoch": 1.027027027027027, + "grad_norm": 0.37855487804654653, + "learning_rate": 3.656648451730419e-05, + "loss": 0.6216, + "step": 418 + }, + { + "epoch": 1.0294840294840295, + "grad_norm": 0.3915386797411922, + "learning_rate": 3.6520947176684886e-05, + "loss": 0.6468, + "step": 419 + }, + { + "epoch": 1.031941031941032, + "grad_norm": 0.30903418418917916, + "learning_rate": 3.6475409836065576e-05, + "loss": 0.5918, + "step": 420 + }, + { + "epoch": 1.0343980343980343, + "grad_norm": 18.485814215831798, + "learning_rate": 3.6429872495446266e-05, + "loss": 0.7571, + "step": 421 + }, + { + "epoch": 1.0368550368550369, + "grad_norm": 0.43418803474006623, + "learning_rate": 3.638433515482696e-05, + "loss": 0.4651, + "step": 422 + }, + { + "epoch": 1.0393120393120394, + "grad_norm": 0.4296276366274725, + "learning_rate": 3.633879781420765e-05, + "loss": 0.569, + "step": 423 + }, + { + "epoch": 1.0417690417690417, + "grad_norm": 0.3252040498050024, + "learning_rate": 3.629326047358834e-05, + "loss": 0.5682, + "step": 424 + }, + { + "epoch": 1.0442260442260443, + "grad_norm": 0.5555580641102786, + "learning_rate": 3.624772313296903e-05, + "loss": 0.5685, + "step": 425 + }, + { + "epoch": 1.0466830466830466, + "grad_norm": 0.30439876353558465, + "learning_rate": 3.620218579234973e-05, + "loss": 0.5509, + "step": 426 + }, + { + "epoch": 1.049140049140049, + "grad_norm": 0.5257024496923978, + "learning_rate": 3.615664845173042e-05, + "loss": 0.6175, + "step": 427 + }, + { + "epoch": 1.0515970515970516, + "grad_norm": 0.3924880233071523, + "learning_rate": 3.611111111111111e-05, + "loss": 0.5463, + "step": 428 + }, + { + "epoch": 1.054054054054054, + "grad_norm": 0.3912483665248679, + "learning_rate": 3.6065573770491806e-05, + "loss": 0.5172, + "step": 429 + }, + { + "epoch": 1.0565110565110565, + "grad_norm": 0.35522183054743234, + "learning_rate": 3.6020036429872496e-05, + "loss": 0.548, + "step": 430 + }, + { + "epoch": 1.058968058968059, + "grad_norm": 0.41696382741795146, + "learning_rate": 3.5974499089253186e-05, + "loss": 0.6179, + "step": 431 + }, + { + "epoch": 1.0614250614250613, + "grad_norm": 0.34899632677634346, + "learning_rate": 3.592896174863388e-05, + "loss": 0.5848, + "step": 432 + }, + { + "epoch": 1.0638820638820639, + "grad_norm": 0.29764763902529734, + "learning_rate": 3.588342440801457e-05, + "loss": 0.5065, + "step": 433 + }, + { + "epoch": 1.0663390663390664, + "grad_norm": 0.33789418991474374, + "learning_rate": 3.583788706739526e-05, + "loss": 0.557, + "step": 434 + }, + { + "epoch": 1.0687960687960687, + "grad_norm": 0.3817072319681774, + "learning_rate": 3.579234972677596e-05, + "loss": 0.573, + "step": 435 + }, + { + "epoch": 1.0712530712530712, + "grad_norm": 0.27883801849612727, + "learning_rate": 3.574681238615665e-05, + "loss": 0.4778, + "step": 436 + }, + { + "epoch": 1.0737100737100738, + "grad_norm": 0.3923116193005877, + "learning_rate": 3.570127504553734e-05, + "loss": 0.5919, + "step": 437 + }, + { + "epoch": 1.076167076167076, + "grad_norm": 0.29914831145059495, + "learning_rate": 3.5655737704918037e-05, + "loss": 0.494, + "step": 438 + }, + { + "epoch": 1.0786240786240786, + "grad_norm": 0.31767336538989416, + "learning_rate": 3.5610200364298727e-05, + "loss": 0.6199, + "step": 439 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 0.46913096826211653, + "learning_rate": 3.5564663023679417e-05, + "loss": 0.6955, + "step": 440 + }, + { + "epoch": 1.0835380835380835, + "grad_norm": 0.3675875371319456, + "learning_rate": 3.551912568306011e-05, + "loss": 0.538, + "step": 441 + }, + { + "epoch": 1.085995085995086, + "grad_norm": 0.3330032586684102, + "learning_rate": 3.54735883424408e-05, + "loss": 0.5659, + "step": 442 + }, + { + "epoch": 1.0884520884520885, + "grad_norm": 0.39684518158418425, + "learning_rate": 3.542805100182149e-05, + "loss": 0.4902, + "step": 443 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 0.3370350178101319, + "learning_rate": 3.538251366120219e-05, + "loss": 0.6277, + "step": 444 + }, + { + "epoch": 1.0933660933660934, + "grad_norm": 0.3120031541968653, + "learning_rate": 3.533697632058288e-05, + "loss": 0.5705, + "step": 445 + }, + { + "epoch": 1.095823095823096, + "grad_norm": 0.35804818545314876, + "learning_rate": 3.529143897996357e-05, + "loss": 0.5268, + "step": 446 + }, + { + "epoch": 1.0982800982800982, + "grad_norm": 0.36340510531282566, + "learning_rate": 3.524590163934427e-05, + "loss": 0.5328, + "step": 447 + }, + { + "epoch": 1.1007371007371007, + "grad_norm": 0.3098836614900157, + "learning_rate": 3.520036429872496e-05, + "loss": 0.5773, + "step": 448 + }, + { + "epoch": 1.1031941031941033, + "grad_norm": 0.3135507590572425, + "learning_rate": 3.515482695810565e-05, + "loss": 0.6188, + "step": 449 + }, + { + "epoch": 1.1056511056511056, + "grad_norm": 0.31164002022216103, + "learning_rate": 3.5109289617486344e-05, + "loss": 0.4606, + "step": 450 + }, + { + "epoch": 1.1081081081081081, + "grad_norm": 0.3623278294612082, + "learning_rate": 3.5063752276867034e-05, + "loss": 0.6439, + "step": 451 + }, + { + "epoch": 1.1105651105651106, + "grad_norm": 0.296521047913555, + "learning_rate": 3.5018214936247724e-05, + "loss": 0.4695, + "step": 452 + }, + { + "epoch": 1.113022113022113, + "grad_norm": 0.3483084595473505, + "learning_rate": 3.4972677595628414e-05, + "loss": 0.5615, + "step": 453 + }, + { + "epoch": 1.1154791154791155, + "grad_norm": 0.2860532290662123, + "learning_rate": 3.492714025500911e-05, + "loss": 0.5426, + "step": 454 + }, + { + "epoch": 1.117936117936118, + "grad_norm": 0.35904055869223206, + "learning_rate": 3.48816029143898e-05, + "loss": 0.6372, + "step": 455 + }, + { + "epoch": 1.1203931203931203, + "grad_norm": 0.3035047945160019, + "learning_rate": 3.483606557377049e-05, + "loss": 0.5084, + "step": 456 + }, + { + "epoch": 1.1228501228501229, + "grad_norm": 0.34056825729709134, + "learning_rate": 3.479052823315119e-05, + "loss": 0.5445, + "step": 457 + }, + { + "epoch": 1.1253071253071254, + "grad_norm": 0.34548063719869543, + "learning_rate": 3.474499089253188e-05, + "loss": 0.5538, + "step": 458 + }, + { + "epoch": 1.1277641277641277, + "grad_norm": 0.34863453010817147, + "learning_rate": 3.469945355191257e-05, + "loss": 0.6136, + "step": 459 + }, + { + "epoch": 1.1302211302211302, + "grad_norm": 0.36452640020436167, + "learning_rate": 3.4653916211293264e-05, + "loss": 0.6339, + "step": 460 + }, + { + "epoch": 1.1326781326781328, + "grad_norm": 0.33505641304640355, + "learning_rate": 3.4608378870673954e-05, + "loss": 0.5226, + "step": 461 + }, + { + "epoch": 1.135135135135135, + "grad_norm": 0.5832869535948028, + "learning_rate": 3.4562841530054644e-05, + "loss": 0.6528, + "step": 462 + }, + { + "epoch": 1.1375921375921376, + "grad_norm": 0.29618924105134536, + "learning_rate": 3.451730418943534e-05, + "loss": 0.6025, + "step": 463 + }, + { + "epoch": 1.1400491400491402, + "grad_norm": 0.34874600771453107, + "learning_rate": 3.447176684881603e-05, + "loss": 0.5565, + "step": 464 + }, + { + "epoch": 1.1425061425061425, + "grad_norm": 0.335951908594719, + "learning_rate": 3.442622950819672e-05, + "loss": 0.547, + "step": 465 + }, + { + "epoch": 1.144963144963145, + "grad_norm": 0.2998993608726187, + "learning_rate": 3.438069216757742e-05, + "loss": 0.628, + "step": 466 + }, + { + "epoch": 1.1474201474201475, + "grad_norm": 0.29644218347091184, + "learning_rate": 3.433515482695811e-05, + "loss": 0.52, + "step": 467 + }, + { + "epoch": 1.1498771498771498, + "grad_norm": 0.30863434769848686, + "learning_rate": 3.42896174863388e-05, + "loss": 0.5253, + "step": 468 + }, + { + "epoch": 1.1523341523341524, + "grad_norm": 0.28232514356630184, + "learning_rate": 3.4244080145719494e-05, + "loss": 0.5264, + "step": 469 + }, + { + "epoch": 1.154791154791155, + "grad_norm": 0.3486029632281899, + "learning_rate": 3.4198542805100184e-05, + "loss": 0.5337, + "step": 470 + }, + { + "epoch": 1.1572481572481572, + "grad_norm": 0.2749244379146869, + "learning_rate": 3.4153005464480874e-05, + "loss": 0.4396, + "step": 471 + }, + { + "epoch": 1.1597051597051597, + "grad_norm": 0.35073763579329614, + "learning_rate": 3.410746812386157e-05, + "loss": 0.5767, + "step": 472 + }, + { + "epoch": 1.1621621621621623, + "grad_norm": 0.3148751339175056, + "learning_rate": 3.406193078324226e-05, + "loss": 0.554, + "step": 473 + }, + { + "epoch": 1.1646191646191646, + "grad_norm": 0.31661478461777187, + "learning_rate": 3.401639344262295e-05, + "loss": 0.6312, + "step": 474 + }, + { + "epoch": 1.1670761670761671, + "grad_norm": 0.32266558978084553, + "learning_rate": 3.397085610200365e-05, + "loss": 0.5549, + "step": 475 + }, + { + "epoch": 1.1695331695331694, + "grad_norm": 0.31175094191334074, + "learning_rate": 3.392531876138434e-05, + "loss": 0.6031, + "step": 476 + }, + { + "epoch": 1.171990171990172, + "grad_norm": 0.2860842816292032, + "learning_rate": 3.387978142076503e-05, + "loss": 0.5033, + "step": 477 + }, + { + "epoch": 1.1744471744471745, + "grad_norm": 0.2863055488397975, + "learning_rate": 3.3834244080145725e-05, + "loss": 0.5826, + "step": 478 + }, + { + "epoch": 1.1769041769041768, + "grad_norm": 0.2814884571892455, + "learning_rate": 3.3788706739526415e-05, + "loss": 0.6098, + "step": 479 + }, + { + "epoch": 1.1793611793611793, + "grad_norm": 0.3343616425168066, + "learning_rate": 3.3743169398907105e-05, + "loss": 0.6576, + "step": 480 + }, + { + "epoch": 1.1818181818181819, + "grad_norm": 4.471655389420487, + "learning_rate": 3.36976320582878e-05, + "loss": 0.6501, + "step": 481 + }, + { + "epoch": 1.1842751842751842, + "grad_norm": 0.3531434211683213, + "learning_rate": 3.365209471766849e-05, + "loss": 0.5736, + "step": 482 + }, + { + "epoch": 1.1867321867321867, + "grad_norm": 0.30933282032145204, + "learning_rate": 3.360655737704918e-05, + "loss": 0.5773, + "step": 483 + }, + { + "epoch": 1.1891891891891893, + "grad_norm": 0.34749618430933105, + "learning_rate": 3.356102003642987e-05, + "loss": 0.5344, + "step": 484 + }, + { + "epoch": 1.1916461916461916, + "grad_norm": 0.2890952500864336, + "learning_rate": 3.351548269581057e-05, + "loss": 0.5979, + "step": 485 + }, + { + "epoch": 1.194103194103194, + "grad_norm": 0.34484921930011087, + "learning_rate": 3.346994535519126e-05, + "loss": 0.5318, + "step": 486 + }, + { + "epoch": 1.1965601965601966, + "grad_norm": 0.30984886065289263, + "learning_rate": 3.342440801457195e-05, + "loss": 0.5531, + "step": 487 + }, + { + "epoch": 1.199017199017199, + "grad_norm": 0.32020672210102435, + "learning_rate": 3.3378870673952645e-05, + "loss": 0.559, + "step": 488 + }, + { + "epoch": 1.2014742014742015, + "grad_norm": 0.3715980189408075, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.5414, + "step": 489 + }, + { + "epoch": 1.203931203931204, + "grad_norm": 0.2867099183140612, + "learning_rate": 3.3287795992714025e-05, + "loss": 0.5016, + "step": 490 + }, + { + "epoch": 1.2063882063882063, + "grad_norm": 0.32647658657343387, + "learning_rate": 3.324225865209472e-05, + "loss": 0.5668, + "step": 491 + }, + { + "epoch": 1.2088452088452089, + "grad_norm": 0.31285287963181513, + "learning_rate": 3.319672131147541e-05, + "loss": 0.5808, + "step": 492 + }, + { + "epoch": 1.2113022113022114, + "grad_norm": 0.31154263564497325, + "learning_rate": 3.31511839708561e-05, + "loss": 0.577, + "step": 493 + }, + { + "epoch": 1.2137592137592137, + "grad_norm": 0.3148888983694767, + "learning_rate": 3.31056466302368e-05, + "loss": 0.5713, + "step": 494 + }, + { + "epoch": 1.2162162162162162, + "grad_norm": 0.33196948700396134, + "learning_rate": 3.306010928961749e-05, + "loss": 0.6411, + "step": 495 + }, + { + "epoch": 1.2186732186732188, + "grad_norm": 0.3089241773992785, + "learning_rate": 3.301457194899818e-05, + "loss": 0.6084, + "step": 496 + }, + { + "epoch": 1.221130221130221, + "grad_norm": 0.35264205238860336, + "learning_rate": 3.2969034608378875e-05, + "loss": 0.6082, + "step": 497 + }, + { + "epoch": 1.2235872235872236, + "grad_norm": 0.3592504157610499, + "learning_rate": 3.2923497267759565e-05, + "loss": 0.5017, + "step": 498 + }, + { + "epoch": 1.2260442260442261, + "grad_norm": 0.3294945441126368, + "learning_rate": 3.2877959927140255e-05, + "loss": 0.5671, + "step": 499 + }, + { + "epoch": 1.2285012285012284, + "grad_norm": 0.31804938107229946, + "learning_rate": 3.283242258652095e-05, + "loss": 0.5706, + "step": 500 + }, + { + "epoch": 1.230958230958231, + "grad_norm": 0.2933642876504185, + "learning_rate": 3.2786885245901635e-05, + "loss": 0.5426, + "step": 501 + }, + { + "epoch": 1.2334152334152333, + "grad_norm": 0.3626340514862369, + "learning_rate": 3.274134790528233e-05, + "loss": 0.6105, + "step": 502 + }, + { + "epoch": 1.2358722358722358, + "grad_norm": 0.26476010226570695, + "learning_rate": 3.269581056466303e-05, + "loss": 0.4702, + "step": 503 + }, + { + "epoch": 1.2383292383292384, + "grad_norm": 0.3661036271637661, + "learning_rate": 3.265027322404371e-05, + "loss": 0.542, + "step": 504 + }, + { + "epoch": 1.2407862407862407, + "grad_norm": 0.3421274093595941, + "learning_rate": 3.260473588342441e-05, + "loss": 0.5814, + "step": 505 + }, + { + "epoch": 1.2432432432432432, + "grad_norm": 0.27100222834936427, + "learning_rate": 3.2559198542805106e-05, + "loss": 0.5478, + "step": 506 + }, + { + "epoch": 1.2457002457002457, + "grad_norm": 0.314120753601731, + "learning_rate": 3.251366120218579e-05, + "loss": 0.5531, + "step": 507 + }, + { + "epoch": 1.248157248157248, + "grad_norm": 0.9759156709730757, + "learning_rate": 3.2468123861566486e-05, + "loss": 0.6531, + "step": 508 + }, + { + "epoch": 1.2506142506142506, + "grad_norm": 0.30944457432745653, + "learning_rate": 3.242258652094718e-05, + "loss": 0.5513, + "step": 509 + }, + { + "epoch": 1.253071253071253, + "grad_norm": 0.3010475271711826, + "learning_rate": 3.237704918032787e-05, + "loss": 0.5095, + "step": 510 + }, + { + "epoch": 1.2555282555282554, + "grad_norm": 2.091229835428742, + "learning_rate": 3.233151183970856e-05, + "loss": 0.6917, + "step": 511 + }, + { + "epoch": 1.257985257985258, + "grad_norm": 0.4263480510636171, + "learning_rate": 3.228597449908925e-05, + "loss": 0.5107, + "step": 512 + }, + { + "epoch": 1.2604422604422605, + "grad_norm": 0.2662240671218934, + "learning_rate": 3.224043715846995e-05, + "loss": 0.5171, + "step": 513 + }, + { + "epoch": 1.2628992628992628, + "grad_norm": 0.38958730612737474, + "learning_rate": 3.219489981785064e-05, + "loss": 0.5062, + "step": 514 + }, + { + "epoch": 1.2653562653562653, + "grad_norm": 0.30418756289720655, + "learning_rate": 3.214936247723133e-05, + "loss": 0.5835, + "step": 515 + }, + { + "epoch": 1.2678132678132679, + "grad_norm": 0.344436665503126, + "learning_rate": 3.2103825136612026e-05, + "loss": 0.5983, + "step": 516 + }, + { + "epoch": 1.2702702702702702, + "grad_norm": 1.918725499774248, + "learning_rate": 3.2058287795992716e-05, + "loss": 0.6293, + "step": 517 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 0.3623753413503759, + "learning_rate": 3.2012750455373406e-05, + "loss": 0.631, + "step": 518 + }, + { + "epoch": 1.2751842751842752, + "grad_norm": 0.4577653918156244, + "learning_rate": 3.19672131147541e-05, + "loss": 0.5013, + "step": 519 + }, + { + "epoch": 1.2776412776412776, + "grad_norm": 0.31126359791794433, + "learning_rate": 3.192167577413479e-05, + "loss": 0.502, + "step": 520 + }, + { + "epoch": 1.28009828009828, + "grad_norm": 0.30127450296424224, + "learning_rate": 3.187613843351548e-05, + "loss": 0.4445, + "step": 521 + }, + { + "epoch": 1.2825552825552826, + "grad_norm": 0.32777802361056146, + "learning_rate": 3.183060109289618e-05, + "loss": 0.586, + "step": 522 + }, + { + "epoch": 1.285012285012285, + "grad_norm": 0.3151574260038467, + "learning_rate": 3.178506375227687e-05, + "loss": 0.5101, + "step": 523 + }, + { + "epoch": 1.2874692874692875, + "grad_norm": 0.2958405193987708, + "learning_rate": 3.173952641165756e-05, + "loss": 0.5115, + "step": 524 + }, + { + "epoch": 1.28992628992629, + "grad_norm": 0.30692569753814974, + "learning_rate": 3.1693989071038256e-05, + "loss": 0.5255, + "step": 525 + }, + { + "epoch": 1.2923832923832923, + "grad_norm": 0.31369349705521754, + "learning_rate": 3.1648451730418946e-05, + "loss": 0.5708, + "step": 526 + }, + { + "epoch": 1.2948402948402948, + "grad_norm": 0.2818423915221156, + "learning_rate": 3.1602914389799636e-05, + "loss": 0.4837, + "step": 527 + }, + { + "epoch": 1.2972972972972974, + "grad_norm": 0.3134826582265648, + "learning_rate": 3.155737704918033e-05, + "loss": 0.5751, + "step": 528 + }, + { + "epoch": 1.2997542997542997, + "grad_norm": 0.2816827747685129, + "learning_rate": 3.1511839708561016e-05, + "loss": 0.4596, + "step": 529 + }, + { + "epoch": 1.3022113022113022, + "grad_norm": 0.34084186090096374, + "learning_rate": 3.146630236794171e-05, + "loss": 0.5656, + "step": 530 + }, + { + "epoch": 1.3046683046683047, + "grad_norm": 0.32476535285413916, + "learning_rate": 3.142076502732241e-05, + "loss": 0.5661, + "step": 531 + }, + { + "epoch": 1.307125307125307, + "grad_norm": 0.3188888254272654, + "learning_rate": 3.137522768670309e-05, + "loss": 0.5269, + "step": 532 + }, + { + "epoch": 1.3095823095823096, + "grad_norm": 0.3366341919026146, + "learning_rate": 3.132969034608379e-05, + "loss": 0.4923, + "step": 533 + }, + { + "epoch": 1.3120393120393121, + "grad_norm": 0.3271992624122109, + "learning_rate": 3.1284153005464487e-05, + "loss": 0.5977, + "step": 534 + }, + { + "epoch": 1.3144963144963144, + "grad_norm": 0.417085896710461, + "learning_rate": 3.123861566484517e-05, + "loss": 0.614, + "step": 535 + }, + { + "epoch": 1.316953316953317, + "grad_norm": 0.31946680031176, + "learning_rate": 3.1193078324225867e-05, + "loss": 0.5364, + "step": 536 + }, + { + "epoch": 1.3194103194103195, + "grad_norm": 0.34172653254662405, + "learning_rate": 3.114754098360656e-05, + "loss": 0.5552, + "step": 537 + }, + { + "epoch": 1.3218673218673218, + "grad_norm": 0.334367874832506, + "learning_rate": 3.1102003642987247e-05, + "loss": 0.5969, + "step": 538 + }, + { + "epoch": 1.3243243243243243, + "grad_norm": 0.3986000529335846, + "learning_rate": 3.105646630236794e-05, + "loss": 0.499, + "step": 539 + }, + { + "epoch": 1.3267813267813269, + "grad_norm": 0.30475256236149134, + "learning_rate": 3.101092896174863e-05, + "loss": 0.5106, + "step": 540 + }, + { + "epoch": 1.3292383292383292, + "grad_norm": 0.3316364648381355, + "learning_rate": 3.096539162112932e-05, + "loss": 0.5334, + "step": 541 + }, + { + "epoch": 1.3316953316953317, + "grad_norm": 0.409941745047023, + "learning_rate": 3.091985428051002e-05, + "loss": 0.6345, + "step": 542 + }, + { + "epoch": 1.3341523341523343, + "grad_norm": 0.3401524473507645, + "learning_rate": 3.087431693989071e-05, + "loss": 0.6766, + "step": 543 + }, + { + "epoch": 1.3366093366093366, + "grad_norm": 0.3950305885673271, + "learning_rate": 3.082877959927141e-05, + "loss": 0.5296, + "step": 544 + }, + { + "epoch": 1.339066339066339, + "grad_norm": 0.306723619335892, + "learning_rate": 3.07832422586521e-05, + "loss": 0.6201, + "step": 545 + }, + { + "epoch": 1.3415233415233416, + "grad_norm": 0.3877898069868618, + "learning_rate": 3.073770491803279e-05, + "loss": 0.5411, + "step": 546 + }, + { + "epoch": 1.343980343980344, + "grad_norm": 0.31598719997076186, + "learning_rate": 3.0692167577413484e-05, + "loss": 0.5121, + "step": 547 + }, + { + "epoch": 1.3464373464373465, + "grad_norm": 0.3712193743058151, + "learning_rate": 3.0646630236794174e-05, + "loss": 0.5978, + "step": 548 + }, + { + "epoch": 1.348894348894349, + "grad_norm": 0.33020226938329394, + "learning_rate": 3.0601092896174864e-05, + "loss": 0.4806, + "step": 549 + }, + { + "epoch": 1.3513513513513513, + "grad_norm": 0.3248140873697447, + "learning_rate": 3.055555555555556e-05, + "loss": 0.5433, + "step": 550 + }, + { + "epoch": 1.3538083538083538, + "grad_norm": 0.3230001924138346, + "learning_rate": 3.0510018214936247e-05, + "loss": 0.5853, + "step": 551 + }, + { + "epoch": 1.3562653562653564, + "grad_norm": 0.35792498410700313, + "learning_rate": 3.046448087431694e-05, + "loss": 0.4961, + "step": 552 + }, + { + "epoch": 1.3587223587223587, + "grad_norm": 0.37595474090197006, + "learning_rate": 3.0418943533697637e-05, + "loss": 0.5736, + "step": 553 + }, + { + "epoch": 1.3611793611793612, + "grad_norm": 4.505487497550051, + "learning_rate": 3.0373406193078324e-05, + "loss": 0.7102, + "step": 554 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 0.4374726309839091, + "learning_rate": 3.0327868852459017e-05, + "loss": 0.5518, + "step": 555 + }, + { + "epoch": 1.366093366093366, + "grad_norm": 0.3410372388042177, + "learning_rate": 3.028233151183971e-05, + "loss": 0.5756, + "step": 556 + }, + { + "epoch": 1.3685503685503686, + "grad_norm": 0.4059522167570831, + "learning_rate": 3.02367941712204e-05, + "loss": 0.5434, + "step": 557 + }, + { + "epoch": 1.3710073710073711, + "grad_norm": 1.27646389360509, + "learning_rate": 3.0191256830601094e-05, + "loss": 0.4415, + "step": 558 + }, + { + "epoch": 1.3734643734643734, + "grad_norm": 0.39439443949738967, + "learning_rate": 3.0145719489981787e-05, + "loss": 0.5545, + "step": 559 + }, + { + "epoch": 1.375921375921376, + "grad_norm": 0.3677529411655735, + "learning_rate": 3.0100182149362477e-05, + "loss": 0.5867, + "step": 560 + }, + { + "epoch": 1.3783783783783785, + "grad_norm": 0.3938192601050454, + "learning_rate": 3.005464480874317e-05, + "loss": 0.6695, + "step": 561 + }, + { + "epoch": 1.3808353808353808, + "grad_norm": 0.4188986867298627, + "learning_rate": 3.0009107468123864e-05, + "loss": 0.6397, + "step": 562 + }, + { + "epoch": 1.3832923832923834, + "grad_norm": 0.36496617388518987, + "learning_rate": 2.9963570127504554e-05, + "loss": 0.642, + "step": 563 + }, + { + "epoch": 1.3857493857493859, + "grad_norm": 0.36732841184379483, + "learning_rate": 2.9918032786885248e-05, + "loss": 0.6248, + "step": 564 + }, + { + "epoch": 1.3882063882063882, + "grad_norm": 0.3287638014013039, + "learning_rate": 2.987249544626594e-05, + "loss": 0.5528, + "step": 565 + }, + { + "epoch": 1.3906633906633907, + "grad_norm": 0.3290826647548518, + "learning_rate": 2.982695810564663e-05, + "loss": 0.5995, + "step": 566 + }, + { + "epoch": 1.393120393120393, + "grad_norm": 7.1958757083746585, + "learning_rate": 2.9781420765027324e-05, + "loss": 1.1998, + "step": 567 + }, + { + "epoch": 1.3955773955773956, + "grad_norm": 2.242377112943564, + "learning_rate": 2.9735883424408018e-05, + "loss": 0.6146, + "step": 568 + }, + { + "epoch": 1.398034398034398, + "grad_norm": 0.46246249293412817, + "learning_rate": 2.9690346083788704e-05, + "loss": 0.6113, + "step": 569 + }, + { + "epoch": 1.4004914004914004, + "grad_norm": 0.32979804742212093, + "learning_rate": 2.96448087431694e-05, + "loss": 0.4534, + "step": 570 + }, + { + "epoch": 1.402948402948403, + "grad_norm": 0.362610751520732, + "learning_rate": 2.9599271402550094e-05, + "loss": 0.6311, + "step": 571 + }, + { + "epoch": 1.4054054054054055, + "grad_norm": 0.3834824335793335, + "learning_rate": 2.955373406193078e-05, + "loss": 0.5671, + "step": 572 + }, + { + "epoch": 1.4078624078624078, + "grad_norm": 0.3162740542302538, + "learning_rate": 2.9508196721311478e-05, + "loss": 0.4862, + "step": 573 + }, + { + "epoch": 1.4103194103194103, + "grad_norm": 0.6767191055749541, + "learning_rate": 2.946265938069217e-05, + "loss": 0.574, + "step": 574 + }, + { + "epoch": 1.4127764127764126, + "grad_norm": 0.2842423993917733, + "learning_rate": 2.9417122040072858e-05, + "loss": 0.4788, + "step": 575 + }, + { + "epoch": 1.4152334152334152, + "grad_norm": 0.4398113214411677, + "learning_rate": 2.937158469945355e-05, + "loss": 0.5164, + "step": 576 + }, + { + "epoch": 1.4176904176904177, + "grad_norm": 0.33122596395932824, + "learning_rate": 2.9326047358834248e-05, + "loss": 0.5859, + "step": 577 + }, + { + "epoch": 1.42014742014742, + "grad_norm": 0.6087999515003836, + "learning_rate": 2.9280510018214935e-05, + "loss": 0.5303, + "step": 578 + }, + { + "epoch": 1.4226044226044225, + "grad_norm": 0.32726522540965197, + "learning_rate": 2.9234972677595628e-05, + "loss": 0.5724, + "step": 579 + }, + { + "epoch": 1.425061425061425, + "grad_norm": 0.4106956970848515, + "learning_rate": 2.918943533697632e-05, + "loss": 0.5341, + "step": 580 + }, + { + "epoch": 1.4275184275184274, + "grad_norm": 0.2852492527312244, + "learning_rate": 2.9143897996357018e-05, + "loss": 0.5196, + "step": 581 + }, + { + "epoch": 1.42997542997543, + "grad_norm": 0.285739926248913, + "learning_rate": 2.9098360655737705e-05, + "loss": 0.5442, + "step": 582 + }, + { + "epoch": 1.4324324324324325, + "grad_norm": 0.36932760352978566, + "learning_rate": 2.9052823315118398e-05, + "loss": 0.6142, + "step": 583 + }, + { + "epoch": 1.4348894348894348, + "grad_norm": 0.33676368381537514, + "learning_rate": 2.9007285974499095e-05, + "loss": 0.6479, + "step": 584 + }, + { + "epoch": 1.4373464373464373, + "grad_norm": 2.762615052994114, + "learning_rate": 2.896174863387978e-05, + "loss": 0.5999, + "step": 585 + }, + { + "epoch": 1.4398034398034398, + "grad_norm": 0.33823584373786114, + "learning_rate": 2.8916211293260475e-05, + "loss": 0.5738, + "step": 586 + }, + { + "epoch": 1.4422604422604421, + "grad_norm": 0.35461746489514906, + "learning_rate": 2.8870673952641168e-05, + "loss": 0.5155, + "step": 587 + }, + { + "epoch": 1.4447174447174447, + "grad_norm": 0.30665641874707567, + "learning_rate": 2.8825136612021858e-05, + "loss": 0.5726, + "step": 588 + }, + { + "epoch": 1.4471744471744472, + "grad_norm": 0.3218144044024646, + "learning_rate": 2.877959927140255e-05, + "loss": 0.5198, + "step": 589 + }, + { + "epoch": 1.4496314496314495, + "grad_norm": 0.3282045050488162, + "learning_rate": 2.8734061930783245e-05, + "loss": 0.5923, + "step": 590 + }, + { + "epoch": 1.452088452088452, + "grad_norm": 0.2882401227029393, + "learning_rate": 2.8688524590163935e-05, + "loss": 0.5092, + "step": 591 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 0.3127088054502666, + "learning_rate": 2.864298724954463e-05, + "loss": 0.526, + "step": 592 + }, + { + "epoch": 1.457002457002457, + "grad_norm": 2.4172038138032828, + "learning_rate": 2.8597449908925322e-05, + "loss": 0.7051, + "step": 593 + }, + { + "epoch": 1.4594594594594594, + "grad_norm": 0.5233594884346805, + "learning_rate": 2.8551912568306012e-05, + "loss": 0.4573, + "step": 594 + }, + { + "epoch": 1.461916461916462, + "grad_norm": 21.995326582795617, + "learning_rate": 2.8506375227686705e-05, + "loss": 0.6321, + "step": 595 + }, + { + "epoch": 1.4643734643734643, + "grad_norm": 0.5078686288976298, + "learning_rate": 2.84608378870674e-05, + "loss": 0.5941, + "step": 596 + }, + { + "epoch": 1.4668304668304668, + "grad_norm": 0.3609671816475955, + "learning_rate": 2.841530054644809e-05, + "loss": 0.5707, + "step": 597 + }, + { + "epoch": 1.4692874692874693, + "grad_norm": 0.35532192813895724, + "learning_rate": 2.8369763205828782e-05, + "loss": 0.4821, + "step": 598 + }, + { + "epoch": 1.4717444717444716, + "grad_norm": 0.3671957212508993, + "learning_rate": 2.8324225865209475e-05, + "loss": 0.4661, + "step": 599 + }, + { + "epoch": 1.4742014742014742, + "grad_norm": 0.30117085000522925, + "learning_rate": 2.8278688524590162e-05, + "loss": 0.4893, + "step": 600 + }, + { + "epoch": 1.4766584766584767, + "grad_norm": 0.33268501284939167, + "learning_rate": 2.823315118397086e-05, + "loss": 0.5176, + "step": 601 + }, + { + "epoch": 1.479115479115479, + "grad_norm": 0.33604265458925436, + "learning_rate": 2.8187613843351552e-05, + "loss": 0.5182, + "step": 602 + }, + { + "epoch": 1.4815724815724816, + "grad_norm": 0.3360889711504089, + "learning_rate": 2.814207650273224e-05, + "loss": 0.5907, + "step": 603 + }, + { + "epoch": 1.484029484029484, + "grad_norm": 0.328673675164007, + "learning_rate": 2.8096539162112932e-05, + "loss": 0.55, + "step": 604 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 0.31706495449515043, + "learning_rate": 2.805100182149363e-05, + "loss": 0.6074, + "step": 605 + }, + { + "epoch": 1.488943488943489, + "grad_norm": 0.2920463104930728, + "learning_rate": 2.8005464480874316e-05, + "loss": 0.5588, + "step": 606 + }, + { + "epoch": 1.4914004914004915, + "grad_norm": 0.2861747694993797, + "learning_rate": 2.795992714025501e-05, + "loss": 0.5292, + "step": 607 + }, + { + "epoch": 1.4938574938574938, + "grad_norm": 0.3285475879079594, + "learning_rate": 2.7914389799635702e-05, + "loss": 0.6151, + "step": 608 + }, + { + "epoch": 1.4963144963144963, + "grad_norm": 0.34034599761877166, + "learning_rate": 2.7868852459016392e-05, + "loss": 0.697, + "step": 609 + }, + { + "epoch": 1.4987714987714988, + "grad_norm": 0.2804576195171845, + "learning_rate": 2.7823315118397086e-05, + "loss": 0.5973, + "step": 610 + }, + { + "epoch": 1.5012285012285012, + "grad_norm": 0.33784205792768834, + "learning_rate": 2.777777777777778e-05, + "loss": 0.474, + "step": 611 + }, + { + "epoch": 1.5036855036855037, + "grad_norm": 0.31097306577779116, + "learning_rate": 2.773224043715847e-05, + "loss": 0.4641, + "step": 612 + }, + { + "epoch": 1.5061425061425062, + "grad_norm": 0.2844030812033379, + "learning_rate": 2.7686703096539162e-05, + "loss": 0.5817, + "step": 613 + }, + { + "epoch": 1.5085995085995085, + "grad_norm": 0.38968984446732813, + "learning_rate": 2.7641165755919856e-05, + "loss": 0.5086, + "step": 614 + }, + { + "epoch": 1.511056511056511, + "grad_norm": 0.31040975194056414, + "learning_rate": 2.7595628415300546e-05, + "loss": 0.4693, + "step": 615 + }, + { + "epoch": 1.5135135135135136, + "grad_norm": 0.33741077330159325, + "learning_rate": 2.755009107468124e-05, + "loss": 0.589, + "step": 616 + }, + { + "epoch": 1.515970515970516, + "grad_norm": 0.31661478005044347, + "learning_rate": 2.7504553734061933e-05, + "loss": 0.6049, + "step": 617 + }, + { + "epoch": 1.5184275184275184, + "grad_norm": 0.3335488405098975, + "learning_rate": 2.7459016393442626e-05, + "loss": 0.6017, + "step": 618 + }, + { + "epoch": 1.520884520884521, + "grad_norm": 0.31027476247695246, + "learning_rate": 2.7413479052823316e-05, + "loss": 0.5838, + "step": 619 + }, + { + "epoch": 1.5233415233415233, + "grad_norm": 0.2777646630361566, + "learning_rate": 2.736794171220401e-05, + "loss": 0.5584, + "step": 620 + }, + { + "epoch": 1.5257985257985258, + "grad_norm": 0.2913563020518906, + "learning_rate": 2.7322404371584703e-05, + "loss": 0.4762, + "step": 621 + }, + { + "epoch": 1.5282555282555284, + "grad_norm": 0.350171904455104, + "learning_rate": 2.7276867030965393e-05, + "loss": 0.6177, + "step": 622 + }, + { + "epoch": 1.5307125307125307, + "grad_norm": 0.30111848450509915, + "learning_rate": 2.7231329690346086e-05, + "loss": 0.5101, + "step": 623 + }, + { + "epoch": 1.5331695331695332, + "grad_norm": 0.2867090459147248, + "learning_rate": 2.718579234972678e-05, + "loss": 0.5821, + "step": 624 + }, + { + "epoch": 1.5356265356265357, + "grad_norm": 0.3638114975712457, + "learning_rate": 2.714025500910747e-05, + "loss": 0.6383, + "step": 625 + }, + { + "epoch": 1.538083538083538, + "grad_norm": 0.2867067374702508, + "learning_rate": 2.7094717668488163e-05, + "loss": 0.5452, + "step": 626 + }, + { + "epoch": 1.5405405405405406, + "grad_norm": 0.28737600107101, + "learning_rate": 2.7049180327868856e-05, + "loss": 0.5418, + "step": 627 + }, + { + "epoch": 1.542997542997543, + "grad_norm": 0.26498806931754665, + "learning_rate": 2.7003642987249543e-05, + "loss": 0.5039, + "step": 628 + }, + { + "epoch": 1.5454545454545454, + "grad_norm": 0.3011713624967737, + "learning_rate": 2.695810564663024e-05, + "loss": 0.53, + "step": 629 + }, + { + "epoch": 1.547911547911548, + "grad_norm": 0.2691530635935858, + "learning_rate": 2.6912568306010933e-05, + "loss": 0.4859, + "step": 630 + }, + { + "epoch": 1.5503685503685505, + "grad_norm": 0.2558261406391803, + "learning_rate": 2.686703096539162e-05, + "loss": 0.477, + "step": 631 + }, + { + "epoch": 1.5528255528255528, + "grad_norm": 0.25620551411091325, + "learning_rate": 2.6821493624772313e-05, + "loss": 0.4604, + "step": 632 + }, + { + "epoch": 1.5552825552825553, + "grad_norm": 0.2763114315788065, + "learning_rate": 2.677595628415301e-05, + "loss": 0.5792, + "step": 633 + }, + { + "epoch": 1.5577395577395579, + "grad_norm": 0.28083914382759145, + "learning_rate": 2.6730418943533697e-05, + "loss": 0.5356, + "step": 634 + }, + { + "epoch": 1.5601965601965602, + "grad_norm": 0.2920422213997252, + "learning_rate": 2.668488160291439e-05, + "loss": 0.5251, + "step": 635 + }, + { + "epoch": 1.5626535626535627, + "grad_norm": 0.27262649363198743, + "learning_rate": 2.6639344262295087e-05, + "loss": 0.5402, + "step": 636 + }, + { + "epoch": 1.5651105651105652, + "grad_norm": 0.2910361795236517, + "learning_rate": 2.6593806921675773e-05, + "loss": 0.5268, + "step": 637 + }, + { + "epoch": 1.5675675675675675, + "grad_norm": 0.26654925026502435, + "learning_rate": 2.6548269581056467e-05, + "loss": 0.5328, + "step": 638 + }, + { + "epoch": 1.57002457002457, + "grad_norm": 0.2973118734804211, + "learning_rate": 2.650273224043716e-05, + "loss": 0.5803, + "step": 639 + }, + { + "epoch": 1.5724815724815726, + "grad_norm": 0.2971915311670167, + "learning_rate": 2.645719489981785e-05, + "loss": 0.5564, + "step": 640 + }, + { + "epoch": 1.574938574938575, + "grad_norm": 0.2969196104157182, + "learning_rate": 2.6411657559198543e-05, + "loss": 0.5677, + "step": 641 + }, + { + "epoch": 1.5773955773955772, + "grad_norm": 0.273834529796921, + "learning_rate": 2.6366120218579237e-05, + "loss": 0.4952, + "step": 642 + }, + { + "epoch": 1.57985257985258, + "grad_norm": 0.2743084458426201, + "learning_rate": 2.6320582877959927e-05, + "loss": 0.5289, + "step": 643 + }, + { + "epoch": 1.5823095823095823, + "grad_norm": 0.27010983622024526, + "learning_rate": 2.627504553734062e-05, + "loss": 0.5218, + "step": 644 + }, + { + "epoch": 1.5847665847665846, + "grad_norm": 0.33171449854749435, + "learning_rate": 2.6229508196721314e-05, + "loss": 0.5671, + "step": 645 + }, + { + "epoch": 1.5872235872235874, + "grad_norm": 0.30504425469503404, + "learning_rate": 2.6183970856102004e-05, + "loss": 0.5877, + "step": 646 + }, + { + "epoch": 1.5896805896805897, + "grad_norm": 0.278481170782536, + "learning_rate": 2.6138433515482697e-05, + "loss": 0.6005, + "step": 647 + }, + { + "epoch": 1.592137592137592, + "grad_norm": 0.29696142460818625, + "learning_rate": 2.609289617486339e-05, + "loss": 0.5305, + "step": 648 + }, + { + "epoch": 1.5945945945945947, + "grad_norm": 0.33350125578968326, + "learning_rate": 2.604735883424408e-05, + "loss": 0.5594, + "step": 649 + }, + { + "epoch": 1.597051597051597, + "grad_norm": 0.2895010135396355, + "learning_rate": 2.6001821493624774e-05, + "loss": 0.5711, + "step": 650 + }, + { + "epoch": 1.5995085995085994, + "grad_norm": 0.3218565904038471, + "learning_rate": 2.5956284153005467e-05, + "loss": 0.5958, + "step": 651 + }, + { + "epoch": 1.6019656019656021, + "grad_norm": 0.333555069936009, + "learning_rate": 2.5910746812386154e-05, + "loss": 0.5207, + "step": 652 + }, + { + "epoch": 1.6044226044226044, + "grad_norm": 0.24599267706082115, + "learning_rate": 2.586520947176685e-05, + "loss": 0.5302, + "step": 653 + }, + { + "epoch": 1.6068796068796067, + "grad_norm": 0.30331457642746157, + "learning_rate": 2.5819672131147544e-05, + "loss": 0.5827, + "step": 654 + }, + { + "epoch": 1.6093366093366095, + "grad_norm": 0.30665436759125925, + "learning_rate": 2.5774134790528237e-05, + "loss": 0.5576, + "step": 655 + }, + { + "epoch": 1.6117936117936118, + "grad_norm": 0.34092613831052127, + "learning_rate": 2.5728597449908924e-05, + "loss": 0.5054, + "step": 656 + }, + { + "epoch": 1.6142506142506141, + "grad_norm": 0.28798725261513564, + "learning_rate": 2.568306010928962e-05, + "loss": 0.5379, + "step": 657 + }, + { + "epoch": 1.6167076167076169, + "grad_norm": 0.3089417920348662, + "learning_rate": 2.5637522768670314e-05, + "loss": 0.5599, + "step": 658 + }, + { + "epoch": 1.6191646191646192, + "grad_norm": 0.30820249889026247, + "learning_rate": 2.5591985428051e-05, + "loss": 0.4972, + "step": 659 + }, + { + "epoch": 1.6216216216216215, + "grad_norm": 0.27710041943109465, + "learning_rate": 2.5546448087431697e-05, + "loss": 0.5631, + "step": 660 + }, + { + "epoch": 1.6240786240786242, + "grad_norm": 0.30117632251651183, + "learning_rate": 2.550091074681239e-05, + "loss": 0.6309, + "step": 661 + }, + { + "epoch": 1.6265356265356266, + "grad_norm": 0.3148919358125854, + "learning_rate": 2.5455373406193077e-05, + "loss": 0.6265, + "step": 662 + }, + { + "epoch": 1.6289926289926289, + "grad_norm": 0.2912887580005306, + "learning_rate": 2.540983606557377e-05, + "loss": 0.5901, + "step": 663 + }, + { + "epoch": 1.6314496314496314, + "grad_norm": 0.24051170702858976, + "learning_rate": 2.5364298724954468e-05, + "loss": 0.4714, + "step": 664 + }, + { + "epoch": 1.633906633906634, + "grad_norm": 0.3126823231242451, + "learning_rate": 2.5318761384335154e-05, + "loss": 0.5539, + "step": 665 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 0.32930595380736144, + "learning_rate": 2.5273224043715848e-05, + "loss": 0.5895, + "step": 666 + }, + { + "epoch": 1.6388206388206388, + "grad_norm": 0.2932745657575283, + "learning_rate": 2.522768670309654e-05, + "loss": 0.5388, + "step": 667 + }, + { + "epoch": 1.6412776412776413, + "grad_norm": 0.2787597154650259, + "learning_rate": 2.518214936247723e-05, + "loss": 0.5179, + "step": 668 + }, + { + "epoch": 1.6437346437346436, + "grad_norm": 0.2937841728580581, + "learning_rate": 2.5136612021857924e-05, + "loss": 0.5083, + "step": 669 + }, + { + "epoch": 1.6461916461916462, + "grad_norm": 0.32130959163428363, + "learning_rate": 2.5091074681238618e-05, + "loss": 0.5693, + "step": 670 + }, + { + "epoch": 1.6486486486486487, + "grad_norm": 0.2991851088280298, + "learning_rate": 2.5045537340619308e-05, + "loss": 0.4979, + "step": 671 + }, + { + "epoch": 1.651105651105651, + "grad_norm": 0.2776565257867249, + "learning_rate": 2.5e-05, + "loss": 0.5318, + "step": 672 + }, + { + "epoch": 1.6535626535626535, + "grad_norm": 0.36657164420346156, + "learning_rate": 2.495446265938069e-05, + "loss": 0.5043, + "step": 673 + }, + { + "epoch": 1.656019656019656, + "grad_norm": 0.3215204376240886, + "learning_rate": 2.4908925318761388e-05, + "loss": 0.5854, + "step": 674 + }, + { + "epoch": 1.6584766584766584, + "grad_norm": 0.2950829336587517, + "learning_rate": 2.4863387978142078e-05, + "loss": 0.6343, + "step": 675 + }, + { + "epoch": 1.660933660933661, + "grad_norm": 0.3113697420564982, + "learning_rate": 2.4817850637522768e-05, + "loss": 0.5147, + "step": 676 + }, + { + "epoch": 1.6633906633906634, + "grad_norm": 0.31302535665995557, + "learning_rate": 2.477231329690346e-05, + "loss": 0.49, + "step": 677 + }, + { + "epoch": 1.6658476658476657, + "grad_norm": 0.34035592954838445, + "learning_rate": 2.4726775956284155e-05, + "loss": 0.5816, + "step": 678 + }, + { + "epoch": 1.6683046683046683, + "grad_norm": 0.3015369953798126, + "learning_rate": 2.4681238615664845e-05, + "loss": 0.5775, + "step": 679 + }, + { + "epoch": 1.6707616707616708, + "grad_norm": 0.31787296914393265, + "learning_rate": 2.4635701275045538e-05, + "loss": 0.5725, + "step": 680 + }, + { + "epoch": 1.6732186732186731, + "grad_norm": 0.3379584502023648, + "learning_rate": 2.459016393442623e-05, + "loss": 0.5642, + "step": 681 + }, + { + "epoch": 1.6756756756756757, + "grad_norm": 0.27840602624421484, + "learning_rate": 2.4544626593806925e-05, + "loss": 0.615, + "step": 682 + }, + { + "epoch": 1.6781326781326782, + "grad_norm": 0.28097823151212464, + "learning_rate": 2.4499089253187615e-05, + "loss": 0.547, + "step": 683 + }, + { + "epoch": 1.6805896805896805, + "grad_norm": 0.2616920934490369, + "learning_rate": 2.4453551912568305e-05, + "loss": 0.4777, + "step": 684 + }, + { + "epoch": 1.683046683046683, + "grad_norm": 0.2787698127004213, + "learning_rate": 2.4408014571949e-05, + "loss": 0.5141, + "step": 685 + }, + { + "epoch": 1.6855036855036856, + "grad_norm": 0.27981914048115714, + "learning_rate": 2.436247723132969e-05, + "loss": 0.5716, + "step": 686 + }, + { + "epoch": 1.6879606879606879, + "grad_norm": 4.088562590331179, + "learning_rate": 2.431693989071038e-05, + "loss": 0.6609, + "step": 687 + }, + { + "epoch": 1.6904176904176904, + "grad_norm": 0.2956395589638685, + "learning_rate": 2.427140255009108e-05, + "loss": 0.5403, + "step": 688 + }, + { + "epoch": 1.692874692874693, + "grad_norm": 0.29438181310455147, + "learning_rate": 2.422586520947177e-05, + "loss": 0.5685, + "step": 689 + }, + { + "epoch": 1.6953316953316953, + "grad_norm": 0.2538193516711629, + "learning_rate": 2.418032786885246e-05, + "loss": 0.4785, + "step": 690 + }, + { + "epoch": 1.6977886977886978, + "grad_norm": 0.28796731794186514, + "learning_rate": 2.4134790528233152e-05, + "loss": 0.552, + "step": 691 + }, + { + "epoch": 1.7002457002457003, + "grad_norm": 0.27868424952411996, + "learning_rate": 2.4089253187613845e-05, + "loss": 0.5076, + "step": 692 + }, + { + "epoch": 1.7027027027027026, + "grad_norm": 0.25990529519697514, + "learning_rate": 2.4043715846994535e-05, + "loss": 0.4533, + "step": 693 + }, + { + "epoch": 1.7051597051597052, + "grad_norm": 0.2873379074909231, + "learning_rate": 2.399817850637523e-05, + "loss": 0.5913, + "step": 694 + }, + { + "epoch": 1.7076167076167077, + "grad_norm": 0.29088350023709175, + "learning_rate": 2.3952641165755922e-05, + "loss": 0.5959, + "step": 695 + }, + { + "epoch": 1.71007371007371, + "grad_norm": 0.2573364666726, + "learning_rate": 2.3907103825136612e-05, + "loss": 0.5062, + "step": 696 + }, + { + "epoch": 1.7125307125307125, + "grad_norm": 0.29392565603255266, + "learning_rate": 2.3861566484517305e-05, + "loss": 0.5687, + "step": 697 + }, + { + "epoch": 1.714987714987715, + "grad_norm": 0.2891193206942597, + "learning_rate": 2.3816029143898e-05, + "loss": 0.6049, + "step": 698 + }, + { + "epoch": 1.7174447174447174, + "grad_norm": 0.2840198076767787, + "learning_rate": 2.377049180327869e-05, + "loss": 0.507, + "step": 699 + }, + { + "epoch": 1.71990171990172, + "grad_norm": 0.29919110648065483, + "learning_rate": 2.3724954462659382e-05, + "loss": 0.5467, + "step": 700 + }, + { + "epoch": 1.7223587223587224, + "grad_norm": 0.3058328148398321, + "learning_rate": 2.3679417122040072e-05, + "loss": 0.5812, + "step": 701 + }, + { + "epoch": 1.7248157248157248, + "grad_norm": 0.30016823104589047, + "learning_rate": 2.363387978142077e-05, + "loss": 0.5755, + "step": 702 + }, + { + "epoch": 1.7272727272727273, + "grad_norm": 0.3499121631564174, + "learning_rate": 2.358834244080146e-05, + "loss": 0.6278, + "step": 703 + }, + { + "epoch": 1.7297297297297298, + "grad_norm": 0.7047977484338855, + "learning_rate": 2.354280510018215e-05, + "loss": 0.4715, + "step": 704 + }, + { + "epoch": 1.7321867321867321, + "grad_norm": 0.3627891702679587, + "learning_rate": 2.3497267759562842e-05, + "loss": 0.5641, + "step": 705 + }, + { + "epoch": 1.7346437346437347, + "grad_norm": 0.3269601035291933, + "learning_rate": 2.3451730418943536e-05, + "loss": 0.5644, + "step": 706 + }, + { + "epoch": 1.7371007371007372, + "grad_norm": 0.3229301454671492, + "learning_rate": 2.3406193078324226e-05, + "loss": 0.5623, + "step": 707 + }, + { + "epoch": 1.7395577395577395, + "grad_norm": 0.2621133990792928, + "learning_rate": 2.336065573770492e-05, + "loss": 0.4495, + "step": 708 + }, + { + "epoch": 1.742014742014742, + "grad_norm": 0.3105707711781621, + "learning_rate": 2.3315118397085612e-05, + "loss": 0.6677, + "step": 709 + }, + { + "epoch": 1.7444717444717446, + "grad_norm": 0.31405789889240876, + "learning_rate": 2.3269581056466302e-05, + "loss": 0.6109, + "step": 710 + }, + { + "epoch": 1.746928746928747, + "grad_norm": 0.27862780228274875, + "learning_rate": 2.3224043715846996e-05, + "loss": 0.4707, + "step": 711 + }, + { + "epoch": 1.7493857493857494, + "grad_norm": 0.2951135319765008, + "learning_rate": 2.317850637522769e-05, + "loss": 0.5152, + "step": 712 + }, + { + "epoch": 1.751842751842752, + "grad_norm": 0.23868591982715384, + "learning_rate": 2.313296903460838e-05, + "loss": 0.4805, + "step": 713 + }, + { + "epoch": 1.7542997542997543, + "grad_norm": 0.2809998521081784, + "learning_rate": 2.3087431693989073e-05, + "loss": 0.6071, + "step": 714 + }, + { + "epoch": 1.7567567567567568, + "grad_norm": 0.3062020782313415, + "learning_rate": 2.3041894353369763e-05, + "loss": 0.5542, + "step": 715 + }, + { + "epoch": 1.7592137592137593, + "grad_norm": 0.25160094772031133, + "learning_rate": 2.2996357012750456e-05, + "loss": 0.5262, + "step": 716 + }, + { + "epoch": 1.7616707616707616, + "grad_norm": 0.2648834976152306, + "learning_rate": 2.295081967213115e-05, + "loss": 0.4888, + "step": 717 + }, + { + "epoch": 1.7641277641277642, + "grad_norm": 0.2692198246802372, + "learning_rate": 2.290528233151184e-05, + "loss": 0.5413, + "step": 718 + }, + { + "epoch": 1.7665847665847667, + "grad_norm": 0.302608144563453, + "learning_rate": 2.2859744990892533e-05, + "loss": 0.5485, + "step": 719 + }, + { + "epoch": 1.769041769041769, + "grad_norm": 0.3024049619053678, + "learning_rate": 2.2814207650273226e-05, + "loss": 0.5612, + "step": 720 + }, + { + "epoch": 1.7714987714987716, + "grad_norm": 0.2686755624508314, + "learning_rate": 2.2768670309653916e-05, + "loss": 0.5532, + "step": 721 + }, + { + "epoch": 1.773955773955774, + "grad_norm": 0.31355914454819966, + "learning_rate": 2.272313296903461e-05, + "loss": 0.545, + "step": 722 + }, + { + "epoch": 1.7764127764127764, + "grad_norm": 0.2679523528547601, + "learning_rate": 2.2677595628415303e-05, + "loss": 0.5404, + "step": 723 + }, + { + "epoch": 1.7788697788697787, + "grad_norm": 0.34380417593496515, + "learning_rate": 2.2632058287795993e-05, + "loss": 0.6352, + "step": 724 + }, + { + "epoch": 1.7813267813267815, + "grad_norm": 0.29712724540471824, + "learning_rate": 2.2586520947176686e-05, + "loss": 0.5769, + "step": 725 + }, + { + "epoch": 1.7837837837837838, + "grad_norm": 0.29704261087468237, + "learning_rate": 2.254098360655738e-05, + "loss": 0.5088, + "step": 726 + }, + { + "epoch": 1.786240786240786, + "grad_norm": 0.2823609172880149, + "learning_rate": 2.249544626593807e-05, + "loss": 0.5047, + "step": 727 + }, + { + "epoch": 1.7886977886977888, + "grad_norm": 0.5140965758355988, + "learning_rate": 2.2449908925318763e-05, + "loss": 0.4842, + "step": 728 + }, + { + "epoch": 1.7911547911547911, + "grad_norm": 1.7892960579058013, + "learning_rate": 2.2404371584699453e-05, + "loss": 0.5787, + "step": 729 + }, + { + "epoch": 1.7936117936117935, + "grad_norm": 0.31920880587878125, + "learning_rate": 2.2358834244080147e-05, + "loss": 0.5224, + "step": 730 + }, + { + "epoch": 1.7960687960687962, + "grad_norm": 0.2842312803943501, + "learning_rate": 2.231329690346084e-05, + "loss": 0.5373, + "step": 731 + }, + { + "epoch": 1.7985257985257985, + "grad_norm": 0.3016887670720209, + "learning_rate": 2.226775956284153e-05, + "loss": 0.542, + "step": 732 + }, + { + "epoch": 1.8009828009828008, + "grad_norm": 0.3107473379471071, + "learning_rate": 2.2222222222222223e-05, + "loss": 0.555, + "step": 733 + }, + { + "epoch": 1.8034398034398036, + "grad_norm": 0.2955503666728652, + "learning_rate": 2.2176684881602917e-05, + "loss": 0.5613, + "step": 734 + }, + { + "epoch": 1.805896805896806, + "grad_norm": 0.30524826919317594, + "learning_rate": 2.2131147540983607e-05, + "loss": 0.5297, + "step": 735 + }, + { + "epoch": 1.8083538083538082, + "grad_norm": 0.27729189858459274, + "learning_rate": 2.20856102003643e-05, + "loss": 0.6061, + "step": 736 + }, + { + "epoch": 1.810810810810811, + "grad_norm": 0.2893287066030788, + "learning_rate": 2.2040072859744993e-05, + "loss": 0.4896, + "step": 737 + }, + { + "epoch": 1.8132678132678133, + "grad_norm": 0.2606901787727459, + "learning_rate": 2.1994535519125683e-05, + "loss": 0.5773, + "step": 738 + }, + { + "epoch": 1.8157248157248156, + "grad_norm": 0.25527945735655144, + "learning_rate": 2.1948998178506377e-05, + "loss": 0.561, + "step": 739 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.2669859467920838, + "learning_rate": 2.190346083788707e-05, + "loss": 0.5482, + "step": 740 + }, + { + "epoch": 1.8206388206388207, + "grad_norm": 0.2693248782403161, + "learning_rate": 2.185792349726776e-05, + "loss": 0.5919, + "step": 741 + }, + { + "epoch": 1.823095823095823, + "grad_norm": 0.2554414448707284, + "learning_rate": 2.1812386156648454e-05, + "loss": 0.5184, + "step": 742 + }, + { + "epoch": 1.8255528255528255, + "grad_norm": 0.2945019560024116, + "learning_rate": 2.1766848816029144e-05, + "loss": 0.5633, + "step": 743 + }, + { + "epoch": 1.828009828009828, + "grad_norm": 0.30074361548984935, + "learning_rate": 2.1721311475409837e-05, + "loss": 0.6057, + "step": 744 + }, + { + "epoch": 1.8304668304668303, + "grad_norm": 0.31888019740726103, + "learning_rate": 2.167577413479053e-05, + "loss": 0.6217, + "step": 745 + }, + { + "epoch": 1.8329238329238329, + "grad_norm": 0.6953990362734012, + "learning_rate": 2.163023679417122e-05, + "loss": 0.5032, + "step": 746 + }, + { + "epoch": 1.8353808353808354, + "grad_norm": 0.2638969682962359, + "learning_rate": 2.1584699453551914e-05, + "loss": 0.5814, + "step": 747 + }, + { + "epoch": 1.8378378378378377, + "grad_norm": 0.2673502531002082, + "learning_rate": 2.1539162112932607e-05, + "loss": 0.5353, + "step": 748 + }, + { + "epoch": 1.8402948402948403, + "grad_norm": 0.3052402733466028, + "learning_rate": 2.1493624772313297e-05, + "loss": 0.6075, + "step": 749 + }, + { + "epoch": 1.8427518427518428, + "grad_norm": 0.2590866717496352, + "learning_rate": 2.144808743169399e-05, + "loss": 0.548, + "step": 750 + }, + { + "epoch": 1.845208845208845, + "grad_norm": 0.2800576089744623, + "learning_rate": 2.1402550091074684e-05, + "loss": 0.5195, + "step": 751 + }, + { + "epoch": 1.8476658476658476, + "grad_norm": 0.3032887173826026, + "learning_rate": 2.1357012750455374e-05, + "loss": 0.482, + "step": 752 + }, + { + "epoch": 1.8501228501228502, + "grad_norm": 0.2780282487888591, + "learning_rate": 2.1311475409836064e-05, + "loss": 0.5939, + "step": 753 + }, + { + "epoch": 1.8525798525798525, + "grad_norm": 0.28853976340491677, + "learning_rate": 2.126593806921676e-05, + "loss": 0.5743, + "step": 754 + }, + { + "epoch": 1.855036855036855, + "grad_norm": 0.3327786395512487, + "learning_rate": 2.122040072859745e-05, + "loss": 0.4186, + "step": 755 + }, + { + "epoch": 1.8574938574938575, + "grad_norm": 0.2819311307178514, + "learning_rate": 2.1174863387978144e-05, + "loss": 0.5976, + "step": 756 + }, + { + "epoch": 1.8599508599508598, + "grad_norm": 0.27219577380560167, + "learning_rate": 2.1129326047358834e-05, + "loss": 0.481, + "step": 757 + }, + { + "epoch": 1.8624078624078624, + "grad_norm": 0.2888069852070096, + "learning_rate": 2.1083788706739527e-05, + "loss": 0.5633, + "step": 758 + }, + { + "epoch": 1.864864864864865, + "grad_norm": 0.283281648136038, + "learning_rate": 2.103825136612022e-05, + "loss": 0.4768, + "step": 759 + }, + { + "epoch": 1.8673218673218672, + "grad_norm": 0.2506456753016858, + "learning_rate": 2.099271402550091e-05, + "loss": 0.4806, + "step": 760 + }, + { + "epoch": 1.8697788697788698, + "grad_norm": 0.259014545169757, + "learning_rate": 2.0947176684881604e-05, + "loss": 0.5095, + "step": 761 + }, + { + "epoch": 1.8722358722358723, + "grad_norm": 0.28690420297792946, + "learning_rate": 2.0901639344262298e-05, + "loss": 0.5739, + "step": 762 + }, + { + "epoch": 1.8746928746928746, + "grad_norm": 0.3067416454066446, + "learning_rate": 2.0856102003642988e-05, + "loss": 0.5557, + "step": 763 + }, + { + "epoch": 1.8771498771498771, + "grad_norm": 0.2557249566617281, + "learning_rate": 2.081056466302368e-05, + "loss": 0.5136, + "step": 764 + }, + { + "epoch": 1.8796068796068797, + "grad_norm": 0.27522324877940546, + "learning_rate": 2.0765027322404374e-05, + "loss": 0.582, + "step": 765 + }, + { + "epoch": 1.882063882063882, + "grad_norm": 0.27502624886911736, + "learning_rate": 2.0719489981785064e-05, + "loss": 0.5115, + "step": 766 + }, + { + "epoch": 1.8845208845208845, + "grad_norm": 0.28524062704028064, + "learning_rate": 2.0673952641165754e-05, + "loss": 0.604, + "step": 767 + }, + { + "epoch": 1.886977886977887, + "grad_norm": 0.2896306842805975, + "learning_rate": 2.062841530054645e-05, + "loss": 0.5909, + "step": 768 + }, + { + "epoch": 1.8894348894348894, + "grad_norm": 0.37017739402403504, + "learning_rate": 2.058287795992714e-05, + "loss": 0.6113, + "step": 769 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 0.25659739921327007, + "learning_rate": 2.053734061930783e-05, + "loss": 0.533, + "step": 770 + }, + { + "epoch": 1.8943488943488944, + "grad_norm": 0.2631965030301113, + "learning_rate": 2.0491803278688525e-05, + "loss": 0.4888, + "step": 771 + }, + { + "epoch": 1.8968058968058967, + "grad_norm": 0.26335776548343703, + "learning_rate": 2.0446265938069218e-05, + "loss": 0.5527, + "step": 772 + }, + { + "epoch": 1.8992628992628993, + "grad_norm": 0.2754894594590262, + "learning_rate": 2.040072859744991e-05, + "loss": 0.5974, + "step": 773 + }, + { + "epoch": 1.9017199017199018, + "grad_norm": 0.25145939998899, + "learning_rate": 2.03551912568306e-05, + "loss": 0.4814, + "step": 774 + }, + { + "epoch": 1.904176904176904, + "grad_norm": 0.24080950527586228, + "learning_rate": 2.0309653916211295e-05, + "loss": 0.485, + "step": 775 + }, + { + "epoch": 1.9066339066339066, + "grad_norm": 0.26980458286482356, + "learning_rate": 2.0264116575591988e-05, + "loss": 0.5519, + "step": 776 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 0.26842981889888856, + "learning_rate": 2.0218579234972678e-05, + "loss": 0.5098, + "step": 777 + }, + { + "epoch": 1.9115479115479115, + "grad_norm": 0.2524704758947662, + "learning_rate": 2.017304189435337e-05, + "loss": 0.4943, + "step": 778 + }, + { + "epoch": 1.914004914004914, + "grad_norm": 0.2856037686590571, + "learning_rate": 2.0127504553734065e-05, + "loss": 0.5953, + "step": 779 + }, + { + "epoch": 1.9164619164619165, + "grad_norm": 0.30220124538301824, + "learning_rate": 2.0081967213114755e-05, + "loss": 0.6225, + "step": 780 + }, + { + "epoch": 1.9189189189189189, + "grad_norm": 0.2680433180647516, + "learning_rate": 2.0036429872495445e-05, + "loss": 0.5303, + "step": 781 + }, + { + "epoch": 1.9213759213759214, + "grad_norm": 0.2658779045053125, + "learning_rate": 1.999089253187614e-05, + "loss": 0.5525, + "step": 782 + }, + { + "epoch": 1.923832923832924, + "grad_norm": 0.2962893495403355, + "learning_rate": 1.994535519125683e-05, + "loss": 0.5522, + "step": 783 + }, + { + "epoch": 1.9262899262899262, + "grad_norm": 0.5328833977576326, + "learning_rate": 1.989981785063752e-05, + "loss": 0.5646, + "step": 784 + }, + { + "epoch": 1.9287469287469288, + "grad_norm": 0.3653824185491529, + "learning_rate": 1.9854280510018215e-05, + "loss": 0.6461, + "step": 785 + }, + { + "epoch": 1.9312039312039313, + "grad_norm": 0.342083225179309, + "learning_rate": 1.980874316939891e-05, + "loss": 0.5784, + "step": 786 + }, + { + "epoch": 1.9336609336609336, + "grad_norm": 0.2879236786453721, + "learning_rate": 1.97632058287796e-05, + "loss": 0.4979, + "step": 787 + }, + { + "epoch": 1.9361179361179361, + "grad_norm": 0.2837537460473016, + "learning_rate": 1.9717668488160292e-05, + "loss": 0.5029, + "step": 788 + }, + { + "epoch": 1.9385749385749387, + "grad_norm": 0.32014780260432985, + "learning_rate": 1.9672131147540985e-05, + "loss": 0.5573, + "step": 789 + }, + { + "epoch": 1.941031941031941, + "grad_norm": 0.32817327775242966, + "learning_rate": 1.9626593806921675e-05, + "loss": 0.4987, + "step": 790 + }, + { + "epoch": 1.9434889434889435, + "grad_norm": 0.28163717402922606, + "learning_rate": 1.958105646630237e-05, + "loss": 0.4783, + "step": 791 + }, + { + "epoch": 1.945945945945946, + "grad_norm": 0.2861392501523952, + "learning_rate": 1.9535519125683062e-05, + "loss": 0.4826, + "step": 792 + }, + { + "epoch": 1.9484029484029484, + "grad_norm": 0.3363642864939868, + "learning_rate": 1.9489981785063755e-05, + "loss": 0.5295, + "step": 793 + }, + { + "epoch": 1.950859950859951, + "grad_norm": 0.297653344647604, + "learning_rate": 1.9444444444444445e-05, + "loss": 0.5454, + "step": 794 + }, + { + "epoch": 1.9533169533169534, + "grad_norm": 0.3270453521765983, + "learning_rate": 1.9398907103825135e-05, + "loss": 0.6322, + "step": 795 + }, + { + "epoch": 1.9557739557739557, + "grad_norm": 0.26046016384211484, + "learning_rate": 1.9353369763205832e-05, + "loss": 0.5203, + "step": 796 + }, + { + "epoch": 1.9582309582309583, + "grad_norm": 0.2770362852807057, + "learning_rate": 1.9307832422586522e-05, + "loss": 0.5273, + "step": 797 + }, + { + "epoch": 1.9606879606879608, + "grad_norm": 0.29331896952461095, + "learning_rate": 1.9262295081967212e-05, + "loss": 0.6334, + "step": 798 + }, + { + "epoch": 1.9631449631449631, + "grad_norm": 0.8983263494767803, + "learning_rate": 1.9216757741347906e-05, + "loss": 0.5353, + "step": 799 + }, + { + "epoch": 1.9656019656019657, + "grad_norm": 0.354421880818103, + "learning_rate": 1.91712204007286e-05, + "loss": 0.5813, + "step": 800 + }, + { + "epoch": 1.9680589680589682, + "grad_norm": 0.2927169517127379, + "learning_rate": 1.912568306010929e-05, + "loss": 0.579, + "step": 801 + }, + { + "epoch": 1.9705159705159705, + "grad_norm": 0.32721857789898107, + "learning_rate": 1.9080145719489982e-05, + "loss": 0.5296, + "step": 802 + }, + { + "epoch": 1.972972972972973, + "grad_norm": 0.3808551126968657, + "learning_rate": 1.9034608378870676e-05, + "loss": 0.568, + "step": 803 + }, + { + "epoch": 1.9754299754299756, + "grad_norm": 0.29090526952275514, + "learning_rate": 1.8989071038251366e-05, + "loss": 0.5479, + "step": 804 + }, + { + "epoch": 1.9778869778869779, + "grad_norm": 0.31614970674893583, + "learning_rate": 1.894353369763206e-05, + "loss": 0.5712, + "step": 805 + }, + { + "epoch": 1.9803439803439802, + "grad_norm": 0.32886598817833446, + "learning_rate": 1.8897996357012752e-05, + "loss": 0.5644, + "step": 806 + }, + { + "epoch": 1.982800982800983, + "grad_norm": 0.29937607015091, + "learning_rate": 1.8852459016393442e-05, + "loss": 0.5381, + "step": 807 + }, + { + "epoch": 1.9852579852579852, + "grad_norm": 0.30106855434356256, + "learning_rate": 1.8806921675774136e-05, + "loss": 0.5981, + "step": 808 + }, + { + "epoch": 1.9877149877149876, + "grad_norm": 0.2800181256297378, + "learning_rate": 1.8761384335154826e-05, + "loss": 0.551, + "step": 809 + }, + { + "epoch": 1.9901719901719903, + "grad_norm": 0.28586036377645657, + "learning_rate": 1.8715846994535523e-05, + "loss": 0.5275, + "step": 810 + }, + { + "epoch": 1.9926289926289926, + "grad_norm": 0.31385447188308085, + "learning_rate": 1.8670309653916213e-05, + "loss": 0.5601, + "step": 811 + }, + { + "epoch": 1.995085995085995, + "grad_norm": 0.28851307782462143, + "learning_rate": 1.8624772313296903e-05, + "loss": 0.5781, + "step": 812 + }, + { + "epoch": 1.9975429975429977, + "grad_norm": 0.34464670180519497, + "learning_rate": 1.85792349726776e-05, + "loss": 0.6451, + "step": 813 + }, + { + "epoch": 2.0, + "grad_norm": 0.29338564297775654, + "learning_rate": 1.853369763205829e-05, + "loss": 0.5088, + "step": 814 + }, + { + "epoch": 2.0024570024570023, + "grad_norm": 0.41071893724186426, + "learning_rate": 1.848816029143898e-05, + "loss": 0.4005, + "step": 815 + }, + { + "epoch": 2.004914004914005, + "grad_norm": 0.35729203255082986, + "learning_rate": 1.8442622950819673e-05, + "loss": 0.5375, + "step": 816 + }, + { + "epoch": 2.0073710073710074, + "grad_norm": 0.35116777825965795, + "learning_rate": 1.8397085610200366e-05, + "loss": 0.3852, + "step": 817 + }, + { + "epoch": 2.0098280098280097, + "grad_norm": 0.4283969986464076, + "learning_rate": 1.8351548269581056e-05, + "loss": 0.3566, + "step": 818 + }, + { + "epoch": 2.0122850122850124, + "grad_norm": 0.35665717517061507, + "learning_rate": 1.830601092896175e-05, + "loss": 0.4491, + "step": 819 + }, + { + "epoch": 2.0147420147420148, + "grad_norm": 0.29607963350631866, + "learning_rate": 1.8260473588342443e-05, + "loss": 0.4156, + "step": 820 + }, + { + "epoch": 2.017199017199017, + "grad_norm": 0.40471159128724016, + "learning_rate": 1.8214936247723133e-05, + "loss": 0.4252, + "step": 821 + }, + { + "epoch": 2.01965601965602, + "grad_norm": 0.4023037095439346, + "learning_rate": 1.8169398907103826e-05, + "loss": 0.4303, + "step": 822 + }, + { + "epoch": 2.022113022113022, + "grad_norm": 0.31491067990723465, + "learning_rate": 1.8123861566484516e-05, + "loss": 0.4405, + "step": 823 + }, + { + "epoch": 2.0245700245700244, + "grad_norm": 0.32098076190832764, + "learning_rate": 1.807832422586521e-05, + "loss": 0.4037, + "step": 824 + }, + { + "epoch": 2.027027027027027, + "grad_norm": 0.4117176838641387, + "learning_rate": 1.8032786885245903e-05, + "loss": 0.4702, + "step": 825 + }, + { + "epoch": 2.0294840294840295, + "grad_norm": 0.36001699667740716, + "learning_rate": 1.7987249544626593e-05, + "loss": 0.4989, + "step": 826 + }, + { + "epoch": 2.031941031941032, + "grad_norm": 0.3396068846506985, + "learning_rate": 1.7941712204007287e-05, + "loss": 0.4954, + "step": 827 + }, + { + "epoch": 2.0343980343980346, + "grad_norm": 0.32243687243046554, + "learning_rate": 1.789617486338798e-05, + "loss": 0.4664, + "step": 828 + }, + { + "epoch": 2.036855036855037, + "grad_norm": 0.27969879077206583, + "learning_rate": 1.785063752276867e-05, + "loss": 0.4429, + "step": 829 + }, + { + "epoch": 2.039312039312039, + "grad_norm": 0.31574022998246704, + "learning_rate": 1.7805100182149363e-05, + "loss": 0.4584, + "step": 830 + }, + { + "epoch": 2.041769041769042, + "grad_norm": 0.2688789472016761, + "learning_rate": 1.7759562841530057e-05, + "loss": 0.3876, + "step": 831 + }, + { + "epoch": 2.0442260442260443, + "grad_norm": 0.2784054655160525, + "learning_rate": 1.7714025500910747e-05, + "loss": 0.3927, + "step": 832 + }, + { + "epoch": 2.0466830466830466, + "grad_norm": 0.2673023986899889, + "learning_rate": 1.766848816029144e-05, + "loss": 0.386, + "step": 833 + }, + { + "epoch": 2.0491400491400493, + "grad_norm": 0.2800382150316435, + "learning_rate": 1.7622950819672133e-05, + "loss": 0.4102, + "step": 834 + }, + { + "epoch": 2.0515970515970516, + "grad_norm": 0.2893871141752102, + "learning_rate": 1.7577413479052823e-05, + "loss": 0.4367, + "step": 835 + }, + { + "epoch": 2.054054054054054, + "grad_norm": 0.32085451551691285, + "learning_rate": 1.7531876138433517e-05, + "loss": 0.438, + "step": 836 + }, + { + "epoch": 2.0565110565110567, + "grad_norm": 0.26054641681204144, + "learning_rate": 1.7486338797814207e-05, + "loss": 0.4182, + "step": 837 + }, + { + "epoch": 2.058968058968059, + "grad_norm": 0.2876218424667253, + "learning_rate": 1.74408014571949e-05, + "loss": 0.4671, + "step": 838 + }, + { + "epoch": 2.0614250614250613, + "grad_norm": 0.32477671982156153, + "learning_rate": 1.7395264116575594e-05, + "loss": 0.5294, + "step": 839 + }, + { + "epoch": 2.063882063882064, + "grad_norm": 0.25297866530499685, + "learning_rate": 1.7349726775956284e-05, + "loss": 0.3794, + "step": 840 + }, + { + "epoch": 2.0663390663390664, + "grad_norm": 0.3220090607744593, + "learning_rate": 1.7304189435336977e-05, + "loss": 0.4592, + "step": 841 + }, + { + "epoch": 2.0687960687960687, + "grad_norm": 0.27031704077453683, + "learning_rate": 1.725865209471767e-05, + "loss": 0.3944, + "step": 842 + }, + { + "epoch": 2.0712530712530715, + "grad_norm": 0.27581141711381324, + "learning_rate": 1.721311475409836e-05, + "loss": 0.4197, + "step": 843 + }, + { + "epoch": 2.0737100737100738, + "grad_norm": 0.272567864438195, + "learning_rate": 1.7167577413479054e-05, + "loss": 0.4428, + "step": 844 + }, + { + "epoch": 2.076167076167076, + "grad_norm": 0.2968432656835, + "learning_rate": 1.7122040072859747e-05, + "loss": 0.4456, + "step": 845 + }, + { + "epoch": 2.078624078624079, + "grad_norm": 0.25441437304353254, + "learning_rate": 1.7076502732240437e-05, + "loss": 0.4155, + "step": 846 + }, + { + "epoch": 2.081081081081081, + "grad_norm": 0.5659771360772702, + "learning_rate": 1.703096539162113e-05, + "loss": 0.4397, + "step": 847 + }, + { + "epoch": 2.0835380835380835, + "grad_norm": 0.2472720529621334, + "learning_rate": 1.6985428051001824e-05, + "loss": 0.3765, + "step": 848 + }, + { + "epoch": 2.085995085995086, + "grad_norm": 0.2859826191779256, + "learning_rate": 1.6939890710382514e-05, + "loss": 0.4637, + "step": 849 + }, + { + "epoch": 2.0884520884520885, + "grad_norm": 0.2785861022601944, + "learning_rate": 1.6894353369763207e-05, + "loss": 0.4135, + "step": 850 + }, + { + "epoch": 2.090909090909091, + "grad_norm": 0.25864220657304277, + "learning_rate": 1.68488160291439e-05, + "loss": 0.4325, + "step": 851 + }, + { + "epoch": 2.093366093366093, + "grad_norm": 0.31600684812535784, + "learning_rate": 1.680327868852459e-05, + "loss": 0.4794, + "step": 852 + }, + { + "epoch": 2.095823095823096, + "grad_norm": 0.22407053363373733, + "learning_rate": 1.6757741347905284e-05, + "loss": 0.3725, + "step": 853 + }, + { + "epoch": 2.098280098280098, + "grad_norm": 0.2784163632616066, + "learning_rate": 1.6712204007285974e-05, + "loss": 0.4596, + "step": 854 + }, + { + "epoch": 2.100737100737101, + "grad_norm": 0.2615510772868201, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.4238, + "step": 855 + }, + { + "epoch": 2.1031941031941033, + "grad_norm": 0.23026759434611474, + "learning_rate": 1.662112932604736e-05, + "loss": 0.3978, + "step": 856 + }, + { + "epoch": 2.1056511056511056, + "grad_norm": 0.2676230826969792, + "learning_rate": 1.657559198542805e-05, + "loss": 0.3808, + "step": 857 + }, + { + "epoch": 2.108108108108108, + "grad_norm": 0.28732333343272015, + "learning_rate": 1.6530054644808744e-05, + "loss": 0.422, + "step": 858 + }, + { + "epoch": 2.1105651105651106, + "grad_norm": 0.2275060472400037, + "learning_rate": 1.6484517304189438e-05, + "loss": 0.3937, + "step": 859 + }, + { + "epoch": 2.113022113022113, + "grad_norm": 0.26578354958753214, + "learning_rate": 1.6438979963570128e-05, + "loss": 0.4314, + "step": 860 + }, + { + "epoch": 2.1154791154791153, + "grad_norm": 0.2738579827950538, + "learning_rate": 1.6393442622950818e-05, + "loss": 0.4774, + "step": 861 + }, + { + "epoch": 2.117936117936118, + "grad_norm": 0.26017321347892347, + "learning_rate": 1.6347905282331514e-05, + "loss": 0.4136, + "step": 862 + }, + { + "epoch": 2.1203931203931203, + "grad_norm": 0.25094754715043494, + "learning_rate": 1.6302367941712204e-05, + "loss": 0.3995, + "step": 863 + }, + { + "epoch": 2.1228501228501226, + "grad_norm": 0.24234839713703257, + "learning_rate": 1.6256830601092894e-05, + "loss": 0.4119, + "step": 864 + }, + { + "epoch": 2.1253071253071254, + "grad_norm": 0.25163386424602613, + "learning_rate": 1.621129326047359e-05, + "loss": 0.3784, + "step": 865 + }, + { + "epoch": 2.1277641277641277, + "grad_norm": 0.27936129219089983, + "learning_rate": 1.616575591985428e-05, + "loss": 0.4346, + "step": 866 + }, + { + "epoch": 2.13022113022113, + "grad_norm": 0.25895393324808647, + "learning_rate": 1.6120218579234975e-05, + "loss": 0.4425, + "step": 867 + }, + { + "epoch": 2.1326781326781328, + "grad_norm": 0.23027249451614298, + "learning_rate": 1.6074681238615665e-05, + "loss": 0.3955, + "step": 868 + }, + { + "epoch": 2.135135135135135, + "grad_norm": 0.24009714066156393, + "learning_rate": 1.6029143897996358e-05, + "loss": 0.4008, + "step": 869 + }, + { + "epoch": 2.1375921375921374, + "grad_norm": 0.2608536899762057, + "learning_rate": 1.598360655737705e-05, + "loss": 0.4489, + "step": 870 + }, + { + "epoch": 2.14004914004914, + "grad_norm": 0.27104838457856334, + "learning_rate": 1.593806921675774e-05, + "loss": 0.4567, + "step": 871 + }, + { + "epoch": 2.1425061425061425, + "grad_norm": 0.23710136887205738, + "learning_rate": 1.5892531876138435e-05, + "loss": 0.4177, + "step": 872 + }, + { + "epoch": 2.1449631449631448, + "grad_norm": 0.2614832850724873, + "learning_rate": 1.5846994535519128e-05, + "loss": 0.4159, + "step": 873 + }, + { + "epoch": 2.1474201474201475, + "grad_norm": 0.2493244212239117, + "learning_rate": 1.5801457194899818e-05, + "loss": 0.398, + "step": 874 + }, + { + "epoch": 2.14987714987715, + "grad_norm": 0.2336464998631902, + "learning_rate": 1.5755919854280508e-05, + "loss": 0.4269, + "step": 875 + }, + { + "epoch": 2.152334152334152, + "grad_norm": 0.2500529632050007, + "learning_rate": 1.5710382513661205e-05, + "loss": 0.4597, + "step": 876 + }, + { + "epoch": 2.154791154791155, + "grad_norm": 0.237091380401223, + "learning_rate": 1.5664845173041895e-05, + "loss": 0.3458, + "step": 877 + }, + { + "epoch": 2.157248157248157, + "grad_norm": 0.2654558367364787, + "learning_rate": 1.5619307832422585e-05, + "loss": 0.4016, + "step": 878 + }, + { + "epoch": 2.1597051597051595, + "grad_norm": 0.2378495764493538, + "learning_rate": 1.557377049180328e-05, + "loss": 0.3668, + "step": 879 + }, + { + "epoch": 2.1621621621621623, + "grad_norm": 0.24037789383707953, + "learning_rate": 1.552823315118397e-05, + "loss": 0.4122, + "step": 880 + }, + { + "epoch": 2.1646191646191646, + "grad_norm": 0.2774310296361704, + "learning_rate": 1.548269581056466e-05, + "loss": 0.4947, + "step": 881 + }, + { + "epoch": 2.167076167076167, + "grad_norm": 0.24152670773336085, + "learning_rate": 1.5437158469945355e-05, + "loss": 0.3944, + "step": 882 + }, + { + "epoch": 2.1695331695331697, + "grad_norm": 0.2598802823682941, + "learning_rate": 1.539162112932605e-05, + "loss": 0.452, + "step": 883 + }, + { + "epoch": 2.171990171990172, + "grad_norm": 4.841124303991015, + "learning_rate": 1.5346083788706742e-05, + "loss": 0.4445, + "step": 884 + }, + { + "epoch": 2.1744471744471743, + "grad_norm": 0.9349040267922862, + "learning_rate": 1.5300546448087432e-05, + "loss": 0.4991, + "step": 885 + }, + { + "epoch": 2.176904176904177, + "grad_norm": 0.27687057595557907, + "learning_rate": 1.5255009107468124e-05, + "loss": 0.4169, + "step": 886 + }, + { + "epoch": 2.1793611793611793, + "grad_norm": 0.29678484834818836, + "learning_rate": 1.5209471766848819e-05, + "loss": 0.4114, + "step": 887 + }, + { + "epoch": 2.1818181818181817, + "grad_norm": 0.2530513739903439, + "learning_rate": 1.5163934426229509e-05, + "loss": 0.4305, + "step": 888 + }, + { + "epoch": 2.1842751842751844, + "grad_norm": 0.2882228696525778, + "learning_rate": 1.51183970856102e-05, + "loss": 0.498, + "step": 889 + }, + { + "epoch": 2.1867321867321867, + "grad_norm": 0.2888696434121559, + "learning_rate": 1.5072859744990894e-05, + "loss": 0.4209, + "step": 890 + }, + { + "epoch": 2.189189189189189, + "grad_norm": 0.26095945742134186, + "learning_rate": 1.5027322404371585e-05, + "loss": 0.4539, + "step": 891 + }, + { + "epoch": 2.191646191646192, + "grad_norm": 0.25838164536911407, + "learning_rate": 1.4981785063752277e-05, + "loss": 0.4293, + "step": 892 + }, + { + "epoch": 2.194103194103194, + "grad_norm": 0.28809740633168196, + "learning_rate": 1.493624772313297e-05, + "loss": 0.4173, + "step": 893 + }, + { + "epoch": 2.1965601965601964, + "grad_norm": 0.2800744500661992, + "learning_rate": 1.4890710382513662e-05, + "loss": 0.4365, + "step": 894 + }, + { + "epoch": 2.199017199017199, + "grad_norm": 0.2502242180844428, + "learning_rate": 1.4845173041894352e-05, + "loss": 0.4286, + "step": 895 + }, + { + "epoch": 2.2014742014742015, + "grad_norm": 0.27065496051514343, + "learning_rate": 1.4799635701275047e-05, + "loss": 0.4674, + "step": 896 + }, + { + "epoch": 2.203931203931204, + "grad_norm": 0.2683214829245222, + "learning_rate": 1.4754098360655739e-05, + "loss": 0.4248, + "step": 897 + }, + { + "epoch": 2.2063882063882065, + "grad_norm": 0.2504102146869169, + "learning_rate": 1.4708561020036429e-05, + "loss": 0.4289, + "step": 898 + }, + { + "epoch": 2.208845208845209, + "grad_norm": 0.2577283264465404, + "learning_rate": 1.4663023679417124e-05, + "loss": 0.4386, + "step": 899 + }, + { + "epoch": 2.211302211302211, + "grad_norm": 0.26573181605705387, + "learning_rate": 1.4617486338797814e-05, + "loss": 0.4475, + "step": 900 + }, + { + "epoch": 2.213759213759214, + "grad_norm": 0.24365626007806804, + "learning_rate": 1.4571948998178509e-05, + "loss": 0.4331, + "step": 901 + }, + { + "epoch": 2.2162162162162162, + "grad_norm": 0.28062941136254116, + "learning_rate": 1.4526411657559199e-05, + "loss": 0.4328, + "step": 902 + }, + { + "epoch": 2.2186732186732185, + "grad_norm": 2.972065788354103, + "learning_rate": 1.448087431693989e-05, + "loss": 0.4546, + "step": 903 + }, + { + "epoch": 2.2211302211302213, + "grad_norm": 0.2534606607226359, + "learning_rate": 1.4435336976320584e-05, + "loss": 0.3829, + "step": 904 + }, + { + "epoch": 2.2235872235872236, + "grad_norm": 0.274929708261248, + "learning_rate": 1.4389799635701276e-05, + "loss": 0.4797, + "step": 905 + }, + { + "epoch": 2.226044226044226, + "grad_norm": 0.23105916503325502, + "learning_rate": 1.4344262295081968e-05, + "loss": 0.3747, + "step": 906 + }, + { + "epoch": 2.2285012285012287, + "grad_norm": 0.2868497407842456, + "learning_rate": 1.4298724954462661e-05, + "loss": 0.4016, + "step": 907 + }, + { + "epoch": 2.230958230958231, + "grad_norm": 0.27039212267154017, + "learning_rate": 1.4253187613843353e-05, + "loss": 0.4429, + "step": 908 + }, + { + "epoch": 2.2334152334152333, + "grad_norm": 0.2626645153376362, + "learning_rate": 1.4207650273224044e-05, + "loss": 0.3925, + "step": 909 + }, + { + "epoch": 2.235872235872236, + "grad_norm": 0.2857586938595, + "learning_rate": 1.4162112932604738e-05, + "loss": 0.4041, + "step": 910 + }, + { + "epoch": 2.2383292383292384, + "grad_norm": 0.24210750827605218, + "learning_rate": 1.411657559198543e-05, + "loss": 0.3824, + "step": 911 + }, + { + "epoch": 2.2407862407862407, + "grad_norm": 0.23567329374252635, + "learning_rate": 1.407103825136612e-05, + "loss": 0.4054, + "step": 912 + }, + { + "epoch": 2.2432432432432434, + "grad_norm": 0.2608042036778111, + "learning_rate": 1.4025500910746814e-05, + "loss": 0.4663, + "step": 913 + }, + { + "epoch": 2.2457002457002457, + "grad_norm": 0.2516718644133891, + "learning_rate": 1.3979963570127504e-05, + "loss": 0.3965, + "step": 914 + }, + { + "epoch": 2.248157248157248, + "grad_norm": 0.2547812213730915, + "learning_rate": 1.3934426229508196e-05, + "loss": 0.4178, + "step": 915 + }, + { + "epoch": 2.250614250614251, + "grad_norm": 0.24505968428973618, + "learning_rate": 1.388888888888889e-05, + "loss": 0.377, + "step": 916 + }, + { + "epoch": 2.253071253071253, + "grad_norm": 0.2726142469594438, + "learning_rate": 1.3843351548269581e-05, + "loss": 0.5143, + "step": 917 + }, + { + "epoch": 2.2555282555282554, + "grad_norm": 0.2349026070277474, + "learning_rate": 1.3797814207650273e-05, + "loss": 0.3882, + "step": 918 + }, + { + "epoch": 2.257985257985258, + "grad_norm": 0.22293153420044365, + "learning_rate": 1.3752276867030966e-05, + "loss": 0.3853, + "step": 919 + }, + { + "epoch": 2.2604422604422605, + "grad_norm": 0.25743572034407713, + "learning_rate": 1.3706739526411658e-05, + "loss": 0.4365, + "step": 920 + }, + { + "epoch": 2.262899262899263, + "grad_norm": 0.2579052038508534, + "learning_rate": 1.3661202185792351e-05, + "loss": 0.4398, + "step": 921 + }, + { + "epoch": 2.2653562653562656, + "grad_norm": 0.2440626223986384, + "learning_rate": 1.3615664845173043e-05, + "loss": 0.4438, + "step": 922 + }, + { + "epoch": 2.267813267813268, + "grad_norm": 0.24281445788603354, + "learning_rate": 1.3570127504553735e-05, + "loss": 0.4001, + "step": 923 + }, + { + "epoch": 2.27027027027027, + "grad_norm": 0.2552660754956044, + "learning_rate": 1.3524590163934428e-05, + "loss": 0.4111, + "step": 924 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 0.2457448775461868, + "learning_rate": 1.347905282331512e-05, + "loss": 0.4873, + "step": 925 + }, + { + "epoch": 2.2751842751842752, + "grad_norm": 0.24684307678902845, + "learning_rate": 1.343351548269581e-05, + "loss": 0.4106, + "step": 926 + }, + { + "epoch": 2.2776412776412776, + "grad_norm": 0.26175150698727817, + "learning_rate": 1.3387978142076505e-05, + "loss": 0.3655, + "step": 927 + }, + { + "epoch": 2.2800982800982803, + "grad_norm": 0.22280106083223208, + "learning_rate": 1.3342440801457195e-05, + "loss": 0.386, + "step": 928 + }, + { + "epoch": 2.2825552825552826, + "grad_norm": 0.2615527014422742, + "learning_rate": 1.3296903460837887e-05, + "loss": 0.4426, + "step": 929 + }, + { + "epoch": 2.285012285012285, + "grad_norm": 0.27081169659666804, + "learning_rate": 1.325136612021858e-05, + "loss": 0.4449, + "step": 930 + }, + { + "epoch": 2.2874692874692872, + "grad_norm": 0.2664738790934516, + "learning_rate": 1.3205828779599272e-05, + "loss": 0.3878, + "step": 931 + }, + { + "epoch": 2.28992628992629, + "grad_norm": 0.25492227378588317, + "learning_rate": 1.3160291438979963e-05, + "loss": 0.4008, + "step": 932 + }, + { + "epoch": 2.2923832923832923, + "grad_norm": 0.26063202213635495, + "learning_rate": 1.3114754098360657e-05, + "loss": 0.4289, + "step": 933 + }, + { + "epoch": 2.294840294840295, + "grad_norm": 0.2910730570058918, + "learning_rate": 1.3069216757741349e-05, + "loss": 0.5056, + "step": 934 + }, + { + "epoch": 2.2972972972972974, + "grad_norm": 0.25949619296926774, + "learning_rate": 1.302367941712204e-05, + "loss": 0.4352, + "step": 935 + }, + { + "epoch": 2.2997542997542997, + "grad_norm": 0.27651669705889614, + "learning_rate": 1.2978142076502734e-05, + "loss": 0.4198, + "step": 936 + }, + { + "epoch": 2.302211302211302, + "grad_norm": 0.24886536308925195, + "learning_rate": 1.2932604735883425e-05, + "loss": 0.4334, + "step": 937 + }, + { + "epoch": 2.3046683046683047, + "grad_norm": 0.2354118767882423, + "learning_rate": 1.2887067395264119e-05, + "loss": 0.4045, + "step": 938 + }, + { + "epoch": 2.307125307125307, + "grad_norm": 0.25840133517937125, + "learning_rate": 1.284153005464481e-05, + "loss": 0.4153, + "step": 939 + }, + { + "epoch": 2.30958230958231, + "grad_norm": 0.24820967640391792, + "learning_rate": 1.27959927140255e-05, + "loss": 0.4295, + "step": 940 + }, + { + "epoch": 2.312039312039312, + "grad_norm": 0.2682235670401214, + "learning_rate": 1.2750455373406195e-05, + "loss": 0.3975, + "step": 941 + }, + { + "epoch": 2.3144963144963144, + "grad_norm": 0.24936889462306489, + "learning_rate": 1.2704918032786885e-05, + "loss": 0.4019, + "step": 942 + }, + { + "epoch": 2.3169533169533167, + "grad_norm": 0.25313590598342434, + "learning_rate": 1.2659380692167577e-05, + "loss": 0.4121, + "step": 943 + }, + { + "epoch": 2.3194103194103195, + "grad_norm": 0.2661342959579272, + "learning_rate": 1.261384335154827e-05, + "loss": 0.4755, + "step": 944 + }, + { + "epoch": 2.321867321867322, + "grad_norm": 0.24872736039438376, + "learning_rate": 1.2568306010928962e-05, + "loss": 0.4223, + "step": 945 + }, + { + "epoch": 2.3243243243243246, + "grad_norm": 0.2687245129503849, + "learning_rate": 1.2522768670309654e-05, + "loss": 0.4267, + "step": 946 + }, + { + "epoch": 2.326781326781327, + "grad_norm": 0.22779727771546104, + "learning_rate": 1.2477231329690346e-05, + "loss": 0.3892, + "step": 947 + }, + { + "epoch": 2.329238329238329, + "grad_norm": 0.24571522522484693, + "learning_rate": 1.2431693989071039e-05, + "loss": 0.4346, + "step": 948 + }, + { + "epoch": 2.3316953316953315, + "grad_norm": 0.25561261326404666, + "learning_rate": 1.238615664845173e-05, + "loss": 0.4389, + "step": 949 + }, + { + "epoch": 2.3341523341523343, + "grad_norm": 0.22939877730223857, + "learning_rate": 1.2340619307832422e-05, + "loss": 0.4432, + "step": 950 + }, + { + "epoch": 2.3366093366093366, + "grad_norm": 0.2357383557445305, + "learning_rate": 1.2295081967213116e-05, + "loss": 0.4173, + "step": 951 + }, + { + "epoch": 2.339066339066339, + "grad_norm": 0.7364831086181507, + "learning_rate": 1.2249544626593807e-05, + "loss": 0.4907, + "step": 952 + }, + { + "epoch": 2.3415233415233416, + "grad_norm": 0.44572692986041285, + "learning_rate": 1.22040072859745e-05, + "loss": 0.4205, + "step": 953 + }, + { + "epoch": 2.343980343980344, + "grad_norm": 3.567246948098372, + "learning_rate": 1.215846994535519e-05, + "loss": 0.4788, + "step": 954 + }, + { + "epoch": 2.3464373464373462, + "grad_norm": 0.30309787203713356, + "learning_rate": 1.2112932604735884e-05, + "loss": 0.4275, + "step": 955 + }, + { + "epoch": 2.348894348894349, + "grad_norm": 0.24996522048036657, + "learning_rate": 1.2067395264116576e-05, + "loss": 0.4782, + "step": 956 + }, + { + "epoch": 2.3513513513513513, + "grad_norm": 0.24778131630911673, + "learning_rate": 1.2021857923497268e-05, + "loss": 0.4645, + "step": 957 + }, + { + "epoch": 2.3538083538083536, + "grad_norm": 0.29025219100431315, + "learning_rate": 1.1976320582877961e-05, + "loss": 0.4398, + "step": 958 + }, + { + "epoch": 2.3562653562653564, + "grad_norm": 0.27470165797569124, + "learning_rate": 1.1930783242258653e-05, + "loss": 0.4196, + "step": 959 + }, + { + "epoch": 2.3587223587223587, + "grad_norm": 0.24980686026607077, + "learning_rate": 1.1885245901639344e-05, + "loss": 0.4048, + "step": 960 + }, + { + "epoch": 2.361179361179361, + "grad_norm": 0.23877293904046448, + "learning_rate": 1.1839708561020036e-05, + "loss": 0.4014, + "step": 961 + }, + { + "epoch": 2.3636363636363638, + "grad_norm": 0.2727423118228381, + "learning_rate": 1.179417122040073e-05, + "loss": 0.4454, + "step": 962 + }, + { + "epoch": 2.366093366093366, + "grad_norm": 1.7198763345457913, + "learning_rate": 1.1748633879781421e-05, + "loss": 0.4657, + "step": 963 + }, + { + "epoch": 2.3685503685503684, + "grad_norm": 0.22671067308180004, + "learning_rate": 1.1703096539162113e-05, + "loss": 0.3986, + "step": 964 + }, + { + "epoch": 2.371007371007371, + "grad_norm": 0.26385523396246985, + "learning_rate": 1.1657559198542806e-05, + "loss": 0.4469, + "step": 965 + }, + { + "epoch": 2.3734643734643734, + "grad_norm": 0.5084136843839772, + "learning_rate": 1.1612021857923498e-05, + "loss": 0.3711, + "step": 966 + }, + { + "epoch": 2.3759213759213758, + "grad_norm": 0.27655806457414583, + "learning_rate": 1.156648451730419e-05, + "loss": 0.4551, + "step": 967 + }, + { + "epoch": 2.3783783783783785, + "grad_norm": 0.2410876717300484, + "learning_rate": 1.1520947176684881e-05, + "loss": 0.4496, + "step": 968 + }, + { + "epoch": 2.380835380835381, + "grad_norm": 0.24309059276058065, + "learning_rate": 1.1475409836065575e-05, + "loss": 0.3511, + "step": 969 + }, + { + "epoch": 2.383292383292383, + "grad_norm": 0.30117433755614303, + "learning_rate": 1.1429872495446266e-05, + "loss": 0.4416, + "step": 970 + }, + { + "epoch": 2.385749385749386, + "grad_norm": 0.25582765879217223, + "learning_rate": 1.1384335154826958e-05, + "loss": 0.4337, + "step": 971 + }, + { + "epoch": 2.388206388206388, + "grad_norm": 0.22539920788225698, + "learning_rate": 1.1338797814207651e-05, + "loss": 0.3714, + "step": 972 + }, + { + "epoch": 2.3906633906633905, + "grad_norm": 0.29931134171407514, + "learning_rate": 1.1293260473588343e-05, + "loss": 0.5106, + "step": 973 + }, + { + "epoch": 2.3931203931203933, + "grad_norm": 0.24423619232997276, + "learning_rate": 1.1247723132969035e-05, + "loss": 0.4172, + "step": 974 + }, + { + "epoch": 2.3955773955773956, + "grad_norm": 0.24339190265600077, + "learning_rate": 1.1202185792349727e-05, + "loss": 0.4833, + "step": 975 + }, + { + "epoch": 2.398034398034398, + "grad_norm": 0.24812658878123112, + "learning_rate": 1.115664845173042e-05, + "loss": 0.4091, + "step": 976 + }, + { + "epoch": 2.4004914004914006, + "grad_norm": 0.28759159231823356, + "learning_rate": 1.1111111111111112e-05, + "loss": 0.4221, + "step": 977 + }, + { + "epoch": 2.402948402948403, + "grad_norm": 0.23293079513866843, + "learning_rate": 1.1065573770491803e-05, + "loss": 0.4131, + "step": 978 + }, + { + "epoch": 2.4054054054054053, + "grad_norm": 7.749543886893285, + "learning_rate": 1.1020036429872497e-05, + "loss": 0.7646, + "step": 979 + }, + { + "epoch": 2.407862407862408, + "grad_norm": 0.23764368732650262, + "learning_rate": 1.0974499089253188e-05, + "loss": 0.4017, + "step": 980 + }, + { + "epoch": 2.4103194103194103, + "grad_norm": 0.2849297299623391, + "learning_rate": 1.092896174863388e-05, + "loss": 0.3994, + "step": 981 + }, + { + "epoch": 2.4127764127764126, + "grad_norm": 0.25594559931712746, + "learning_rate": 1.0883424408014572e-05, + "loss": 0.3598, + "step": 982 + }, + { + "epoch": 2.4152334152334154, + "grad_norm": 0.24347214875726386, + "learning_rate": 1.0837887067395265e-05, + "loss": 0.4782, + "step": 983 + }, + { + "epoch": 2.4176904176904177, + "grad_norm": 0.2667475881035064, + "learning_rate": 1.0792349726775957e-05, + "loss": 0.478, + "step": 984 + }, + { + "epoch": 2.42014742014742, + "grad_norm": 0.25874841731246545, + "learning_rate": 1.0746812386156649e-05, + "loss": 0.4026, + "step": 985 + }, + { + "epoch": 2.4226044226044228, + "grad_norm": 0.25268556183613805, + "learning_rate": 1.0701275045537342e-05, + "loss": 0.4032, + "step": 986 + }, + { + "epoch": 2.425061425061425, + "grad_norm": 0.2395640061506075, + "learning_rate": 1.0655737704918032e-05, + "loss": 0.4052, + "step": 987 + }, + { + "epoch": 2.4275184275184274, + "grad_norm": 0.2750057820112185, + "learning_rate": 1.0610200364298725e-05, + "loss": 0.5181, + "step": 988 + }, + { + "epoch": 2.42997542997543, + "grad_norm": 0.23956660835432816, + "learning_rate": 1.0564663023679417e-05, + "loss": 0.4087, + "step": 989 + }, + { + "epoch": 2.4324324324324325, + "grad_norm": 0.2682821537572547, + "learning_rate": 1.051912568306011e-05, + "loss": 0.3539, + "step": 990 + }, + { + "epoch": 2.4348894348894348, + "grad_norm": 0.253635569161534, + "learning_rate": 1.0473588342440802e-05, + "loss": 0.4304, + "step": 991 + }, + { + "epoch": 2.4373464373464375, + "grad_norm": 0.25725469397538964, + "learning_rate": 1.0428051001821494e-05, + "loss": 0.4665, + "step": 992 + }, + { + "epoch": 2.43980343980344, + "grad_norm": 0.21869469055564097, + "learning_rate": 1.0382513661202187e-05, + "loss": 0.3391, + "step": 993 + }, + { + "epoch": 2.442260442260442, + "grad_norm": 0.23932164929486857, + "learning_rate": 1.0336976320582877e-05, + "loss": 0.4214, + "step": 994 + }, + { + "epoch": 2.444717444717445, + "grad_norm": 0.249887483386558, + "learning_rate": 1.029143897996357e-05, + "loss": 0.4299, + "step": 995 + }, + { + "epoch": 2.447174447174447, + "grad_norm": 0.2448604037219602, + "learning_rate": 1.0245901639344262e-05, + "loss": 0.3619, + "step": 996 + }, + { + "epoch": 2.4496314496314495, + "grad_norm": 0.23238074536535608, + "learning_rate": 1.0200364298724956e-05, + "loss": 0.4014, + "step": 997 + }, + { + "epoch": 2.4520884520884523, + "grad_norm": 0.6771008652415798, + "learning_rate": 1.0154826958105647e-05, + "loss": 0.4356, + "step": 998 + }, + { + "epoch": 2.4545454545454546, + "grad_norm": 0.2590635077299185, + "learning_rate": 1.0109289617486339e-05, + "loss": 0.4019, + "step": 999 + }, + { + "epoch": 2.457002457002457, + "grad_norm": 0.2546310175980953, + "learning_rate": 1.0063752276867032e-05, + "loss": 0.4573, + "step": 1000 + }, + { + "epoch": 2.4594594594594597, + "grad_norm": 0.22827326397492897, + "learning_rate": 1.0018214936247722e-05, + "loss": 0.3811, + "step": 1001 + }, + { + "epoch": 2.461916461916462, + "grad_norm": 0.2433313320364577, + "learning_rate": 9.972677595628416e-06, + "loss": 0.429, + "step": 1002 + }, + { + "epoch": 2.4643734643734643, + "grad_norm": 0.26498928945986583, + "learning_rate": 9.927140255009108e-06, + "loss": 0.4492, + "step": 1003 + }, + { + "epoch": 2.4668304668304666, + "grad_norm": 0.2375299896617999, + "learning_rate": 9.8816029143898e-06, + "loss": 0.4213, + "step": 1004 + }, + { + "epoch": 2.4692874692874693, + "grad_norm": 0.2395572196206859, + "learning_rate": 9.836065573770493e-06, + "loss": 0.4685, + "step": 1005 + }, + { + "epoch": 2.4717444717444716, + "grad_norm": 0.2537090566068248, + "learning_rate": 9.790528233151184e-06, + "loss": 0.4412, + "step": 1006 + }, + { + "epoch": 2.4742014742014744, + "grad_norm": 0.25208141661171674, + "learning_rate": 9.744990892531878e-06, + "loss": 0.3854, + "step": 1007 + }, + { + "epoch": 2.4766584766584767, + "grad_norm": 0.2642583302690173, + "learning_rate": 9.699453551912568e-06, + "loss": 0.4179, + "step": 1008 + }, + { + "epoch": 2.479115479115479, + "grad_norm": 0.23767421601073707, + "learning_rate": 9.653916211293261e-06, + "loss": 0.4385, + "step": 1009 + }, + { + "epoch": 2.4815724815724813, + "grad_norm": 0.24744117798052945, + "learning_rate": 9.608378870673953e-06, + "loss": 0.4231, + "step": 1010 + }, + { + "epoch": 2.484029484029484, + "grad_norm": 0.23898600337582362, + "learning_rate": 9.562841530054644e-06, + "loss": 0.4217, + "step": 1011 + }, + { + "epoch": 2.4864864864864864, + "grad_norm": 0.25968529590110145, + "learning_rate": 9.517304189435338e-06, + "loss": 0.4322, + "step": 1012 + }, + { + "epoch": 2.488943488943489, + "grad_norm": 0.2552860811626078, + "learning_rate": 9.47176684881603e-06, + "loss": 0.4059, + "step": 1013 + }, + { + "epoch": 2.4914004914004915, + "grad_norm": 0.2535129474406969, + "learning_rate": 9.426229508196721e-06, + "loss": 0.4199, + "step": 1014 + }, + { + "epoch": 2.493857493857494, + "grad_norm": 0.23391837558421832, + "learning_rate": 9.380692167577413e-06, + "loss": 0.4875, + "step": 1015 + }, + { + "epoch": 2.496314496314496, + "grad_norm": 0.24298822208561335, + "learning_rate": 9.335154826958106e-06, + "loss": 0.4601, + "step": 1016 + }, + { + "epoch": 2.498771498771499, + "grad_norm": 0.9278988295393633, + "learning_rate": 9.2896174863388e-06, + "loss": 0.4622, + "step": 1017 + }, + { + "epoch": 2.501228501228501, + "grad_norm": 0.2392256552257423, + "learning_rate": 9.24408014571949e-06, + "loss": 0.4355, + "step": 1018 + }, + { + "epoch": 2.503685503685504, + "grad_norm": 0.2330893968890112, + "learning_rate": 9.198542805100183e-06, + "loss": 0.3879, + "step": 1019 + }, + { + "epoch": 2.506142506142506, + "grad_norm": 0.25975387161037466, + "learning_rate": 9.153005464480875e-06, + "loss": 0.5102, + "step": 1020 + }, + { + "epoch": 2.5085995085995085, + "grad_norm": 0.23725040470847344, + "learning_rate": 9.107468123861566e-06, + "loss": 0.3736, + "step": 1021 + }, + { + "epoch": 2.511056511056511, + "grad_norm": 0.2313618576568214, + "learning_rate": 9.061930783242258e-06, + "loss": 0.4254, + "step": 1022 + }, + { + "epoch": 2.5135135135135136, + "grad_norm": 0.24699951638595727, + "learning_rate": 9.016393442622952e-06, + "loss": 0.4583, + "step": 1023 + }, + { + "epoch": 2.515970515970516, + "grad_norm": 0.25544750210043377, + "learning_rate": 8.970856102003643e-06, + "loss": 0.3913, + "step": 1024 + }, + { + "epoch": 2.5184275184275187, + "grad_norm": 0.255332626493528, + "learning_rate": 8.925318761384335e-06, + "loss": 0.443, + "step": 1025 + }, + { + "epoch": 2.520884520884521, + "grad_norm": 0.24789597999286547, + "learning_rate": 8.879781420765028e-06, + "loss": 0.3952, + "step": 1026 + }, + { + "epoch": 2.5233415233415233, + "grad_norm": 0.23241695995795617, + "learning_rate": 8.83424408014572e-06, + "loss": 0.3795, + "step": 1027 + }, + { + "epoch": 2.5257985257985256, + "grad_norm": 0.22579417060539125, + "learning_rate": 8.788706739526412e-06, + "loss": 0.3723, + "step": 1028 + }, + { + "epoch": 2.5282555282555284, + "grad_norm": 0.22283238385668375, + "learning_rate": 8.743169398907103e-06, + "loss": 0.391, + "step": 1029 + }, + { + "epoch": 2.5307125307125307, + "grad_norm": 0.2246271370216966, + "learning_rate": 8.697632058287797e-06, + "loss": 0.3482, + "step": 1030 + }, + { + "epoch": 2.5331695331695334, + "grad_norm": 0.24663076613217097, + "learning_rate": 8.652094717668488e-06, + "loss": 0.4507, + "step": 1031 + }, + { + "epoch": 2.5356265356265357, + "grad_norm": 0.26050722671516013, + "learning_rate": 8.60655737704918e-06, + "loss": 0.3843, + "step": 1032 + }, + { + "epoch": 2.538083538083538, + "grad_norm": 0.2331322639700471, + "learning_rate": 8.561020036429874e-06, + "loss": 0.3914, + "step": 1033 + }, + { + "epoch": 2.5405405405405403, + "grad_norm": 0.24551537000672188, + "learning_rate": 8.515482695810565e-06, + "loss": 0.4184, + "step": 1034 + }, + { + "epoch": 2.542997542997543, + "grad_norm": 0.24176204248205907, + "learning_rate": 8.469945355191257e-06, + "loss": 0.3952, + "step": 1035 + }, + { + "epoch": 2.5454545454545454, + "grad_norm": 0.23506445053772151, + "learning_rate": 8.42440801457195e-06, + "loss": 0.4188, + "step": 1036 + }, + { + "epoch": 2.547911547911548, + "grad_norm": 0.23936543990096357, + "learning_rate": 8.378870673952642e-06, + "loss": 0.4283, + "step": 1037 + }, + { + "epoch": 2.5503685503685505, + "grad_norm": 0.2291008508057063, + "learning_rate": 8.333333333333334e-06, + "loss": 0.3352, + "step": 1038 + }, + { + "epoch": 2.552825552825553, + "grad_norm": 0.24326786660676292, + "learning_rate": 8.287795992714025e-06, + "loss": 0.447, + "step": 1039 + }, + { + "epoch": 2.555282555282555, + "grad_norm": 0.2275126614890594, + "learning_rate": 8.242258652094719e-06, + "loss": 0.4059, + "step": 1040 + }, + { + "epoch": 2.557739557739558, + "grad_norm": 0.23651265321324066, + "learning_rate": 8.196721311475409e-06, + "loss": 0.4521, + "step": 1041 + }, + { + "epoch": 2.56019656019656, + "grad_norm": 0.26505811816438285, + "learning_rate": 8.151183970856102e-06, + "loss": 0.4815, + "step": 1042 + }, + { + "epoch": 2.562653562653563, + "grad_norm": 0.2568061760157405, + "learning_rate": 8.105646630236796e-06, + "loss": 0.4375, + "step": 1043 + }, + { + "epoch": 2.5651105651105652, + "grad_norm": 0.22708969537757387, + "learning_rate": 8.060109289617487e-06, + "loss": 0.4164, + "step": 1044 + }, + { + "epoch": 2.5675675675675675, + "grad_norm": 0.22323849393344242, + "learning_rate": 8.014571948998179e-06, + "loss": 0.4076, + "step": 1045 + }, + { + "epoch": 2.57002457002457, + "grad_norm": 0.25111824108906006, + "learning_rate": 7.96903460837887e-06, + "loss": 0.4767, + "step": 1046 + }, + { + "epoch": 2.5724815724815726, + "grad_norm": 0.260989307316713, + "learning_rate": 7.923497267759564e-06, + "loss": 0.433, + "step": 1047 + }, + { + "epoch": 2.574938574938575, + "grad_norm": 0.26052793667495083, + "learning_rate": 7.877959927140254e-06, + "loss": 0.5054, + "step": 1048 + }, + { + "epoch": 2.5773955773955772, + "grad_norm": 0.24915581597371805, + "learning_rate": 7.832422586520947e-06, + "loss": 0.4814, + "step": 1049 + }, + { + "epoch": 2.57985257985258, + "grad_norm": 0.2543219048137912, + "learning_rate": 7.78688524590164e-06, + "loss": 0.4697, + "step": 1050 + }, + { + "epoch": 2.5823095823095823, + "grad_norm": 0.24030777009135726, + "learning_rate": 7.74134790528233e-06, + "loss": 0.4742, + "step": 1051 + }, + { + "epoch": 2.5847665847665846, + "grad_norm": 0.2231426133500323, + "learning_rate": 7.695810564663024e-06, + "loss": 0.3994, + "step": 1052 + }, + { + "epoch": 2.5872235872235874, + "grad_norm": 0.22308991337870562, + "learning_rate": 7.650273224043716e-06, + "loss": 0.3573, + "step": 1053 + }, + { + "epoch": 2.5896805896805897, + "grad_norm": 0.23216474618491506, + "learning_rate": 7.604735883424409e-06, + "loss": 0.4204, + "step": 1054 + }, + { + "epoch": 2.592137592137592, + "grad_norm": 0.2673680272419057, + "learning_rate": 7.5591985428051e-06, + "loss": 0.4019, + "step": 1055 + }, + { + "epoch": 2.5945945945945947, + "grad_norm": 0.2427622295285736, + "learning_rate": 7.513661202185793e-06, + "loss": 0.4387, + "step": 1056 + }, + { + "epoch": 2.597051597051597, + "grad_norm": 0.22860426639394132, + "learning_rate": 7.468123861566485e-06, + "loss": 0.3722, + "step": 1057 + }, + { + "epoch": 2.5995085995085994, + "grad_norm": 0.22665737637936162, + "learning_rate": 7.422586520947176e-06, + "loss": 0.3897, + "step": 1058 + }, + { + "epoch": 2.601965601965602, + "grad_norm": 0.23303711982308306, + "learning_rate": 7.3770491803278695e-06, + "loss": 0.4285, + "step": 1059 + }, + { + "epoch": 2.6044226044226044, + "grad_norm": 0.26128393033135633, + "learning_rate": 7.331511839708562e-06, + "loss": 0.4375, + "step": 1060 + }, + { + "epoch": 2.6068796068796067, + "grad_norm": 0.25239192353421025, + "learning_rate": 7.2859744990892545e-06, + "loss": 0.4289, + "step": 1061 + }, + { + "epoch": 2.6093366093366095, + "grad_norm": 0.2528160456529098, + "learning_rate": 7.240437158469945e-06, + "loss": 0.4413, + "step": 1062 + }, + { + "epoch": 2.611793611793612, + "grad_norm": 0.23646099966620737, + "learning_rate": 7.194899817850638e-06, + "loss": 0.47, + "step": 1063 + }, + { + "epoch": 2.614250614250614, + "grad_norm": 0.24423364418249202, + "learning_rate": 7.1493624772313305e-06, + "loss": 0.3873, + "step": 1064 + }, + { + "epoch": 2.616707616707617, + "grad_norm": 0.39790290853859617, + "learning_rate": 7.103825136612022e-06, + "loss": 0.4088, + "step": 1065 + }, + { + "epoch": 2.619164619164619, + "grad_norm": 0.24950104367468617, + "learning_rate": 7.058287795992715e-06, + "loss": 0.4047, + "step": 1066 + }, + { + "epoch": 2.6216216216216215, + "grad_norm": 0.2130145601569131, + "learning_rate": 7.012750455373407e-06, + "loss": 0.3621, + "step": 1067 + }, + { + "epoch": 2.6240786240786242, + "grad_norm": 0.24418610790883985, + "learning_rate": 6.967213114754098e-06, + "loss": 0.3692, + "step": 1068 + }, + { + "epoch": 2.6265356265356266, + "grad_norm": 0.24162683820897035, + "learning_rate": 6.921675774134791e-06, + "loss": 0.4028, + "step": 1069 + }, + { + "epoch": 2.628992628992629, + "grad_norm": 0.23928139013451663, + "learning_rate": 6.876138433515483e-06, + "loss": 0.3879, + "step": 1070 + }, + { + "epoch": 2.631449631449631, + "grad_norm": 0.23045710828373894, + "learning_rate": 6.830601092896176e-06, + "loss": 0.4, + "step": 1071 + }, + { + "epoch": 2.633906633906634, + "grad_norm": 0.2697298886110676, + "learning_rate": 6.785063752276867e-06, + "loss": 0.4548, + "step": 1072 + }, + { + "epoch": 2.6363636363636362, + "grad_norm": 0.24532148513902288, + "learning_rate": 6.73952641165756e-06, + "loss": 0.3913, + "step": 1073 + }, + { + "epoch": 2.638820638820639, + "grad_norm": 0.2337130540549015, + "learning_rate": 6.6939890710382525e-06, + "loss": 0.4059, + "step": 1074 + }, + { + "epoch": 2.6412776412776413, + "grad_norm": 0.2528405150657174, + "learning_rate": 6.648451730418943e-06, + "loss": 0.3687, + "step": 1075 + }, + { + "epoch": 2.6437346437346436, + "grad_norm": 0.23392129641022022, + "learning_rate": 6.602914389799636e-06, + "loss": 0.4117, + "step": 1076 + }, + { + "epoch": 2.646191646191646, + "grad_norm": 0.2462017004070107, + "learning_rate": 6.557377049180328e-06, + "loss": 0.4551, + "step": 1077 + }, + { + "epoch": 2.6486486486486487, + "grad_norm": 0.23207850038278316, + "learning_rate": 6.51183970856102e-06, + "loss": 0.4533, + "step": 1078 + }, + { + "epoch": 2.651105651105651, + "grad_norm": 0.22221002852774863, + "learning_rate": 6.466302367941713e-06, + "loss": 0.3899, + "step": 1079 + }, + { + "epoch": 2.6535626535626538, + "grad_norm": 0.22345355290268304, + "learning_rate": 6.420765027322405e-06, + "loss": 0.4201, + "step": 1080 + }, + { + "epoch": 2.656019656019656, + "grad_norm": 0.24769684733774203, + "learning_rate": 6.375227686703098e-06, + "loss": 0.4161, + "step": 1081 + }, + { + "epoch": 2.6584766584766584, + "grad_norm": 0.2407677406182194, + "learning_rate": 6.3296903460837886e-06, + "loss": 0.4225, + "step": 1082 + }, + { + "epoch": 2.6609336609336607, + "grad_norm": 0.2688663002431461, + "learning_rate": 6.284153005464481e-06, + "loss": 0.4576, + "step": 1083 + }, + { + "epoch": 2.6633906633906634, + "grad_norm": 0.21352788135666395, + "learning_rate": 6.238615664845173e-06, + "loss": 0.3745, + "step": 1084 + }, + { + "epoch": 2.6658476658476657, + "grad_norm": 0.2343987025317479, + "learning_rate": 6.193078324225865e-06, + "loss": 0.437, + "step": 1085 + }, + { + "epoch": 2.6683046683046685, + "grad_norm": 0.23634741722118774, + "learning_rate": 6.147540983606558e-06, + "loss": 0.4755, + "step": 1086 + }, + { + "epoch": 2.670761670761671, + "grad_norm": 0.2333977046249411, + "learning_rate": 6.10200364298725e-06, + "loss": 0.4226, + "step": 1087 + }, + { + "epoch": 2.673218673218673, + "grad_norm": 0.24140034792380946, + "learning_rate": 6.056466302367942e-06, + "loss": 0.4381, + "step": 1088 + }, + { + "epoch": 2.6756756756756754, + "grad_norm": 0.24610102385252078, + "learning_rate": 6.010928961748634e-06, + "loss": 0.4788, + "step": 1089 + }, + { + "epoch": 2.678132678132678, + "grad_norm": 0.21651298490313028, + "learning_rate": 5.965391621129326e-06, + "loss": 0.379, + "step": 1090 + }, + { + "epoch": 2.6805896805896805, + "grad_norm": 0.24002364672689916, + "learning_rate": 5.919854280510018e-06, + "loss": 0.4639, + "step": 1091 + }, + { + "epoch": 2.6830466830466833, + "grad_norm": 0.2401559730905027, + "learning_rate": 5.874316939890711e-06, + "loss": 0.4222, + "step": 1092 + }, + { + "epoch": 2.6855036855036856, + "grad_norm": 0.2296738324030033, + "learning_rate": 5.828779599271403e-06, + "loss": 0.4562, + "step": 1093 + }, + { + "epoch": 2.687960687960688, + "grad_norm": 0.24627104393871396, + "learning_rate": 5.783242258652095e-06, + "loss": 0.443, + "step": 1094 + }, + { + "epoch": 2.69041769041769, + "grad_norm": 0.23495018354076735, + "learning_rate": 5.737704918032787e-06, + "loss": 0.4446, + "step": 1095 + }, + { + "epoch": 2.692874692874693, + "grad_norm": 0.2515349565358722, + "learning_rate": 5.692167577413479e-06, + "loss": 0.4446, + "step": 1096 + }, + { + "epoch": 2.6953316953316953, + "grad_norm": 0.23661569080996545, + "learning_rate": 5.646630236794172e-06, + "loss": 0.4218, + "step": 1097 + }, + { + "epoch": 2.697788697788698, + "grad_norm": 0.25053626642778104, + "learning_rate": 5.601092896174863e-06, + "loss": 0.4072, + "step": 1098 + }, + { + "epoch": 2.7002457002457003, + "grad_norm": 0.23587139645082844, + "learning_rate": 5.555555555555556e-06, + "loss": 0.4173, + "step": 1099 + }, + { + "epoch": 2.7027027027027026, + "grad_norm": 0.2273539299051293, + "learning_rate": 5.510018214936248e-06, + "loss": 0.378, + "step": 1100 + }, + { + "epoch": 2.705159705159705, + "grad_norm": 0.24970174549990606, + "learning_rate": 5.46448087431694e-06, + "loss": 0.4712, + "step": 1101 + }, + { + "epoch": 2.7076167076167077, + "grad_norm": 0.22200832971393533, + "learning_rate": 5.418943533697633e-06, + "loss": 0.4103, + "step": 1102 + }, + { + "epoch": 2.71007371007371, + "grad_norm": 0.2269123817589866, + "learning_rate": 5.373406193078324e-06, + "loss": 0.4032, + "step": 1103 + }, + { + "epoch": 2.7125307125307128, + "grad_norm": 0.25690335857720653, + "learning_rate": 5.327868852459016e-06, + "loss": 0.4299, + "step": 1104 + }, + { + "epoch": 2.714987714987715, + "grad_norm": 0.24544690097434016, + "learning_rate": 5.2823315118397085e-06, + "loss": 0.4116, + "step": 1105 + }, + { + "epoch": 2.7174447174447174, + "grad_norm": 0.2319686270048615, + "learning_rate": 5.236794171220401e-06, + "loss": 0.3995, + "step": 1106 + }, + { + "epoch": 2.7199017199017197, + "grad_norm": 0.23710945966143346, + "learning_rate": 5.191256830601094e-06, + "loss": 0.4631, + "step": 1107 + }, + { + "epoch": 2.7223587223587224, + "grad_norm": 0.2259046385681155, + "learning_rate": 5.145719489981785e-06, + "loss": 0.4243, + "step": 1108 + }, + { + "epoch": 2.7248157248157248, + "grad_norm": 0.2257040623568255, + "learning_rate": 5.100182149362478e-06, + "loss": 0.4126, + "step": 1109 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 0.31168912185145126, + "learning_rate": 5.0546448087431695e-06, + "loss": 0.4209, + "step": 1110 + }, + { + "epoch": 2.72972972972973, + "grad_norm": 0.22549576040927052, + "learning_rate": 5.009107468123861e-06, + "loss": 0.3731, + "step": 1111 + }, + { + "epoch": 2.732186732186732, + "grad_norm": 0.24143398730766474, + "learning_rate": 4.963570127504554e-06, + "loss": 0.3649, + "step": 1112 + }, + { + "epoch": 2.7346437346437344, + "grad_norm": 0.22716863260793815, + "learning_rate": 4.918032786885246e-06, + "loss": 0.4307, + "step": 1113 + }, + { + "epoch": 2.737100737100737, + "grad_norm": 0.22591051220728886, + "learning_rate": 4.872495446265939e-06, + "loss": 0.3708, + "step": 1114 + }, + { + "epoch": 2.7395577395577395, + "grad_norm": 0.24622339640633006, + "learning_rate": 4.8269581056466305e-06, + "loss": 0.4399, + "step": 1115 + }, + { + "epoch": 2.7420147420147423, + "grad_norm": 0.2822872905637631, + "learning_rate": 4.781420765027322e-06, + "loss": 0.4346, + "step": 1116 + }, + { + "epoch": 2.7444717444717446, + "grad_norm": 0.22801344793802245, + "learning_rate": 4.735883424408015e-06, + "loss": 0.3956, + "step": 1117 + }, + { + "epoch": 2.746928746928747, + "grad_norm": 0.22155930655884537, + "learning_rate": 4.6903460837887065e-06, + "loss": 0.4215, + "step": 1118 + }, + { + "epoch": 2.749385749385749, + "grad_norm": 0.25029715906542904, + "learning_rate": 4.6448087431694e-06, + "loss": 0.488, + "step": 1119 + }, + { + "epoch": 2.751842751842752, + "grad_norm": 0.24274141839591315, + "learning_rate": 4.5992714025500915e-06, + "loss": 0.4195, + "step": 1120 + }, + { + "epoch": 2.7542997542997543, + "grad_norm": 0.23028544702814976, + "learning_rate": 4.553734061930783e-06, + "loss": 0.3788, + "step": 1121 + }, + { + "epoch": 2.756756756756757, + "grad_norm": 0.24916297952456937, + "learning_rate": 4.508196721311476e-06, + "loss": 0.4244, + "step": 1122 + }, + { + "epoch": 2.7592137592137593, + "grad_norm": 0.5209218145104284, + "learning_rate": 4.4626593806921675e-06, + "loss": 0.4771, + "step": 1123 + }, + { + "epoch": 2.7616707616707616, + "grad_norm": 0.2149644103008232, + "learning_rate": 4.41712204007286e-06, + "loss": 0.38, + "step": 1124 + }, + { + "epoch": 2.764127764127764, + "grad_norm": 3.4124629297354665, + "learning_rate": 4.371584699453552e-06, + "loss": 0.5629, + "step": 1125 + }, + { + "epoch": 2.7665847665847667, + "grad_norm": 0.2530450553088245, + "learning_rate": 4.326047358834244e-06, + "loss": 0.458, + "step": 1126 + }, + { + "epoch": 2.769041769041769, + "grad_norm": 0.24743507038028553, + "learning_rate": 4.280510018214937e-06, + "loss": 0.4901, + "step": 1127 + }, + { + "epoch": 2.7714987714987718, + "grad_norm": 0.24238514475994, + "learning_rate": 4.2349726775956285e-06, + "loss": 0.4958, + "step": 1128 + }, + { + "epoch": 2.773955773955774, + "grad_norm": 0.22170693726990862, + "learning_rate": 4.189435336976321e-06, + "loss": 0.3619, + "step": 1129 + }, + { + "epoch": 2.7764127764127764, + "grad_norm": 0.22200899039595443, + "learning_rate": 4.143897996357013e-06, + "loss": 0.3722, + "step": 1130 + }, + { + "epoch": 2.7788697788697787, + "grad_norm": 0.21557815228436708, + "learning_rate": 4.098360655737704e-06, + "loss": 0.4045, + "step": 1131 + }, + { + "epoch": 2.7813267813267815, + "grad_norm": 0.23101269211409964, + "learning_rate": 4.052823315118398e-06, + "loss": 0.3993, + "step": 1132 + }, + { + "epoch": 2.7837837837837838, + "grad_norm": 0.2506113219628401, + "learning_rate": 4.0072859744990895e-06, + "loss": 0.4394, + "step": 1133 + }, + { + "epoch": 2.786240786240786, + "grad_norm": 0.25829653944630426, + "learning_rate": 3.961748633879782e-06, + "loss": 0.4781, + "step": 1134 + }, + { + "epoch": 2.788697788697789, + "grad_norm": 0.2283437552109554, + "learning_rate": 3.916211293260474e-06, + "loss": 0.4199, + "step": 1135 + }, + { + "epoch": 2.791154791154791, + "grad_norm": 0.23261934599464665, + "learning_rate": 3.870673952641165e-06, + "loss": 0.4376, + "step": 1136 + }, + { + "epoch": 2.7936117936117935, + "grad_norm": 0.22397178290472075, + "learning_rate": 3.825136612021858e-06, + "loss": 0.3982, + "step": 1137 + }, + { + "epoch": 2.796068796068796, + "grad_norm": 0.2398679252310125, + "learning_rate": 3.77959927140255e-06, + "loss": 0.4303, + "step": 1138 + }, + { + "epoch": 2.7985257985257985, + "grad_norm": 0.2521861139284355, + "learning_rate": 3.7340619307832426e-06, + "loss": 0.3795, + "step": 1139 + }, + { + "epoch": 2.800982800982801, + "grad_norm": 0.22383712405363193, + "learning_rate": 3.6885245901639347e-06, + "loss": 0.4348, + "step": 1140 + }, + { + "epoch": 2.8034398034398036, + "grad_norm": 0.22946747083819977, + "learning_rate": 3.6429872495446273e-06, + "loss": 0.4206, + "step": 1141 + }, + { + "epoch": 2.805896805896806, + "grad_norm": 0.21677708187128097, + "learning_rate": 3.597449908925319e-06, + "loss": 0.4147, + "step": 1142 + }, + { + "epoch": 2.808353808353808, + "grad_norm": 0.21734633915684778, + "learning_rate": 3.551912568306011e-06, + "loss": 0.4093, + "step": 1143 + }, + { + "epoch": 2.810810810810811, + "grad_norm": 0.24321384485885375, + "learning_rate": 3.5063752276867036e-06, + "loss": 0.3783, + "step": 1144 + }, + { + "epoch": 2.8132678132678133, + "grad_norm": 0.24297335222493194, + "learning_rate": 3.4608378870673953e-06, + "loss": 0.5014, + "step": 1145 + }, + { + "epoch": 2.8157248157248156, + "grad_norm": 0.23892842770463382, + "learning_rate": 3.415300546448088e-06, + "loss": 0.4653, + "step": 1146 + }, + { + "epoch": 2.8181818181818183, + "grad_norm": 0.23160806679913898, + "learning_rate": 3.36976320582878e-06, + "loss": 0.422, + "step": 1147 + }, + { + "epoch": 2.8206388206388207, + "grad_norm": 0.22304524872709652, + "learning_rate": 3.3242258652094717e-06, + "loss": 0.3961, + "step": 1148 + }, + { + "epoch": 2.823095823095823, + "grad_norm": 0.2139600997939772, + "learning_rate": 3.278688524590164e-06, + "loss": 0.398, + "step": 1149 + }, + { + "epoch": 2.8255528255528253, + "grad_norm": 0.22956564153660272, + "learning_rate": 3.2331511839708563e-06, + "loss": 0.4151, + "step": 1150 + }, + { + "epoch": 2.828009828009828, + "grad_norm": 0.21826847683156242, + "learning_rate": 3.187613843351549e-06, + "loss": 0.3886, + "step": 1151 + }, + { + "epoch": 2.8304668304668303, + "grad_norm": 0.21141003742321426, + "learning_rate": 3.1420765027322406e-06, + "loss": 0.3906, + "step": 1152 + }, + { + "epoch": 2.832923832923833, + "grad_norm": 0.2357603696949525, + "learning_rate": 3.0965391621129327e-06, + "loss": 0.451, + "step": 1153 + }, + { + "epoch": 2.8353808353808354, + "grad_norm": 0.225812066393447, + "learning_rate": 3.051001821493625e-06, + "loss": 0.4064, + "step": 1154 + }, + { + "epoch": 2.8378378378378377, + "grad_norm": 0.21406448287602542, + "learning_rate": 3.005464480874317e-06, + "loss": 0.3674, + "step": 1155 + }, + { + "epoch": 2.84029484029484, + "grad_norm": 0.23290227939520636, + "learning_rate": 2.959927140255009e-06, + "loss": 0.4354, + "step": 1156 + }, + { + "epoch": 2.842751842751843, + "grad_norm": 0.2242096154461336, + "learning_rate": 2.9143897996357016e-06, + "loss": 0.4045, + "step": 1157 + }, + { + "epoch": 2.845208845208845, + "grad_norm": 0.23016845169620148, + "learning_rate": 2.8688524590163937e-06, + "loss": 0.4196, + "step": 1158 + }, + { + "epoch": 2.847665847665848, + "grad_norm": 0.21852096936661997, + "learning_rate": 2.823315118397086e-06, + "loss": 0.3982, + "step": 1159 + }, + { + "epoch": 2.85012285012285, + "grad_norm": 0.2108274165792627, + "learning_rate": 2.777777777777778e-06, + "loss": 0.3749, + "step": 1160 + }, + { + "epoch": 2.8525798525798525, + "grad_norm": 0.24160370753779067, + "learning_rate": 2.73224043715847e-06, + "loss": 0.4542, + "step": 1161 + }, + { + "epoch": 2.855036855036855, + "grad_norm": 1.6244239610624458, + "learning_rate": 2.686703096539162e-06, + "loss": 0.4585, + "step": 1162 + }, + { + "epoch": 2.8574938574938575, + "grad_norm": 0.22148267445895137, + "learning_rate": 2.6411657559198543e-06, + "loss": 0.3932, + "step": 1163 + }, + { + "epoch": 2.85995085995086, + "grad_norm": 0.23686699702884864, + "learning_rate": 2.595628415300547e-06, + "loss": 0.4043, + "step": 1164 + }, + { + "epoch": 2.8624078624078626, + "grad_norm": 0.2253190526682749, + "learning_rate": 2.550091074681239e-06, + "loss": 0.3865, + "step": 1165 + }, + { + "epoch": 2.864864864864865, + "grad_norm": 0.23283682010046694, + "learning_rate": 2.5045537340619306e-06, + "loss": 0.4241, + "step": 1166 + }, + { + "epoch": 2.8673218673218672, + "grad_norm": 0.22637501007112718, + "learning_rate": 2.459016393442623e-06, + "loss": 0.4535, + "step": 1167 + }, + { + "epoch": 2.8697788697788695, + "grad_norm": 0.23528102347160695, + "learning_rate": 2.4134790528233153e-06, + "loss": 0.485, + "step": 1168 + }, + { + "epoch": 2.8722358722358723, + "grad_norm": 0.23425484374934466, + "learning_rate": 2.3679417122040074e-06, + "loss": 0.4475, + "step": 1169 + }, + { + "epoch": 2.8746928746928746, + "grad_norm": 0.22149092071411625, + "learning_rate": 2.3224043715847e-06, + "loss": 0.4022, + "step": 1170 + }, + { + "epoch": 2.8771498771498774, + "grad_norm": 0.23427790871326182, + "learning_rate": 2.2768670309653916e-06, + "loss": 0.4638, + "step": 1171 + }, + { + "epoch": 2.8796068796068797, + "grad_norm": 0.23231854684157077, + "learning_rate": 2.2313296903460837e-06, + "loss": 0.4394, + "step": 1172 + }, + { + "epoch": 2.882063882063882, + "grad_norm": 0.2342789974677895, + "learning_rate": 2.185792349726776e-06, + "loss": 0.4825, + "step": 1173 + }, + { + "epoch": 2.8845208845208843, + "grad_norm": 0.22158002172153052, + "learning_rate": 2.1402550091074684e-06, + "loss": 0.386, + "step": 1174 + }, + { + "epoch": 2.886977886977887, + "grad_norm": 0.20696142757418035, + "learning_rate": 2.0947176684881605e-06, + "loss": 0.374, + "step": 1175 + }, + { + "epoch": 2.8894348894348894, + "grad_norm": 0.23895640881238192, + "learning_rate": 2.049180327868852e-06, + "loss": 0.4433, + "step": 1176 + }, + { + "epoch": 2.891891891891892, + "grad_norm": 0.21999586387865822, + "learning_rate": 2.0036429872495447e-06, + "loss": 0.3954, + "step": 1177 + }, + { + "epoch": 2.8943488943488944, + "grad_norm": 7.714103219123681, + "learning_rate": 1.958105646630237e-06, + "loss": 0.5093, + "step": 1178 + }, + { + "epoch": 2.8968058968058967, + "grad_norm": 0.20872227311945366, + "learning_rate": 1.912568306010929e-06, + "loss": 0.3708, + "step": 1179 + }, + { + "epoch": 2.899262899262899, + "grad_norm": 0.23835713585529297, + "learning_rate": 1.8670309653916213e-06, + "loss": 0.4284, + "step": 1180 + }, + { + "epoch": 2.901719901719902, + "grad_norm": 0.22864449909911705, + "learning_rate": 1.8214936247723136e-06, + "loss": 0.428, + "step": 1181 + }, + { + "epoch": 2.904176904176904, + "grad_norm": 0.2406324576550951, + "learning_rate": 1.7759562841530055e-06, + "loss": 0.4624, + "step": 1182 + }, + { + "epoch": 2.906633906633907, + "grad_norm": 0.23431139400422057, + "learning_rate": 1.7304189435336977e-06, + "loss": 0.4387, + "step": 1183 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 0.21843455420917768, + "learning_rate": 1.68488160291439e-06, + "loss": 0.4249, + "step": 1184 + }, + { + "epoch": 2.9115479115479115, + "grad_norm": 0.2095922735664185, + "learning_rate": 1.639344262295082e-06, + "loss": 0.395, + "step": 1185 + }, + { + "epoch": 2.914004914004914, + "grad_norm": 0.2269069829992154, + "learning_rate": 1.5938069216757744e-06, + "loss": 0.4245, + "step": 1186 + }, + { + "epoch": 2.9164619164619165, + "grad_norm": 0.2119797460187829, + "learning_rate": 1.5482695810564663e-06, + "loss": 0.3898, + "step": 1187 + }, + { + "epoch": 2.918918918918919, + "grad_norm": 0.23711678102007225, + "learning_rate": 1.5027322404371585e-06, + "loss": 0.4812, + "step": 1188 + }, + { + "epoch": 2.9213759213759216, + "grad_norm": 0.2191981863306329, + "learning_rate": 1.4571948998178508e-06, + "loss": 0.4322, + "step": 1189 + }, + { + "epoch": 2.923832923832924, + "grad_norm": 0.22589359686916735, + "learning_rate": 1.411657559198543e-06, + "loss": 0.3886, + "step": 1190 + }, + { + "epoch": 2.9262899262899262, + "grad_norm": 0.24232246496079973, + "learning_rate": 1.366120218579235e-06, + "loss": 0.4787, + "step": 1191 + }, + { + "epoch": 2.9287469287469285, + "grad_norm": 0.23337563262261982, + "learning_rate": 1.3205828779599271e-06, + "loss": 0.4604, + "step": 1192 + }, + { + "epoch": 2.9312039312039313, + "grad_norm": 0.22211870990294277, + "learning_rate": 1.2750455373406195e-06, + "loss": 0.4262, + "step": 1193 + }, + { + "epoch": 2.9336609336609336, + "grad_norm": 0.2299266125727697, + "learning_rate": 1.2295081967213116e-06, + "loss": 0.4976, + "step": 1194 + }, + { + "epoch": 2.9361179361179364, + "grad_norm": 0.2213619140432279, + "learning_rate": 1.1839708561020037e-06, + "loss": 0.4219, + "step": 1195 + }, + { + "epoch": 2.9385749385749387, + "grad_norm": 0.21746767504739525, + "learning_rate": 1.1384335154826958e-06, + "loss": 0.4351, + "step": 1196 + }, + { + "epoch": 2.941031941031941, + "grad_norm": 0.23322041860850679, + "learning_rate": 1.092896174863388e-06, + "loss": 0.4407, + "step": 1197 + }, + { + "epoch": 2.9434889434889433, + "grad_norm": 0.21436428057177767, + "learning_rate": 1.0473588342440803e-06, + "loss": 0.3869, + "step": 1198 + }, + { + "epoch": 2.945945945945946, + "grad_norm": 0.23891934387649086, + "learning_rate": 1.0018214936247724e-06, + "loss": 0.4645, + "step": 1199 + }, + { + "epoch": 2.9484029484029484, + "grad_norm": 0.2278055983373202, + "learning_rate": 9.562841530054645e-07, + "loss": 0.4638, + "step": 1200 + }, + { + "epoch": 2.950859950859951, + "grad_norm": 0.3776834168448288, + "learning_rate": 9.107468123861568e-07, + "loss": 0.3994, + "step": 1201 + }, + { + "epoch": 2.9533169533169534, + "grad_norm": 0.22934397142430324, + "learning_rate": 8.652094717668488e-07, + "loss": 0.4639, + "step": 1202 + }, + { + "epoch": 2.9557739557739557, + "grad_norm": 0.2063303716692425, + "learning_rate": 8.19672131147541e-07, + "loss": 0.3707, + "step": 1203 + }, + { + "epoch": 2.958230958230958, + "grad_norm": 0.2351672804832617, + "learning_rate": 7.741347905282332e-07, + "loss": 0.4168, + "step": 1204 + }, + { + "epoch": 2.960687960687961, + "grad_norm": 0.22161703857940737, + "learning_rate": 7.285974499089254e-07, + "loss": 0.4014, + "step": 1205 + }, + { + "epoch": 2.963144963144963, + "grad_norm": 0.3343193912872951, + "learning_rate": 6.830601092896175e-07, + "loss": 0.3978, + "step": 1206 + }, + { + "epoch": 2.965601965601966, + "grad_norm": 0.22304950387987088, + "learning_rate": 6.375227686703097e-07, + "loss": 0.4135, + "step": 1207 + }, + { + "epoch": 2.968058968058968, + "grad_norm": 0.20953255552793454, + "learning_rate": 5.919854280510018e-07, + "loss": 0.3811, + "step": 1208 + }, + { + "epoch": 2.9705159705159705, + "grad_norm": 0.2329344600488229, + "learning_rate": 5.46448087431694e-07, + "loss": 0.4343, + "step": 1209 + }, + { + "epoch": 2.972972972972973, + "grad_norm": 0.21299980955076325, + "learning_rate": 5.009107468123862e-07, + "loss": 0.3967, + "step": 1210 + }, + { + "epoch": 2.9754299754299756, + "grad_norm": 0.22514110450209485, + "learning_rate": 4.553734061930784e-07, + "loss": 0.4594, + "step": 1211 + }, + { + "epoch": 2.977886977886978, + "grad_norm": 0.22706086722772673, + "learning_rate": 4.098360655737705e-07, + "loss": 0.4452, + "step": 1212 + }, + { + "epoch": 2.98034398034398, + "grad_norm": 0.22420225612233982, + "learning_rate": 3.642987249544627e-07, + "loss": 0.451, + "step": 1213 + }, + { + "epoch": 2.982800982800983, + "grad_norm": 0.2278750274266437, + "learning_rate": 3.1876138433515486e-07, + "loss": 0.4298, + "step": 1214 + }, + { + "epoch": 2.9852579852579852, + "grad_norm": 0.22450421754608937, + "learning_rate": 2.73224043715847e-07, + "loss": 0.4549, + "step": 1215 + }, + { + "epoch": 2.9877149877149876, + "grad_norm": 0.24947496879816358, + "learning_rate": 2.276867030965392e-07, + "loss": 0.4612, + "step": 1216 + }, + { + "epoch": 2.9901719901719903, + "grad_norm": 0.22458707836271088, + "learning_rate": 1.8214936247723135e-07, + "loss": 0.4051, + "step": 1217 + }, + { + "epoch": 2.9926289926289926, + "grad_norm": 0.21227609068441602, + "learning_rate": 1.366120218579235e-07, + "loss": 0.3807, + "step": 1218 + }, + { + "epoch": 2.995085995085995, + "grad_norm": 0.22465026975771382, + "learning_rate": 9.107468123861567e-08, + "loss": 0.4377, + "step": 1219 + }, + { + "epoch": 2.9975429975429977, + "grad_norm": 0.22388402187392278, + "learning_rate": 4.553734061930784e-08, + "loss": 0.4133, + "step": 1220 + }, + { + "epoch": 3.0, + "grad_norm": 0.2335653945488078, + "learning_rate": 0.0, + "loss": 0.4107, + "step": 1221 + }, + { + "epoch": 3.0, + "step": 1221, + "total_flos": 1.0279209431224812e+18, + "train_loss": 0.6874593345968573, + "train_runtime": 70837.5294, + "train_samples_per_second": 0.275, + "train_steps_per_second": 0.017 + } + ], + "logging_steps": 1, + "max_steps": 1221, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.0279209431224812e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}