{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1401, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021413276231263384, "grad_norm": 56.592826307068876, "learning_rate": 3.546099290780142e-07, "loss": 3.0396, "step": 1 }, { "epoch": 0.004282655246252677, "grad_norm": 60.92994210864957, "learning_rate": 7.092198581560284e-07, "loss": 3.4422, "step": 2 }, { "epoch": 0.006423982869379015, "grad_norm": 58.66990590000166, "learning_rate": 1.0638297872340427e-06, "loss": 3.2284, "step": 3 }, { "epoch": 0.008565310492505354, "grad_norm": 57.90854902052254, "learning_rate": 1.4184397163120568e-06, "loss": 3.1414, "step": 4 }, { "epoch": 0.010706638115631691, "grad_norm": 58.311165642100505, "learning_rate": 1.7730496453900712e-06, "loss": 3.2059, "step": 5 }, { "epoch": 0.01284796573875803, "grad_norm": 56.80687951003449, "learning_rate": 2.1276595744680853e-06, "loss": 3.1799, "step": 6 }, { "epoch": 0.014989293361884369, "grad_norm": 47.7995425239194, "learning_rate": 2.4822695035460995e-06, "loss": 2.7636, "step": 7 }, { "epoch": 0.017130620985010708, "grad_norm": 49.227409613395615, "learning_rate": 2.8368794326241136e-06, "loss": 2.8918, "step": 8 }, { "epoch": 0.019271948608137045, "grad_norm": 31.314007438717645, "learning_rate": 3.1914893617021277e-06, "loss": 2.1702, "step": 9 }, { "epoch": 0.021413276231263382, "grad_norm": 24.736226056643922, "learning_rate": 3.5460992907801423e-06, "loss": 1.9433, "step": 10 }, { "epoch": 0.023554603854389723, "grad_norm": 5.183793675647891, "learning_rate": 3.9007092198581565e-06, "loss": 1.2843, "step": 11 }, { "epoch": 0.02569593147751606, "grad_norm": 4.360295224010352, "learning_rate": 4.255319148936171e-06, "loss": 1.2718, "step": 12 }, { "epoch": 0.027837259100642397, "grad_norm": 3.516837280046668, "learning_rate": 4.609929078014184e-06, "loss": 1.1784, "step": 13 }, { "epoch": 0.029978586723768737, "grad_norm": 2.9770123258300263, "learning_rate": 4.964539007092199e-06, "loss": 1.2087, "step": 14 }, { "epoch": 0.032119914346895075, "grad_norm": 3.371301698123799, "learning_rate": 5.319148936170213e-06, "loss": 1.1044, "step": 15 }, { "epoch": 0.034261241970021415, "grad_norm": 2.4929900637756104, "learning_rate": 5.673758865248227e-06, "loss": 1.0305, "step": 16 }, { "epoch": 0.03640256959314775, "grad_norm": 2.3245385930370683, "learning_rate": 6.028368794326241e-06, "loss": 1.0328, "step": 17 }, { "epoch": 0.03854389721627409, "grad_norm": 2.1494749257738106, "learning_rate": 6.3829787234042555e-06, "loss": 0.9676, "step": 18 }, { "epoch": 0.04068522483940043, "grad_norm": 1.7068403808367762, "learning_rate": 6.73758865248227e-06, "loss": 0.9568, "step": 19 }, { "epoch": 0.042826552462526764, "grad_norm": 1.3157208379186793, "learning_rate": 7.092198581560285e-06, "loss": 0.8919, "step": 20 }, { "epoch": 0.044967880085653104, "grad_norm": 1.5755739795069426, "learning_rate": 7.446808510638298e-06, "loss": 0.8786, "step": 21 }, { "epoch": 0.047109207708779445, "grad_norm": 1.4066443822922425, "learning_rate": 7.801418439716313e-06, "loss": 0.8728, "step": 22 }, { "epoch": 0.04925053533190578, "grad_norm": 1.232711682082585, "learning_rate": 8.156028368794328e-06, "loss": 0.8089, "step": 23 }, { "epoch": 0.05139186295503212, "grad_norm": 1.0965349661030956, "learning_rate": 8.510638297872341e-06, "loss": 0.8199, "step": 24 }, { "epoch": 0.05353319057815846, "grad_norm": 0.779920509833393, "learning_rate": 8.865248226950355e-06, "loss": 0.8012, "step": 25 }, { "epoch": 0.055674518201284794, "grad_norm": 0.7514108454773875, "learning_rate": 9.219858156028368e-06, "loss": 0.7586, "step": 26 }, { "epoch": 0.057815845824411134, "grad_norm": 0.8086762191971834, "learning_rate": 9.574468085106383e-06, "loss": 0.7487, "step": 27 }, { "epoch": 0.059957173447537475, "grad_norm": 0.8878919709734377, "learning_rate": 9.929078014184398e-06, "loss": 0.7034, "step": 28 }, { "epoch": 0.06209850107066381, "grad_norm": 0.829679313012889, "learning_rate": 1.0283687943262411e-05, "loss": 0.7474, "step": 29 }, { "epoch": 0.06423982869379015, "grad_norm": 0.6626872942809816, "learning_rate": 1.0638297872340426e-05, "loss": 0.7349, "step": 30 }, { "epoch": 0.06638115631691649, "grad_norm": 0.6753492576166917, "learning_rate": 1.0992907801418441e-05, "loss": 0.7171, "step": 31 }, { "epoch": 0.06852248394004283, "grad_norm": 0.6704627449665963, "learning_rate": 1.1347517730496454e-05, "loss": 0.7126, "step": 32 }, { "epoch": 0.07066381156316917, "grad_norm": 0.5562388268523989, "learning_rate": 1.170212765957447e-05, "loss": 0.6868, "step": 33 }, { "epoch": 0.0728051391862955, "grad_norm": 0.6223987828107421, "learning_rate": 1.2056737588652483e-05, "loss": 0.6845, "step": 34 }, { "epoch": 0.07494646680942184, "grad_norm": 0.5778135109852265, "learning_rate": 1.2411347517730498e-05, "loss": 0.6766, "step": 35 }, { "epoch": 0.07708779443254818, "grad_norm": 0.4742088939687226, "learning_rate": 1.2765957446808511e-05, "loss": 0.6522, "step": 36 }, { "epoch": 0.07922912205567452, "grad_norm": 0.4554617236825081, "learning_rate": 1.3120567375886524e-05, "loss": 0.6551, "step": 37 }, { "epoch": 0.08137044967880086, "grad_norm": 0.49432958372427405, "learning_rate": 1.347517730496454e-05, "loss": 0.6351, "step": 38 }, { "epoch": 0.0835117773019272, "grad_norm": 0.440896644761752, "learning_rate": 1.3829787234042554e-05, "loss": 0.62, "step": 39 }, { "epoch": 0.08565310492505353, "grad_norm": 0.3847657975966914, "learning_rate": 1.418439716312057e-05, "loss": 0.6449, "step": 40 }, { "epoch": 0.08779443254817987, "grad_norm": 0.361109158517152, "learning_rate": 1.4539007092198581e-05, "loss": 0.6603, "step": 41 }, { "epoch": 0.08993576017130621, "grad_norm": 0.35567536401833566, "learning_rate": 1.4893617021276596e-05, "loss": 0.6005, "step": 42 }, { "epoch": 0.09207708779443255, "grad_norm": 0.3656052483807157, "learning_rate": 1.5248226950354611e-05, "loss": 0.6126, "step": 43 }, { "epoch": 0.09421841541755889, "grad_norm": 0.33705396501786156, "learning_rate": 1.5602836879432626e-05, "loss": 0.6066, "step": 44 }, { "epoch": 0.09635974304068523, "grad_norm": 0.30303119503903936, "learning_rate": 1.595744680851064e-05, "loss": 0.6171, "step": 45 }, { "epoch": 0.09850107066381156, "grad_norm": 0.2967613905439793, "learning_rate": 1.6312056737588656e-05, "loss": 0.608, "step": 46 }, { "epoch": 0.1006423982869379, "grad_norm": 0.3767161988103767, "learning_rate": 1.6666666666666667e-05, "loss": 0.6171, "step": 47 }, { "epoch": 0.10278372591006424, "grad_norm": 0.314876767137896, "learning_rate": 1.7021276595744682e-05, "loss": 0.5786, "step": 48 }, { "epoch": 0.10492505353319058, "grad_norm": 0.2767623351226321, "learning_rate": 1.7375886524822697e-05, "loss": 0.6053, "step": 49 }, { "epoch": 0.10706638115631692, "grad_norm": 0.29425544708240703, "learning_rate": 1.773049645390071e-05, "loss": 0.5932, "step": 50 }, { "epoch": 0.10920770877944326, "grad_norm": 0.2867315363666657, "learning_rate": 1.8085106382978724e-05, "loss": 0.5842, "step": 51 }, { "epoch": 0.11134903640256959, "grad_norm": 0.2585858531022653, "learning_rate": 1.8439716312056736e-05, "loss": 0.5853, "step": 52 }, { "epoch": 0.11349036402569593, "grad_norm": 0.2655264874976756, "learning_rate": 1.879432624113475e-05, "loss": 0.58, "step": 53 }, { "epoch": 0.11563169164882227, "grad_norm": 0.30355375200055923, "learning_rate": 1.9148936170212766e-05, "loss": 0.5891, "step": 54 }, { "epoch": 0.11777301927194861, "grad_norm": 0.30060619559855645, "learning_rate": 1.950354609929078e-05, "loss": 0.5657, "step": 55 }, { "epoch": 0.11991434689507495, "grad_norm": 0.2544001308153972, "learning_rate": 1.9858156028368796e-05, "loss": 0.6008, "step": 56 }, { "epoch": 0.12205567451820129, "grad_norm": 0.25996965782644, "learning_rate": 2.0212765957446807e-05, "loss": 0.5703, "step": 57 }, { "epoch": 0.12419700214132762, "grad_norm": 0.2895469648849053, "learning_rate": 2.0567375886524822e-05, "loss": 0.5817, "step": 58 }, { "epoch": 0.12633832976445397, "grad_norm": 0.25922509784815095, "learning_rate": 2.0921985815602837e-05, "loss": 0.5665, "step": 59 }, { "epoch": 0.1284796573875803, "grad_norm": 0.26935959494463335, "learning_rate": 2.1276595744680852e-05, "loss": 0.5634, "step": 60 }, { "epoch": 0.13062098501070663, "grad_norm": 0.2472890335094329, "learning_rate": 2.1631205673758867e-05, "loss": 0.5489, "step": 61 }, { "epoch": 0.13276231263383298, "grad_norm": 0.25220403853856466, "learning_rate": 2.1985815602836882e-05, "loss": 0.5555, "step": 62 }, { "epoch": 0.1349036402569593, "grad_norm": 0.25227853159521985, "learning_rate": 2.2340425531914894e-05, "loss": 0.5977, "step": 63 }, { "epoch": 0.13704496788008566, "grad_norm": 0.22662601243259486, "learning_rate": 2.269503546099291e-05, "loss": 0.5481, "step": 64 }, { "epoch": 0.139186295503212, "grad_norm": 0.2462259567482875, "learning_rate": 2.3049645390070924e-05, "loss": 0.5628, "step": 65 }, { "epoch": 0.14132762312633834, "grad_norm": 0.24332397161016522, "learning_rate": 2.340425531914894e-05, "loss": 0.5553, "step": 66 }, { "epoch": 0.14346895074946467, "grad_norm": 0.23477707463573774, "learning_rate": 2.3758865248226954e-05, "loss": 0.5621, "step": 67 }, { "epoch": 0.145610278372591, "grad_norm": 0.2500380042146856, "learning_rate": 2.4113475177304965e-05, "loss": 0.5661, "step": 68 }, { "epoch": 0.14775160599571735, "grad_norm": 0.26380612632753875, "learning_rate": 2.446808510638298e-05, "loss": 0.5461, "step": 69 }, { "epoch": 0.14989293361884368, "grad_norm": 0.2557903047322243, "learning_rate": 2.4822695035460995e-05, "loss": 0.5506, "step": 70 }, { "epoch": 0.15203426124197003, "grad_norm": 0.25820215490499565, "learning_rate": 2.5177304964539007e-05, "loss": 0.5478, "step": 71 }, { "epoch": 0.15417558886509636, "grad_norm": 0.24758155471819995, "learning_rate": 2.5531914893617022e-05, "loss": 0.5518, "step": 72 }, { "epoch": 0.15631691648822268, "grad_norm": 0.2415443149878719, "learning_rate": 2.5886524822695034e-05, "loss": 0.5505, "step": 73 }, { "epoch": 0.15845824411134904, "grad_norm": 0.24823560199460706, "learning_rate": 2.624113475177305e-05, "loss": 0.5665, "step": 74 }, { "epoch": 0.16059957173447537, "grad_norm": 0.23378147422826043, "learning_rate": 2.6595744680851064e-05, "loss": 0.527, "step": 75 }, { "epoch": 0.16274089935760172, "grad_norm": 0.23375799268334566, "learning_rate": 2.695035460992908e-05, "loss": 0.5522, "step": 76 }, { "epoch": 0.16488222698072805, "grad_norm": 0.27046995095800896, "learning_rate": 2.7304964539007094e-05, "loss": 0.5317, "step": 77 }, { "epoch": 0.1670235546038544, "grad_norm": 0.22869648794405983, "learning_rate": 2.765957446808511e-05, "loss": 0.5488, "step": 78 }, { "epoch": 0.16916488222698073, "grad_norm": 0.2144302125474379, "learning_rate": 2.8014184397163124e-05, "loss": 0.536, "step": 79 }, { "epoch": 0.17130620985010706, "grad_norm": 0.2573958870247804, "learning_rate": 2.836879432624114e-05, "loss": 0.5261, "step": 80 }, { "epoch": 0.1734475374732334, "grad_norm": 0.2613906182039433, "learning_rate": 2.8723404255319154e-05, "loss": 0.5418, "step": 81 }, { "epoch": 0.17558886509635974, "grad_norm": 0.2314728596697455, "learning_rate": 2.9078014184397162e-05, "loss": 0.5258, "step": 82 }, { "epoch": 0.1777301927194861, "grad_norm": 0.255469146374909, "learning_rate": 2.9432624113475177e-05, "loss": 0.5358, "step": 83 }, { "epoch": 0.17987152034261242, "grad_norm": 0.269758850025105, "learning_rate": 2.9787234042553192e-05, "loss": 0.5467, "step": 84 }, { "epoch": 0.18201284796573874, "grad_norm": 0.22108427720048224, "learning_rate": 3.0141843971631207e-05, "loss": 0.5138, "step": 85 }, { "epoch": 0.1841541755888651, "grad_norm": 0.2918049155831516, "learning_rate": 3.0496453900709222e-05, "loss": 0.5358, "step": 86 }, { "epoch": 0.18629550321199143, "grad_norm": 0.252878303302326, "learning_rate": 3.085106382978723e-05, "loss": 0.5164, "step": 87 }, { "epoch": 0.18843683083511778, "grad_norm": 0.28867004146757574, "learning_rate": 3.120567375886525e-05, "loss": 0.54, "step": 88 }, { "epoch": 0.1905781584582441, "grad_norm": 0.30841177860368885, "learning_rate": 3.156028368794326e-05, "loss": 0.5271, "step": 89 }, { "epoch": 0.19271948608137046, "grad_norm": 0.29805101294833053, "learning_rate": 3.191489361702128e-05, "loss": 0.5186, "step": 90 }, { "epoch": 0.1948608137044968, "grad_norm": 0.29287452022732885, "learning_rate": 3.226950354609929e-05, "loss": 0.5288, "step": 91 }, { "epoch": 0.19700214132762311, "grad_norm": 0.2892573734733911, "learning_rate": 3.262411347517731e-05, "loss": 0.5325, "step": 92 }, { "epoch": 0.19914346895074947, "grad_norm": 0.241170324930527, "learning_rate": 3.2978723404255317e-05, "loss": 0.5069, "step": 93 }, { "epoch": 0.2012847965738758, "grad_norm": 0.28618356988432964, "learning_rate": 3.3333333333333335e-05, "loss": 0.528, "step": 94 }, { "epoch": 0.20342612419700215, "grad_norm": 0.2706127494448774, "learning_rate": 3.3687943262411347e-05, "loss": 0.5137, "step": 95 }, { "epoch": 0.20556745182012848, "grad_norm": 0.2920030135213235, "learning_rate": 3.4042553191489365e-05, "loss": 0.5279, "step": 96 }, { "epoch": 0.20770877944325483, "grad_norm": 0.2481402426704902, "learning_rate": 3.4397163120567377e-05, "loss": 0.5001, "step": 97 }, { "epoch": 0.20985010706638116, "grad_norm": 0.27424390116478947, "learning_rate": 3.4751773049645395e-05, "loss": 0.4932, "step": 98 }, { "epoch": 0.21199143468950749, "grad_norm": 0.29648670562970614, "learning_rate": 3.5106382978723407e-05, "loss": 0.529, "step": 99 }, { "epoch": 0.21413276231263384, "grad_norm": 0.2457515803836918, "learning_rate": 3.546099290780142e-05, "loss": 0.5178, "step": 100 }, { "epoch": 0.21627408993576017, "grad_norm": 0.29926928267286307, "learning_rate": 3.5815602836879437e-05, "loss": 0.5224, "step": 101 }, { "epoch": 0.21841541755888652, "grad_norm": 0.24321396233390155, "learning_rate": 3.617021276595745e-05, "loss": 0.5296, "step": 102 }, { "epoch": 0.22055674518201285, "grad_norm": 0.30921888959439514, "learning_rate": 3.6524822695035466e-05, "loss": 0.5179, "step": 103 }, { "epoch": 0.22269807280513917, "grad_norm": 0.272063115950373, "learning_rate": 3.687943262411347e-05, "loss": 0.5283, "step": 104 }, { "epoch": 0.22483940042826553, "grad_norm": 0.26613186863197535, "learning_rate": 3.723404255319149e-05, "loss": 0.5242, "step": 105 }, { "epoch": 0.22698072805139186, "grad_norm": 0.33500951074808644, "learning_rate": 3.75886524822695e-05, "loss": 0.5174, "step": 106 }, { "epoch": 0.2291220556745182, "grad_norm": 0.2643772639229429, "learning_rate": 3.794326241134752e-05, "loss": 0.5343, "step": 107 }, { "epoch": 0.23126338329764454, "grad_norm": 0.31115272338829114, "learning_rate": 3.829787234042553e-05, "loss": 0.5052, "step": 108 }, { "epoch": 0.2334047109207709, "grad_norm": 0.2926558130416796, "learning_rate": 3.865248226950355e-05, "loss": 0.5137, "step": 109 }, { "epoch": 0.23554603854389722, "grad_norm": 0.3255536325363357, "learning_rate": 3.900709219858156e-05, "loss": 0.5279, "step": 110 }, { "epoch": 0.23768736616702354, "grad_norm": 0.33989084909132006, "learning_rate": 3.936170212765958e-05, "loss": 0.4988, "step": 111 }, { "epoch": 0.2398286937901499, "grad_norm": 0.35381236194602544, "learning_rate": 3.971631205673759e-05, "loss": 0.4937, "step": 112 }, { "epoch": 0.24197002141327623, "grad_norm": 0.3510604214761894, "learning_rate": 4.007092198581561e-05, "loss": 0.5178, "step": 113 }, { "epoch": 0.24411134903640258, "grad_norm": 0.2974753747795791, "learning_rate": 4.0425531914893614e-05, "loss": 0.5006, "step": 114 }, { "epoch": 0.2462526766595289, "grad_norm": 0.3391358705953081, "learning_rate": 4.078014184397163e-05, "loss": 0.4991, "step": 115 }, { "epoch": 0.24839400428265523, "grad_norm": 0.3244454146866931, "learning_rate": 4.1134751773049644e-05, "loss": 0.4992, "step": 116 }, { "epoch": 0.2505353319057816, "grad_norm": 0.29806045444607565, "learning_rate": 4.148936170212766e-05, "loss": 0.4879, "step": 117 }, { "epoch": 0.25267665952890794, "grad_norm": 0.34001472132830457, "learning_rate": 4.1843971631205674e-05, "loss": 0.5142, "step": 118 }, { "epoch": 0.25481798715203424, "grad_norm": 0.3147825744118216, "learning_rate": 4.219858156028369e-05, "loss": 0.4975, "step": 119 }, { "epoch": 0.2569593147751606, "grad_norm": 0.343655564131983, "learning_rate": 4.2553191489361704e-05, "loss": 0.5169, "step": 120 }, { "epoch": 0.25910064239828695, "grad_norm": 0.3165564039834663, "learning_rate": 4.2907801418439716e-05, "loss": 0.5038, "step": 121 }, { "epoch": 0.26124197002141325, "grad_norm": 0.3421038891467151, "learning_rate": 4.3262411347517734e-05, "loss": 0.5044, "step": 122 }, { "epoch": 0.2633832976445396, "grad_norm": 0.3236382619911156, "learning_rate": 4.3617021276595746e-05, "loss": 0.5077, "step": 123 }, { "epoch": 0.26552462526766596, "grad_norm": 0.3482419500485224, "learning_rate": 4.3971631205673764e-05, "loss": 0.4822, "step": 124 }, { "epoch": 0.2676659528907923, "grad_norm": 0.3967980846994073, "learning_rate": 4.432624113475177e-05, "loss": 0.4947, "step": 125 }, { "epoch": 0.2698072805139186, "grad_norm": 0.29467264961379613, "learning_rate": 4.468085106382979e-05, "loss": 0.4871, "step": 126 }, { "epoch": 0.27194860813704497, "grad_norm": 0.35596207729536755, "learning_rate": 4.50354609929078e-05, "loss": 0.4831, "step": 127 }, { "epoch": 0.2740899357601713, "grad_norm": 0.37395057401049897, "learning_rate": 4.539007092198582e-05, "loss": 0.5122, "step": 128 }, { "epoch": 0.2762312633832976, "grad_norm": 0.28605544378420467, "learning_rate": 4.574468085106383e-05, "loss": 0.4898, "step": 129 }, { "epoch": 0.278372591006424, "grad_norm": 0.3331655327731713, "learning_rate": 4.609929078014185e-05, "loss": 0.4912, "step": 130 }, { "epoch": 0.28051391862955033, "grad_norm": 0.29374454897662605, "learning_rate": 4.645390070921986e-05, "loss": 0.4814, "step": 131 }, { "epoch": 0.2826552462526767, "grad_norm": 0.30113962440816366, "learning_rate": 4.680851063829788e-05, "loss": 0.4856, "step": 132 }, { "epoch": 0.284796573875803, "grad_norm": 0.30027399950187866, "learning_rate": 4.716312056737589e-05, "loss": 0.4852, "step": 133 }, { "epoch": 0.28693790149892934, "grad_norm": 0.3230425204774485, "learning_rate": 4.751773049645391e-05, "loss": 0.4903, "step": 134 }, { "epoch": 0.2890792291220557, "grad_norm": 0.31234670755297617, "learning_rate": 4.787234042553192e-05, "loss": 0.5041, "step": 135 }, { "epoch": 0.291220556745182, "grad_norm": 0.30804022118995605, "learning_rate": 4.822695035460993e-05, "loss": 0.4818, "step": 136 }, { "epoch": 0.29336188436830835, "grad_norm": 0.3524834300849378, "learning_rate": 4.858156028368794e-05, "loss": 0.4935, "step": 137 }, { "epoch": 0.2955032119914347, "grad_norm": 0.3743768713148858, "learning_rate": 4.893617021276596e-05, "loss": 0.5098, "step": 138 }, { "epoch": 0.29764453961456105, "grad_norm": 0.34028691668800937, "learning_rate": 4.929078014184397e-05, "loss": 0.4935, "step": 139 }, { "epoch": 0.29978586723768735, "grad_norm": 0.3432662688866466, "learning_rate": 4.964539007092199e-05, "loss": 0.4906, "step": 140 }, { "epoch": 0.3019271948608137, "grad_norm": 0.3484159957729913, "learning_rate": 5e-05, "loss": 0.4916, "step": 141 }, { "epoch": 0.30406852248394006, "grad_norm": 0.2906011601786296, "learning_rate": 4.996031746031746e-05, "loss": 0.4851, "step": 142 }, { "epoch": 0.30620985010706636, "grad_norm": 0.31602752642817156, "learning_rate": 4.9920634920634924e-05, "loss": 0.4804, "step": 143 }, { "epoch": 0.3083511777301927, "grad_norm": 0.33931054385910697, "learning_rate": 4.9880952380952385e-05, "loss": 0.4956, "step": 144 }, { "epoch": 0.31049250535331907, "grad_norm": 0.2673250220712651, "learning_rate": 4.9841269841269845e-05, "loss": 0.474, "step": 145 }, { "epoch": 0.31263383297644537, "grad_norm": 0.27590244134076686, "learning_rate": 4.9801587301587306e-05, "loss": 0.4652, "step": 146 }, { "epoch": 0.3147751605995717, "grad_norm": 0.29339636329684166, "learning_rate": 4.976190476190477e-05, "loss": 0.4804, "step": 147 }, { "epoch": 0.3169164882226981, "grad_norm": 0.2842194678371775, "learning_rate": 4.972222222222223e-05, "loss": 0.4884, "step": 148 }, { "epoch": 0.31905781584582443, "grad_norm": 0.3177841927362518, "learning_rate": 4.968253968253969e-05, "loss": 0.4887, "step": 149 }, { "epoch": 0.32119914346895073, "grad_norm": 0.362994905525864, "learning_rate": 4.964285714285715e-05, "loss": 0.4957, "step": 150 }, { "epoch": 0.3233404710920771, "grad_norm": 0.3134548549120884, "learning_rate": 4.960317460317461e-05, "loss": 0.4746, "step": 151 }, { "epoch": 0.32548179871520344, "grad_norm": 0.33022141318925424, "learning_rate": 4.956349206349207e-05, "loss": 0.4836, "step": 152 }, { "epoch": 0.32762312633832974, "grad_norm": 0.2841374060809414, "learning_rate": 4.9523809523809525e-05, "loss": 0.4824, "step": 153 }, { "epoch": 0.3297644539614561, "grad_norm": 0.3042451933872161, "learning_rate": 4.9484126984126985e-05, "loss": 0.4947, "step": 154 }, { "epoch": 0.33190578158458245, "grad_norm": 0.3081245822778917, "learning_rate": 4.9444444444444446e-05, "loss": 0.4882, "step": 155 }, { "epoch": 0.3340471092077088, "grad_norm": 0.2888403289180772, "learning_rate": 4.940476190476191e-05, "loss": 0.4583, "step": 156 }, { "epoch": 0.3361884368308351, "grad_norm": 0.49151318304484476, "learning_rate": 4.936507936507937e-05, "loss": 0.5023, "step": 157 }, { "epoch": 0.33832976445396146, "grad_norm": 0.33926947413240066, "learning_rate": 4.932539682539683e-05, "loss": 0.5, "step": 158 }, { "epoch": 0.3404710920770878, "grad_norm": 0.3432037588535958, "learning_rate": 4.928571428571429e-05, "loss": 0.4892, "step": 159 }, { "epoch": 0.3426124197002141, "grad_norm": 0.33357704996030413, "learning_rate": 4.924603174603175e-05, "loss": 0.483, "step": 160 }, { "epoch": 0.34475374732334046, "grad_norm": 0.3455404282494954, "learning_rate": 4.9206349206349204e-05, "loss": 0.502, "step": 161 }, { "epoch": 0.3468950749464668, "grad_norm": 0.3799400968283991, "learning_rate": 4.9166666666666665e-05, "loss": 0.5005, "step": 162 }, { "epoch": 0.3490364025695932, "grad_norm": 0.28633882509130054, "learning_rate": 4.9126984126984125e-05, "loss": 0.5012, "step": 163 }, { "epoch": 0.3511777301927195, "grad_norm": 0.3692568848737784, "learning_rate": 4.9087301587301586e-05, "loss": 0.4773, "step": 164 }, { "epoch": 0.3533190578158458, "grad_norm": 0.28424265463523657, "learning_rate": 4.904761904761905e-05, "loss": 0.4933, "step": 165 }, { "epoch": 0.3554603854389722, "grad_norm": 0.3433785169927177, "learning_rate": 4.900793650793651e-05, "loss": 0.4708, "step": 166 }, { "epoch": 0.3576017130620985, "grad_norm": 0.322472121983474, "learning_rate": 4.896825396825397e-05, "loss": 0.484, "step": 167 }, { "epoch": 0.35974304068522484, "grad_norm": 0.37196647339980105, "learning_rate": 4.892857142857143e-05, "loss": 0.4872, "step": 168 }, { "epoch": 0.3618843683083512, "grad_norm": 0.3144695427706762, "learning_rate": 4.888888888888889e-05, "loss": 0.4909, "step": 169 }, { "epoch": 0.3640256959314775, "grad_norm": 0.3399481737215869, "learning_rate": 4.884920634920635e-05, "loss": 0.4851, "step": 170 }, { "epoch": 0.36616702355460384, "grad_norm": 0.30151182574451896, "learning_rate": 4.880952380952381e-05, "loss": 0.4772, "step": 171 }, { "epoch": 0.3683083511777302, "grad_norm": 0.3128577892502514, "learning_rate": 4.876984126984127e-05, "loss": 0.4848, "step": 172 }, { "epoch": 0.37044967880085655, "grad_norm": 0.31922897506045883, "learning_rate": 4.873015873015873e-05, "loss": 0.4986, "step": 173 }, { "epoch": 0.37259100642398285, "grad_norm": 0.3017606389198961, "learning_rate": 4.8690476190476194e-05, "loss": 0.4828, "step": 174 }, { "epoch": 0.3747323340471092, "grad_norm": 0.3000491473588088, "learning_rate": 4.8650793650793654e-05, "loss": 0.4752, "step": 175 }, { "epoch": 0.37687366167023556, "grad_norm": 0.3892869618739201, "learning_rate": 4.8611111111111115e-05, "loss": 0.4892, "step": 176 }, { "epoch": 0.37901498929336186, "grad_norm": 0.29614553607421257, "learning_rate": 4.8571428571428576e-05, "loss": 0.5018, "step": 177 }, { "epoch": 0.3811563169164882, "grad_norm": 0.3790956924695203, "learning_rate": 4.853174603174604e-05, "loss": 0.5092, "step": 178 }, { "epoch": 0.38329764453961457, "grad_norm": 0.3316734586371452, "learning_rate": 4.84920634920635e-05, "loss": 0.482, "step": 179 }, { "epoch": 0.3854389721627409, "grad_norm": 0.3404880131426743, "learning_rate": 4.845238095238095e-05, "loss": 0.4981, "step": 180 }, { "epoch": 0.3875802997858672, "grad_norm": 0.2900095290587479, "learning_rate": 4.841269841269841e-05, "loss": 0.4668, "step": 181 }, { "epoch": 0.3897216274089936, "grad_norm": 0.2922900867382386, "learning_rate": 4.837301587301587e-05, "loss": 0.4667, "step": 182 }, { "epoch": 0.39186295503211993, "grad_norm": 0.3181046448048784, "learning_rate": 4.8333333333333334e-05, "loss": 0.4804, "step": 183 }, { "epoch": 0.39400428265524623, "grad_norm": 0.27687702877774883, "learning_rate": 4.8293650793650794e-05, "loss": 0.4775, "step": 184 }, { "epoch": 0.3961456102783726, "grad_norm": 0.2836321426330953, "learning_rate": 4.8253968253968255e-05, "loss": 0.4725, "step": 185 }, { "epoch": 0.39828693790149894, "grad_norm": 0.2913866873418313, "learning_rate": 4.8214285714285716e-05, "loss": 0.4775, "step": 186 }, { "epoch": 0.4004282655246253, "grad_norm": 0.30835624032708503, "learning_rate": 4.817460317460318e-05, "loss": 0.4878, "step": 187 }, { "epoch": 0.4025695931477516, "grad_norm": 0.29588677700901966, "learning_rate": 4.813492063492064e-05, "loss": 0.4732, "step": 188 }, { "epoch": 0.40471092077087795, "grad_norm": 0.30881963199698537, "learning_rate": 4.80952380952381e-05, "loss": 0.4598, "step": 189 }, { "epoch": 0.4068522483940043, "grad_norm": 0.30298345716545283, "learning_rate": 4.805555555555556e-05, "loss": 0.4717, "step": 190 }, { "epoch": 0.4089935760171306, "grad_norm": 0.35885982262554333, "learning_rate": 4.801587301587302e-05, "loss": 0.4752, "step": 191 }, { "epoch": 0.41113490364025695, "grad_norm": 0.3004713627687736, "learning_rate": 4.797619047619048e-05, "loss": 0.4674, "step": 192 }, { "epoch": 0.4132762312633833, "grad_norm": 0.29124273016906205, "learning_rate": 4.793650793650794e-05, "loss": 0.4883, "step": 193 }, { "epoch": 0.41541755888650966, "grad_norm": 0.28170083463492585, "learning_rate": 4.78968253968254e-05, "loss": 0.4808, "step": 194 }, { "epoch": 0.41755888650963596, "grad_norm": 0.2930248614401525, "learning_rate": 4.785714285714286e-05, "loss": 0.4614, "step": 195 }, { "epoch": 0.4197002141327623, "grad_norm": 0.3725197913239573, "learning_rate": 4.781746031746032e-05, "loss": 0.466, "step": 196 }, { "epoch": 0.42184154175588867, "grad_norm": 0.3173770734521219, "learning_rate": 4.7777777777777784e-05, "loss": 0.4839, "step": 197 }, { "epoch": 0.42398286937901497, "grad_norm": 0.31792822631775125, "learning_rate": 4.7738095238095245e-05, "loss": 0.4604, "step": 198 }, { "epoch": 0.4261241970021413, "grad_norm": 0.4547994524865786, "learning_rate": 4.7698412698412706e-05, "loss": 0.458, "step": 199 }, { "epoch": 0.4282655246252677, "grad_norm": 0.3200904129332156, "learning_rate": 4.7658730158730166e-05, "loss": 0.4722, "step": 200 }, { "epoch": 0.430406852248394, "grad_norm": 0.33843214154215645, "learning_rate": 4.761904761904762e-05, "loss": 0.4841, "step": 201 }, { "epoch": 0.43254817987152033, "grad_norm": 0.29671855926132695, "learning_rate": 4.757936507936508e-05, "loss": 0.4896, "step": 202 }, { "epoch": 0.4346895074946467, "grad_norm": 0.31070750776887823, "learning_rate": 4.753968253968254e-05, "loss": 0.4714, "step": 203 }, { "epoch": 0.43683083511777304, "grad_norm": 0.3305729189717249, "learning_rate": 4.75e-05, "loss": 0.4553, "step": 204 }, { "epoch": 0.43897216274089934, "grad_norm": 0.30161254779141056, "learning_rate": 4.746031746031746e-05, "loss": 0.4771, "step": 205 }, { "epoch": 0.4411134903640257, "grad_norm": 0.4165722704133656, "learning_rate": 4.7420634920634924e-05, "loss": 0.4941, "step": 206 }, { "epoch": 0.44325481798715205, "grad_norm": 0.3098035646663218, "learning_rate": 4.738095238095238e-05, "loss": 0.4562, "step": 207 }, { "epoch": 0.44539614561027835, "grad_norm": 0.31217832469780865, "learning_rate": 4.734126984126984e-05, "loss": 0.4593, "step": 208 }, { "epoch": 0.4475374732334047, "grad_norm": 0.33303037422629134, "learning_rate": 4.73015873015873e-05, "loss": 0.4695, "step": 209 }, { "epoch": 0.44967880085653106, "grad_norm": 0.3164429971126553, "learning_rate": 4.726190476190476e-05, "loss": 0.4787, "step": 210 }, { "epoch": 0.4518201284796574, "grad_norm": 0.3667199010122267, "learning_rate": 4.722222222222222e-05, "loss": 0.4741, "step": 211 }, { "epoch": 0.4539614561027837, "grad_norm": 0.32958761965134165, "learning_rate": 4.718253968253968e-05, "loss": 0.476, "step": 212 }, { "epoch": 0.45610278372591007, "grad_norm": 0.3847263613923873, "learning_rate": 4.714285714285714e-05, "loss": 0.4699, "step": 213 }, { "epoch": 0.4582441113490364, "grad_norm": 0.2955982753193451, "learning_rate": 4.71031746031746e-05, "loss": 0.4799, "step": 214 }, { "epoch": 0.4603854389721627, "grad_norm": 0.38809540646072666, "learning_rate": 4.7063492063492064e-05, "loss": 0.4783, "step": 215 }, { "epoch": 0.4625267665952891, "grad_norm": 0.33351402005149655, "learning_rate": 4.7023809523809525e-05, "loss": 0.4671, "step": 216 }, { "epoch": 0.46466809421841543, "grad_norm": 0.3544022577957815, "learning_rate": 4.6984126984126986e-05, "loss": 0.4634, "step": 217 }, { "epoch": 0.4668094218415418, "grad_norm": 0.3440682763213123, "learning_rate": 4.6944444444444446e-05, "loss": 0.4606, "step": 218 }, { "epoch": 0.4689507494646681, "grad_norm": 0.35994592001280135, "learning_rate": 4.690476190476191e-05, "loss": 0.4687, "step": 219 }, { "epoch": 0.47109207708779444, "grad_norm": 0.2792969038742562, "learning_rate": 4.686507936507937e-05, "loss": 0.4675, "step": 220 }, { "epoch": 0.4732334047109208, "grad_norm": 0.3167594669644963, "learning_rate": 4.682539682539683e-05, "loss": 0.447, "step": 221 }, { "epoch": 0.4753747323340471, "grad_norm": 0.28698974715014, "learning_rate": 4.678571428571429e-05, "loss": 0.4824, "step": 222 }, { "epoch": 0.47751605995717344, "grad_norm": 0.2919731601075355, "learning_rate": 4.674603174603175e-05, "loss": 0.4756, "step": 223 }, { "epoch": 0.4796573875802998, "grad_norm": 0.3961977468030219, "learning_rate": 4.670634920634921e-05, "loss": 0.4988, "step": 224 }, { "epoch": 0.4817987152034261, "grad_norm": 0.2939889127181417, "learning_rate": 4.666666666666667e-05, "loss": 0.4782, "step": 225 }, { "epoch": 0.48394004282655245, "grad_norm": 0.38374224244801103, "learning_rate": 4.662698412698413e-05, "loss": 0.4713, "step": 226 }, { "epoch": 0.4860813704496788, "grad_norm": 0.2783898692581709, "learning_rate": 4.658730158730159e-05, "loss": 0.4683, "step": 227 }, { "epoch": 0.48822269807280516, "grad_norm": 0.3514854051266336, "learning_rate": 4.6547619047619054e-05, "loss": 0.4745, "step": 228 }, { "epoch": 0.49036402569593146, "grad_norm": 0.2761349026606947, "learning_rate": 4.6507936507936515e-05, "loss": 0.4777, "step": 229 }, { "epoch": 0.4925053533190578, "grad_norm": 0.2826076093838578, "learning_rate": 4.646825396825397e-05, "loss": 0.4491, "step": 230 }, { "epoch": 0.49464668094218417, "grad_norm": 0.2945939088071528, "learning_rate": 4.642857142857143e-05, "loss": 0.4724, "step": 231 }, { "epoch": 0.49678800856531047, "grad_norm": 0.2750719469025186, "learning_rate": 4.638888888888889e-05, "loss": 0.4592, "step": 232 }, { "epoch": 0.4989293361884368, "grad_norm": 0.2515310029544384, "learning_rate": 4.634920634920635e-05, "loss": 0.4572, "step": 233 }, { "epoch": 0.5010706638115632, "grad_norm": 0.29715312851565084, "learning_rate": 4.630952380952381e-05, "loss": 0.4769, "step": 234 }, { "epoch": 0.5032119914346895, "grad_norm": 0.28840327957350287, "learning_rate": 4.626984126984127e-05, "loss": 0.459, "step": 235 }, { "epoch": 0.5053533190578159, "grad_norm": 0.38388536035323373, "learning_rate": 4.623015873015873e-05, "loss": 0.4645, "step": 236 }, { "epoch": 0.5074946466809421, "grad_norm": 0.34005158978288785, "learning_rate": 4.6190476190476194e-05, "loss": 0.4951, "step": 237 }, { "epoch": 0.5096359743040685, "grad_norm": 0.3135930281832256, "learning_rate": 4.6150793650793655e-05, "loss": 0.4783, "step": 238 }, { "epoch": 0.5117773019271948, "grad_norm": 0.29153181091541064, "learning_rate": 4.6111111111111115e-05, "loss": 0.4783, "step": 239 }, { "epoch": 0.5139186295503212, "grad_norm": 0.35211081607111105, "learning_rate": 4.607142857142857e-05, "loss": 0.4784, "step": 240 }, { "epoch": 0.5160599571734475, "grad_norm": 0.2534327826926188, "learning_rate": 4.603174603174603e-05, "loss": 0.4758, "step": 241 }, { "epoch": 0.5182012847965739, "grad_norm": 0.291715580605378, "learning_rate": 4.599206349206349e-05, "loss": 0.4613, "step": 242 }, { "epoch": 0.5203426124197003, "grad_norm": 0.49953531179115856, "learning_rate": 4.595238095238095e-05, "loss": 0.467, "step": 243 }, { "epoch": 0.5224839400428265, "grad_norm": 0.37988476101063506, "learning_rate": 4.591269841269841e-05, "loss": 0.4668, "step": 244 }, { "epoch": 0.5246252676659529, "grad_norm": 0.3306624298526701, "learning_rate": 4.587301587301587e-05, "loss": 0.4571, "step": 245 }, { "epoch": 0.5267665952890792, "grad_norm": 0.292259379733011, "learning_rate": 4.5833333333333334e-05, "loss": 0.4747, "step": 246 }, { "epoch": 0.5289079229122056, "grad_norm": 0.2817284251841534, "learning_rate": 4.5793650793650795e-05, "loss": 0.4601, "step": 247 }, { "epoch": 0.5310492505353319, "grad_norm": 0.28259998989853713, "learning_rate": 4.5753968253968255e-05, "loss": 0.4489, "step": 248 }, { "epoch": 0.5331905781584583, "grad_norm": 0.3077230211086655, "learning_rate": 4.5714285714285716e-05, "loss": 0.4678, "step": 249 }, { "epoch": 0.5353319057815846, "grad_norm": 0.2783904187217292, "learning_rate": 4.567460317460318e-05, "loss": 0.4401, "step": 250 }, { "epoch": 0.5374732334047109, "grad_norm": 0.31896115951211806, "learning_rate": 4.563492063492064e-05, "loss": 0.4572, "step": 251 }, { "epoch": 0.5396145610278372, "grad_norm": 0.30499818403576123, "learning_rate": 4.55952380952381e-05, "loss": 0.465, "step": 252 }, { "epoch": 0.5417558886509636, "grad_norm": 0.35575649581615343, "learning_rate": 4.555555555555556e-05, "loss": 0.47, "step": 253 }, { "epoch": 0.5438972162740899, "grad_norm": 0.2684090185728518, "learning_rate": 4.551587301587302e-05, "loss": 0.4645, "step": 254 }, { "epoch": 0.5460385438972163, "grad_norm": 0.37921813849517194, "learning_rate": 4.547619047619048e-05, "loss": 0.4678, "step": 255 }, { "epoch": 0.5481798715203426, "grad_norm": 0.26743329008720806, "learning_rate": 4.543650793650794e-05, "loss": 0.4732, "step": 256 }, { "epoch": 0.550321199143469, "grad_norm": 0.31026028532831346, "learning_rate": 4.5396825396825395e-05, "loss": 0.4508, "step": 257 }, { "epoch": 0.5524625267665952, "grad_norm": 0.2581029656532225, "learning_rate": 4.5357142857142856e-05, "loss": 0.471, "step": 258 }, { "epoch": 0.5546038543897216, "grad_norm": 0.26803290950789216, "learning_rate": 4.531746031746032e-05, "loss": 0.4743, "step": 259 }, { "epoch": 0.556745182012848, "grad_norm": 0.38556934693664496, "learning_rate": 4.527777777777778e-05, "loss": 0.4768, "step": 260 }, { "epoch": 0.5588865096359743, "grad_norm": 0.2661542720290909, "learning_rate": 4.523809523809524e-05, "loss": 0.466, "step": 261 }, { "epoch": 0.5610278372591007, "grad_norm": 0.29067138820514277, "learning_rate": 4.51984126984127e-05, "loss": 0.4702, "step": 262 }, { "epoch": 0.563169164882227, "grad_norm": 0.2872183782023422, "learning_rate": 4.515873015873016e-05, "loss": 0.4534, "step": 263 }, { "epoch": 0.5653104925053534, "grad_norm": 0.25367532092796774, "learning_rate": 4.511904761904762e-05, "loss": 0.4551, "step": 264 }, { "epoch": 0.5674518201284796, "grad_norm": 0.3039430947031411, "learning_rate": 4.507936507936508e-05, "loss": 0.4627, "step": 265 }, { "epoch": 0.569593147751606, "grad_norm": 0.28351706499792234, "learning_rate": 4.503968253968254e-05, "loss": 0.4574, "step": 266 }, { "epoch": 0.5717344753747323, "grad_norm": 0.27425219768007014, "learning_rate": 4.5e-05, "loss": 0.4589, "step": 267 }, { "epoch": 0.5738758029978587, "grad_norm": 0.2869408267882252, "learning_rate": 4.4960317460317464e-05, "loss": 0.4475, "step": 268 }, { "epoch": 0.576017130620985, "grad_norm": 0.2762345424358576, "learning_rate": 4.4920634920634924e-05, "loss": 0.4708, "step": 269 }, { "epoch": 0.5781584582441114, "grad_norm": 0.3049597843838145, "learning_rate": 4.4880952380952385e-05, "loss": 0.4689, "step": 270 }, { "epoch": 0.5802997858672377, "grad_norm": 0.2592725090051658, "learning_rate": 4.4841269841269846e-05, "loss": 0.4656, "step": 271 }, { "epoch": 0.582441113490364, "grad_norm": 0.26044781623078483, "learning_rate": 4.4801587301587307e-05, "loss": 0.4638, "step": 272 }, { "epoch": 0.5845824411134903, "grad_norm": 0.2804692407312598, "learning_rate": 4.476190476190477e-05, "loss": 0.4675, "step": 273 }, { "epoch": 0.5867237687366167, "grad_norm": 0.26684962117113786, "learning_rate": 4.472222222222223e-05, "loss": 0.4467, "step": 274 }, { "epoch": 0.588865096359743, "grad_norm": 0.31968480635740154, "learning_rate": 4.468253968253969e-05, "loss": 0.474, "step": 275 }, { "epoch": 0.5910064239828694, "grad_norm": 0.2644188695273227, "learning_rate": 4.464285714285715e-05, "loss": 0.4686, "step": 276 }, { "epoch": 0.5931477516059958, "grad_norm": 0.2930643108233625, "learning_rate": 4.460317460317461e-05, "loss": 0.4669, "step": 277 }, { "epoch": 0.5952890792291221, "grad_norm": 0.3015159130498082, "learning_rate": 4.456349206349207e-05, "loss": 0.4648, "step": 278 }, { "epoch": 0.5974304068522484, "grad_norm": 0.2727279142726974, "learning_rate": 4.4523809523809525e-05, "loss": 0.4612, "step": 279 }, { "epoch": 0.5995717344753747, "grad_norm": 0.2975236904657764, "learning_rate": 4.4484126984126986e-05, "loss": 0.444, "step": 280 }, { "epoch": 0.6017130620985011, "grad_norm": 0.2580099237283999, "learning_rate": 4.4444444444444447e-05, "loss": 0.4792, "step": 281 }, { "epoch": 0.6038543897216274, "grad_norm": 0.2961073515555114, "learning_rate": 4.440476190476191e-05, "loss": 0.4534, "step": 282 }, { "epoch": 0.6059957173447538, "grad_norm": 0.2764889525421342, "learning_rate": 4.436507936507937e-05, "loss": 0.4442, "step": 283 }, { "epoch": 0.6081370449678801, "grad_norm": 0.26435268929370137, "learning_rate": 4.432539682539683e-05, "loss": 0.4452, "step": 284 }, { "epoch": 0.6102783725910065, "grad_norm": 0.2882202813256758, "learning_rate": 4.428571428571428e-05, "loss": 0.4448, "step": 285 }, { "epoch": 0.6124197002141327, "grad_norm": 0.2561848522615123, "learning_rate": 4.4246031746031744e-05, "loss": 0.453, "step": 286 }, { "epoch": 0.6145610278372591, "grad_norm": 0.32128316367276627, "learning_rate": 4.4206349206349204e-05, "loss": 0.4676, "step": 287 }, { "epoch": 0.6167023554603854, "grad_norm": 0.305653864447794, "learning_rate": 4.4166666666666665e-05, "loss": 0.457, "step": 288 }, { "epoch": 0.6188436830835118, "grad_norm": 0.27429918226444405, "learning_rate": 4.4126984126984126e-05, "loss": 0.4592, "step": 289 }, { "epoch": 0.6209850107066381, "grad_norm": 0.26436848550152375, "learning_rate": 4.4087301587301587e-05, "loss": 0.4343, "step": 290 }, { "epoch": 0.6231263383297645, "grad_norm": 0.2892075831032154, "learning_rate": 4.404761904761905e-05, "loss": 0.4405, "step": 291 }, { "epoch": 0.6252676659528907, "grad_norm": 0.27340488703312127, "learning_rate": 4.400793650793651e-05, "loss": 0.4754, "step": 292 }, { "epoch": 0.6274089935760171, "grad_norm": 0.2748574269063886, "learning_rate": 4.396825396825397e-05, "loss": 0.474, "step": 293 }, { "epoch": 0.6295503211991434, "grad_norm": 0.2941612988883471, "learning_rate": 4.392857142857143e-05, "loss": 0.4564, "step": 294 }, { "epoch": 0.6316916488222698, "grad_norm": 0.24636592894226964, "learning_rate": 4.388888888888889e-05, "loss": 0.4422, "step": 295 }, { "epoch": 0.6338329764453962, "grad_norm": 0.28582887621940406, "learning_rate": 4.384920634920635e-05, "loss": 0.4588, "step": 296 }, { "epoch": 0.6359743040685225, "grad_norm": 0.27743437358263945, "learning_rate": 4.380952380952381e-05, "loss": 0.4469, "step": 297 }, { "epoch": 0.6381156316916489, "grad_norm": 0.26242382702944955, "learning_rate": 4.376984126984127e-05, "loss": 0.4398, "step": 298 }, { "epoch": 0.6402569593147751, "grad_norm": 0.28165809226942223, "learning_rate": 4.373015873015873e-05, "loss": 0.4413, "step": 299 }, { "epoch": 0.6423982869379015, "grad_norm": 0.25284838465648923, "learning_rate": 4.3690476190476194e-05, "loss": 0.4342, "step": 300 }, { "epoch": 0.6445396145610278, "grad_norm": 0.31339700087144234, "learning_rate": 4.3650793650793655e-05, "loss": 0.4612, "step": 301 }, { "epoch": 0.6466809421841542, "grad_norm": 0.3104952345189579, "learning_rate": 4.3611111111111116e-05, "loss": 0.4731, "step": 302 }, { "epoch": 0.6488222698072805, "grad_norm": 0.2947572345084962, "learning_rate": 4.3571428571428576e-05, "loss": 0.453, "step": 303 }, { "epoch": 0.6509635974304069, "grad_norm": 0.3455960603661197, "learning_rate": 4.353174603174604e-05, "loss": 0.4644, "step": 304 }, { "epoch": 0.6531049250535332, "grad_norm": 0.25711343366321854, "learning_rate": 4.34920634920635e-05, "loss": 0.4802, "step": 305 }, { "epoch": 0.6552462526766595, "grad_norm": 0.31160993789120733, "learning_rate": 4.345238095238096e-05, "loss": 0.4534, "step": 306 }, { "epoch": 0.6573875802997858, "grad_norm": 0.27484057830146985, "learning_rate": 4.341269841269842e-05, "loss": 0.4529, "step": 307 }, { "epoch": 0.6595289079229122, "grad_norm": 0.26337065068689747, "learning_rate": 4.337301587301587e-05, "loss": 0.4647, "step": 308 }, { "epoch": 0.6616702355460385, "grad_norm": 0.271610173490994, "learning_rate": 4.3333333333333334e-05, "loss": 0.4445, "step": 309 }, { "epoch": 0.6638115631691649, "grad_norm": 0.2528923842806531, "learning_rate": 4.3293650793650795e-05, "loss": 0.4571, "step": 310 }, { "epoch": 0.6659528907922913, "grad_norm": 0.28885045896535533, "learning_rate": 4.3253968253968256e-05, "loss": 0.4554, "step": 311 }, { "epoch": 0.6680942184154176, "grad_norm": 0.25814187425851787, "learning_rate": 4.3214285714285716e-05, "loss": 0.468, "step": 312 }, { "epoch": 0.6702355460385439, "grad_norm": 0.26764371303035933, "learning_rate": 4.317460317460318e-05, "loss": 0.4594, "step": 313 }, { "epoch": 0.6723768736616702, "grad_norm": 0.24837539558393776, "learning_rate": 4.313492063492064e-05, "loss": 0.4371, "step": 314 }, { "epoch": 0.6745182012847966, "grad_norm": 0.25412398449219464, "learning_rate": 4.30952380952381e-05, "loss": 0.4594, "step": 315 }, { "epoch": 0.6766595289079229, "grad_norm": 0.23755251323497587, "learning_rate": 4.305555555555556e-05, "loss": 0.4674, "step": 316 }, { "epoch": 0.6788008565310493, "grad_norm": 0.2924363858039394, "learning_rate": 4.301587301587302e-05, "loss": 0.4688, "step": 317 }, { "epoch": 0.6809421841541756, "grad_norm": 0.2570229512356918, "learning_rate": 4.297619047619048e-05, "loss": 0.4494, "step": 318 }, { "epoch": 0.683083511777302, "grad_norm": 0.2764941603084112, "learning_rate": 4.2936507936507935e-05, "loss": 0.4605, "step": 319 }, { "epoch": 0.6852248394004282, "grad_norm": 0.3153973496882214, "learning_rate": 4.2896825396825396e-05, "loss": 0.4488, "step": 320 }, { "epoch": 0.6873661670235546, "grad_norm": 0.28153011145522383, "learning_rate": 4.2857142857142856e-05, "loss": 0.4705, "step": 321 }, { "epoch": 0.6895074946466809, "grad_norm": 0.3506781644039644, "learning_rate": 4.281746031746032e-05, "loss": 0.4576, "step": 322 }, { "epoch": 0.6916488222698073, "grad_norm": 0.3053110048188775, "learning_rate": 4.277777777777778e-05, "loss": 0.4732, "step": 323 }, { "epoch": 0.6937901498929336, "grad_norm": 0.3240151118248191, "learning_rate": 4.273809523809524e-05, "loss": 0.4554, "step": 324 }, { "epoch": 0.69593147751606, "grad_norm": 0.29062216265241864, "learning_rate": 4.26984126984127e-05, "loss": 0.4342, "step": 325 }, { "epoch": 0.6980728051391863, "grad_norm": 0.25283396063398134, "learning_rate": 4.265873015873016e-05, "loss": 0.4464, "step": 326 }, { "epoch": 0.7002141327623126, "grad_norm": 0.2680651552624257, "learning_rate": 4.261904761904762e-05, "loss": 0.4601, "step": 327 }, { "epoch": 0.702355460385439, "grad_norm": 0.33752170462860476, "learning_rate": 4.257936507936508e-05, "loss": 0.4478, "step": 328 }, { "epoch": 0.7044967880085653, "grad_norm": 0.2190411233461398, "learning_rate": 4.253968253968254e-05, "loss": 0.4465, "step": 329 }, { "epoch": 0.7066381156316917, "grad_norm": 0.2508058536828542, "learning_rate": 4.25e-05, "loss": 0.4452, "step": 330 }, { "epoch": 0.708779443254818, "grad_norm": 0.2377335448608951, "learning_rate": 4.2460317460317464e-05, "loss": 0.4401, "step": 331 }, { "epoch": 0.7109207708779444, "grad_norm": 0.2529235872462942, "learning_rate": 4.2420634920634925e-05, "loss": 0.4651, "step": 332 }, { "epoch": 0.7130620985010707, "grad_norm": 0.2516915529502037, "learning_rate": 4.2380952380952385e-05, "loss": 0.4617, "step": 333 }, { "epoch": 0.715203426124197, "grad_norm": 0.2364540674065309, "learning_rate": 4.2341269841269846e-05, "loss": 0.4711, "step": 334 }, { "epoch": 0.7173447537473233, "grad_norm": 0.25560457992791263, "learning_rate": 4.23015873015873e-05, "loss": 0.4558, "step": 335 }, { "epoch": 0.7194860813704497, "grad_norm": 0.23835034145803924, "learning_rate": 4.226190476190476e-05, "loss": 0.4643, "step": 336 }, { "epoch": 0.721627408993576, "grad_norm": 0.22675744611839385, "learning_rate": 4.222222222222222e-05, "loss": 0.4477, "step": 337 }, { "epoch": 0.7237687366167024, "grad_norm": 0.24050077892906585, "learning_rate": 4.218253968253968e-05, "loss": 0.4417, "step": 338 }, { "epoch": 0.7259100642398287, "grad_norm": 0.25640906605974817, "learning_rate": 4.214285714285714e-05, "loss": 0.4549, "step": 339 }, { "epoch": 0.728051391862955, "grad_norm": 0.228989440935261, "learning_rate": 4.2103174603174604e-05, "loss": 0.4504, "step": 340 }, { "epoch": 0.7301927194860813, "grad_norm": 0.2308736253356299, "learning_rate": 4.2063492063492065e-05, "loss": 0.4577, "step": 341 }, { "epoch": 0.7323340471092077, "grad_norm": 0.25703393548284337, "learning_rate": 4.2023809523809525e-05, "loss": 0.4586, "step": 342 }, { "epoch": 0.734475374732334, "grad_norm": 0.24906898794284202, "learning_rate": 4.1984126984126986e-05, "loss": 0.4457, "step": 343 }, { "epoch": 0.7366167023554604, "grad_norm": 0.2574124725732392, "learning_rate": 4.194444444444445e-05, "loss": 0.4529, "step": 344 }, { "epoch": 0.7387580299785867, "grad_norm": 0.243507960931399, "learning_rate": 4.190476190476191e-05, "loss": 0.4438, "step": 345 }, { "epoch": 0.7408993576017131, "grad_norm": 0.2445092413905703, "learning_rate": 4.186507936507937e-05, "loss": 0.4498, "step": 346 }, { "epoch": 0.7430406852248393, "grad_norm": 0.30753988028224943, "learning_rate": 4.182539682539683e-05, "loss": 0.4598, "step": 347 }, { "epoch": 0.7451820128479657, "grad_norm": 0.2505049178661786, "learning_rate": 4.178571428571429e-05, "loss": 0.4553, "step": 348 }, { "epoch": 0.7473233404710921, "grad_norm": 0.27044349556425973, "learning_rate": 4.174603174603175e-05, "loss": 0.4573, "step": 349 }, { "epoch": 0.7494646680942184, "grad_norm": 0.25124615427090896, "learning_rate": 4.170634920634921e-05, "loss": 0.476, "step": 350 }, { "epoch": 0.7516059957173448, "grad_norm": 0.26703556388241734, "learning_rate": 4.166666666666667e-05, "loss": 0.4366, "step": 351 }, { "epoch": 0.7537473233404711, "grad_norm": 0.24998740358371754, "learning_rate": 4.162698412698413e-05, "loss": 0.4467, "step": 352 }, { "epoch": 0.7558886509635975, "grad_norm": 0.2554947787191526, "learning_rate": 4.1587301587301594e-05, "loss": 0.4414, "step": 353 }, { "epoch": 0.7580299785867237, "grad_norm": 0.29499953780878213, "learning_rate": 4.1547619047619054e-05, "loss": 0.4668, "step": 354 }, { "epoch": 0.7601713062098501, "grad_norm": 0.2634686274433604, "learning_rate": 4.1507936507936515e-05, "loss": 0.4501, "step": 355 }, { "epoch": 0.7623126338329764, "grad_norm": 0.27587805015536104, "learning_rate": 4.1468253968253976e-05, "loss": 0.4345, "step": 356 }, { "epoch": 0.7644539614561028, "grad_norm": 0.27969963429910366, "learning_rate": 4.1428571428571437e-05, "loss": 0.4517, "step": 357 }, { "epoch": 0.7665952890792291, "grad_norm": 0.3006002807511561, "learning_rate": 4.138888888888889e-05, "loss": 0.4363, "step": 358 }, { "epoch": 0.7687366167023555, "grad_norm": 0.24094041832952617, "learning_rate": 4.134920634920635e-05, "loss": 0.4463, "step": 359 }, { "epoch": 0.7708779443254818, "grad_norm": 0.30068476671680583, "learning_rate": 4.130952380952381e-05, "loss": 0.4394, "step": 360 }, { "epoch": 0.7730192719486081, "grad_norm": 0.24223112221889828, "learning_rate": 4.126984126984127e-05, "loss": 0.4437, "step": 361 }, { "epoch": 0.7751605995717344, "grad_norm": 0.3306808578708232, "learning_rate": 4.123015873015873e-05, "loss": 0.463, "step": 362 }, { "epoch": 0.7773019271948608, "grad_norm": 0.22496155789537978, "learning_rate": 4.119047619047619e-05, "loss": 0.4396, "step": 363 }, { "epoch": 0.7794432548179872, "grad_norm": 0.26980303715734333, "learning_rate": 4.115079365079365e-05, "loss": 0.4502, "step": 364 }, { "epoch": 0.7815845824411135, "grad_norm": 0.24531612262771849, "learning_rate": 4.111111111111111e-05, "loss": 0.4371, "step": 365 }, { "epoch": 0.7837259100642399, "grad_norm": 0.23508992950961904, "learning_rate": 4.107142857142857e-05, "loss": 0.4351, "step": 366 }, { "epoch": 0.7858672376873662, "grad_norm": 0.2343557766020594, "learning_rate": 4.103174603174603e-05, "loss": 0.4474, "step": 367 }, { "epoch": 0.7880085653104925, "grad_norm": 0.24946904187399346, "learning_rate": 4.099206349206349e-05, "loss": 0.4432, "step": 368 }, { "epoch": 0.7901498929336188, "grad_norm": 0.253014359764136, "learning_rate": 4.095238095238095e-05, "loss": 0.4528, "step": 369 }, { "epoch": 0.7922912205567452, "grad_norm": 0.22772669050696337, "learning_rate": 4.091269841269841e-05, "loss": 0.4543, "step": 370 }, { "epoch": 0.7944325481798715, "grad_norm": 0.24881653297292614, "learning_rate": 4.0873015873015874e-05, "loss": 0.4312, "step": 371 }, { "epoch": 0.7965738758029979, "grad_norm": 0.25995771340020857, "learning_rate": 4.0833333333333334e-05, "loss": 0.4265, "step": 372 }, { "epoch": 0.7987152034261242, "grad_norm": 0.21693583355693366, "learning_rate": 4.0793650793650795e-05, "loss": 0.4278, "step": 373 }, { "epoch": 0.8008565310492506, "grad_norm": 0.24159689301797868, "learning_rate": 4.0753968253968256e-05, "loss": 0.4437, "step": 374 }, { "epoch": 0.8029978586723768, "grad_norm": 0.25009229119018167, "learning_rate": 4.0714285714285717e-05, "loss": 0.4338, "step": 375 }, { "epoch": 0.8051391862955032, "grad_norm": 0.2486299019264688, "learning_rate": 4.067460317460318e-05, "loss": 0.4506, "step": 376 }, { "epoch": 0.8072805139186295, "grad_norm": 0.247308996752361, "learning_rate": 4.063492063492064e-05, "loss": 0.4414, "step": 377 }, { "epoch": 0.8094218415417559, "grad_norm": 0.2590933139992137, "learning_rate": 4.05952380952381e-05, "loss": 0.4297, "step": 378 }, { "epoch": 0.8115631691648822, "grad_norm": 0.2508458429815614, "learning_rate": 4.055555555555556e-05, "loss": 0.4269, "step": 379 }, { "epoch": 0.8137044967880086, "grad_norm": 0.2879218570314043, "learning_rate": 4.051587301587302e-05, "loss": 0.4466, "step": 380 }, { "epoch": 0.815845824411135, "grad_norm": 0.23441911617011937, "learning_rate": 4.047619047619048e-05, "loss": 0.4412, "step": 381 }, { "epoch": 0.8179871520342612, "grad_norm": 0.32198396205698543, "learning_rate": 4.043650793650794e-05, "loss": 0.4547, "step": 382 }, { "epoch": 0.8201284796573876, "grad_norm": 0.30161712796978346, "learning_rate": 4.03968253968254e-05, "loss": 0.4515, "step": 383 }, { "epoch": 0.8222698072805139, "grad_norm": 0.23761583289371385, "learning_rate": 4.035714285714286e-05, "loss": 0.4343, "step": 384 }, { "epoch": 0.8244111349036403, "grad_norm": 0.27338237171088253, "learning_rate": 4.031746031746032e-05, "loss": 0.445, "step": 385 }, { "epoch": 0.8265524625267666, "grad_norm": 0.2856895105115863, "learning_rate": 4.027777777777778e-05, "loss": 0.4401, "step": 386 }, { "epoch": 0.828693790149893, "grad_norm": 0.2379960246228477, "learning_rate": 4.023809523809524e-05, "loss": 0.4411, "step": 387 }, { "epoch": 0.8308351177730193, "grad_norm": 0.3631643489141991, "learning_rate": 4.01984126984127e-05, "loss": 0.4528, "step": 388 }, { "epoch": 0.8329764453961456, "grad_norm": 0.2664861994481591, "learning_rate": 4.015873015873016e-05, "loss": 0.454, "step": 389 }, { "epoch": 0.8351177730192719, "grad_norm": 0.3054363176379015, "learning_rate": 4.011904761904762e-05, "loss": 0.4535, "step": 390 }, { "epoch": 0.8372591006423983, "grad_norm": 0.31912720459183097, "learning_rate": 4.007936507936508e-05, "loss": 0.4493, "step": 391 }, { "epoch": 0.8394004282655246, "grad_norm": 0.23251206637574764, "learning_rate": 4.003968253968254e-05, "loss": 0.4431, "step": 392 }, { "epoch": 0.841541755888651, "grad_norm": 0.28631581267375344, "learning_rate": 4e-05, "loss": 0.4323, "step": 393 }, { "epoch": 0.8436830835117773, "grad_norm": 0.267723471910929, "learning_rate": 3.9960317460317464e-05, "loss": 0.4473, "step": 394 }, { "epoch": 0.8458244111349036, "grad_norm": 0.27597440700713444, "learning_rate": 3.9920634920634925e-05, "loss": 0.4542, "step": 395 }, { "epoch": 0.8479657387580299, "grad_norm": 0.25433002014223405, "learning_rate": 3.9880952380952386e-05, "loss": 0.4444, "step": 396 }, { "epoch": 0.8501070663811563, "grad_norm": 0.25437045991554746, "learning_rate": 3.984126984126984e-05, "loss": 0.4579, "step": 397 }, { "epoch": 0.8522483940042827, "grad_norm": 0.24534548283140603, "learning_rate": 3.98015873015873e-05, "loss": 0.4725, "step": 398 }, { "epoch": 0.854389721627409, "grad_norm": 0.25435785462287974, "learning_rate": 3.976190476190476e-05, "loss": 0.4512, "step": 399 }, { "epoch": 0.8565310492505354, "grad_norm": 0.24601810950369032, "learning_rate": 3.972222222222222e-05, "loss": 0.448, "step": 400 }, { "epoch": 0.8586723768736617, "grad_norm": 0.2653361455115733, "learning_rate": 3.968253968253968e-05, "loss": 0.439, "step": 401 }, { "epoch": 0.860813704496788, "grad_norm": 0.23673358174194264, "learning_rate": 3.964285714285714e-05, "loss": 0.4413, "step": 402 }, { "epoch": 0.8629550321199143, "grad_norm": 0.24462225822851738, "learning_rate": 3.9603174603174604e-05, "loss": 0.45, "step": 403 }, { "epoch": 0.8650963597430407, "grad_norm": 0.2788329723246181, "learning_rate": 3.9563492063492065e-05, "loss": 0.454, "step": 404 }, { "epoch": 0.867237687366167, "grad_norm": 0.24432656732987582, "learning_rate": 3.9523809523809526e-05, "loss": 0.4346, "step": 405 }, { "epoch": 0.8693790149892934, "grad_norm": 0.3020331491334916, "learning_rate": 3.9484126984126986e-05, "loss": 0.4522, "step": 406 }, { "epoch": 0.8715203426124197, "grad_norm": 0.2856505508961332, "learning_rate": 3.944444444444445e-05, "loss": 0.4485, "step": 407 }, { "epoch": 0.8736616702355461, "grad_norm": 0.2656132022406484, "learning_rate": 3.940476190476191e-05, "loss": 0.4601, "step": 408 }, { "epoch": 0.8758029978586723, "grad_norm": 0.29873704798610634, "learning_rate": 3.936507936507937e-05, "loss": 0.4354, "step": 409 }, { "epoch": 0.8779443254817987, "grad_norm": 0.28600605020427156, "learning_rate": 3.932539682539683e-05, "loss": 0.4452, "step": 410 }, { "epoch": 0.880085653104925, "grad_norm": 0.26767141652320475, "learning_rate": 3.928571428571429e-05, "loss": 0.4414, "step": 411 }, { "epoch": 0.8822269807280514, "grad_norm": 0.35131045572847885, "learning_rate": 3.9246031746031744e-05, "loss": 0.4475, "step": 412 }, { "epoch": 0.8843683083511777, "grad_norm": 0.2370839823487289, "learning_rate": 3.9206349206349205e-05, "loss": 0.4358, "step": 413 }, { "epoch": 0.8865096359743041, "grad_norm": 0.3544368048991166, "learning_rate": 3.9166666666666665e-05, "loss": 0.4494, "step": 414 }, { "epoch": 0.8886509635974305, "grad_norm": 0.3172911304193655, "learning_rate": 3.9126984126984126e-05, "loss": 0.4487, "step": 415 }, { "epoch": 0.8907922912205567, "grad_norm": 0.275970880794438, "learning_rate": 3.908730158730159e-05, "loss": 0.4192, "step": 416 }, { "epoch": 0.892933618843683, "grad_norm": 0.3547661108556819, "learning_rate": 3.904761904761905e-05, "loss": 0.4445, "step": 417 }, { "epoch": 0.8950749464668094, "grad_norm": 0.24666251953784982, "learning_rate": 3.900793650793651e-05, "loss": 0.4486, "step": 418 }, { "epoch": 0.8972162740899358, "grad_norm": 0.33556985351699165, "learning_rate": 3.896825396825397e-05, "loss": 0.4357, "step": 419 }, { "epoch": 0.8993576017130621, "grad_norm": 0.29373216787613254, "learning_rate": 3.892857142857143e-05, "loss": 0.4559, "step": 420 }, { "epoch": 0.9014989293361885, "grad_norm": 0.27236847144104587, "learning_rate": 3.888888888888889e-05, "loss": 0.4509, "step": 421 }, { "epoch": 0.9036402569593148, "grad_norm": 0.3363146705059116, "learning_rate": 3.884920634920635e-05, "loss": 0.4338, "step": 422 }, { "epoch": 0.9057815845824411, "grad_norm": 0.23799725549650105, "learning_rate": 3.880952380952381e-05, "loss": 0.4422, "step": 423 }, { "epoch": 0.9079229122055674, "grad_norm": 0.2828958789244041, "learning_rate": 3.876984126984127e-05, "loss": 0.4597, "step": 424 }, { "epoch": 0.9100642398286938, "grad_norm": 0.3136305684919964, "learning_rate": 3.8730158730158734e-05, "loss": 0.4361, "step": 425 }, { "epoch": 0.9122055674518201, "grad_norm": 0.23243444701464022, "learning_rate": 3.8690476190476195e-05, "loss": 0.4337, "step": 426 }, { "epoch": 0.9143468950749465, "grad_norm": 0.31121643328315307, "learning_rate": 3.8650793650793655e-05, "loss": 0.4429, "step": 427 }, { "epoch": 0.9164882226980728, "grad_norm": 0.24007066200020147, "learning_rate": 3.8611111111111116e-05, "loss": 0.4412, "step": 428 }, { "epoch": 0.9186295503211992, "grad_norm": 0.24526627011399632, "learning_rate": 3.857142857142858e-05, "loss": 0.4326, "step": 429 }, { "epoch": 0.9207708779443254, "grad_norm": 0.2782809072036015, "learning_rate": 3.853174603174604e-05, "loss": 0.435, "step": 430 }, { "epoch": 0.9229122055674518, "grad_norm": 0.22743806598005653, "learning_rate": 3.84920634920635e-05, "loss": 0.4522, "step": 431 }, { "epoch": 0.9250535331905781, "grad_norm": 0.2652959665516378, "learning_rate": 3.845238095238096e-05, "loss": 0.4313, "step": 432 }, { "epoch": 0.9271948608137045, "grad_norm": 0.2320440083455923, "learning_rate": 3.841269841269842e-05, "loss": 0.4336, "step": 433 }, { "epoch": 0.9293361884368309, "grad_norm": 0.22904699603839362, "learning_rate": 3.837301587301588e-05, "loss": 0.4398, "step": 434 }, { "epoch": 0.9314775160599572, "grad_norm": 0.28061696790352825, "learning_rate": 3.8333333333333334e-05, "loss": 0.4398, "step": 435 }, { "epoch": 0.9336188436830836, "grad_norm": 0.22771552947703397, "learning_rate": 3.8293650793650795e-05, "loss": 0.4427, "step": 436 }, { "epoch": 0.9357601713062098, "grad_norm": 0.22379899530445632, "learning_rate": 3.8253968253968256e-05, "loss": 0.4292, "step": 437 }, { "epoch": 0.9379014989293362, "grad_norm": 0.2705759563196105, "learning_rate": 3.821428571428572e-05, "loss": 0.4381, "step": 438 }, { "epoch": 0.9400428265524625, "grad_norm": 0.25257412614253144, "learning_rate": 3.817460317460317e-05, "loss": 0.4441, "step": 439 }, { "epoch": 0.9421841541755889, "grad_norm": 0.2230238602545094, "learning_rate": 3.813492063492063e-05, "loss": 0.4382, "step": 440 }, { "epoch": 0.9443254817987152, "grad_norm": 0.2643425174467501, "learning_rate": 3.809523809523809e-05, "loss": 0.4494, "step": 441 }, { "epoch": 0.9464668094218416, "grad_norm": 0.22879958232087208, "learning_rate": 3.805555555555555e-05, "loss": 0.4686, "step": 442 }, { "epoch": 0.9486081370449678, "grad_norm": 0.3234801610814164, "learning_rate": 3.8015873015873014e-05, "loss": 0.4533, "step": 443 }, { "epoch": 0.9507494646680942, "grad_norm": 0.2238623404459414, "learning_rate": 3.7976190476190474e-05, "loss": 0.4593, "step": 444 }, { "epoch": 0.9528907922912205, "grad_norm": 0.2854000014313601, "learning_rate": 3.7936507936507935e-05, "loss": 0.4404, "step": 445 }, { "epoch": 0.9550321199143469, "grad_norm": 0.22982231285103497, "learning_rate": 3.7896825396825396e-05, "loss": 0.447, "step": 446 }, { "epoch": 0.9571734475374732, "grad_norm": 0.22798223769442008, "learning_rate": 3.785714285714286e-05, "loss": 0.4435, "step": 447 }, { "epoch": 0.9593147751605996, "grad_norm": 0.2698671605451589, "learning_rate": 3.781746031746032e-05, "loss": 0.4468, "step": 448 }, { "epoch": 0.961456102783726, "grad_norm": 0.25052757718595015, "learning_rate": 3.777777777777778e-05, "loss": 0.4559, "step": 449 }, { "epoch": 0.9635974304068522, "grad_norm": 0.2509803927116757, "learning_rate": 3.773809523809524e-05, "loss": 0.4485, "step": 450 }, { "epoch": 0.9657387580299786, "grad_norm": 0.24678486996154717, "learning_rate": 3.76984126984127e-05, "loss": 0.4469, "step": 451 }, { "epoch": 0.9678800856531049, "grad_norm": 0.2297303050823692, "learning_rate": 3.765873015873016e-05, "loss": 0.4427, "step": 452 }, { "epoch": 0.9700214132762313, "grad_norm": 0.24752427508051356, "learning_rate": 3.761904761904762e-05, "loss": 0.4414, "step": 453 }, { "epoch": 0.9721627408993576, "grad_norm": 0.285731490530565, "learning_rate": 3.757936507936508e-05, "loss": 0.4711, "step": 454 }, { "epoch": 0.974304068522484, "grad_norm": 0.2623040174515322, "learning_rate": 3.753968253968254e-05, "loss": 0.4297, "step": 455 }, { "epoch": 0.9764453961456103, "grad_norm": 0.27326887949485235, "learning_rate": 3.7500000000000003e-05, "loss": 0.4498, "step": 456 }, { "epoch": 0.9785867237687366, "grad_norm": 0.2603762615843794, "learning_rate": 3.7460317460317464e-05, "loss": 0.4542, "step": 457 }, { "epoch": 0.9807280513918629, "grad_norm": 0.32975141148086856, "learning_rate": 3.7420634920634925e-05, "loss": 0.4424, "step": 458 }, { "epoch": 0.9828693790149893, "grad_norm": 0.29153212143683266, "learning_rate": 3.7380952380952386e-05, "loss": 0.4483, "step": 459 }, { "epoch": 0.9850107066381156, "grad_norm": 0.29992516288872756, "learning_rate": 3.7341269841269846e-05, "loss": 0.4511, "step": 460 }, { "epoch": 0.987152034261242, "grad_norm": 0.29958162250392306, "learning_rate": 3.730158730158731e-05, "loss": 0.4263, "step": 461 }, { "epoch": 0.9892933618843683, "grad_norm": 0.24964751706597657, "learning_rate": 3.726190476190476e-05, "loss": 0.4324, "step": 462 }, { "epoch": 0.9914346895074947, "grad_norm": 0.2614912237652234, "learning_rate": 3.722222222222222e-05, "loss": 0.4312, "step": 463 }, { "epoch": 0.9935760171306209, "grad_norm": 0.266348778726537, "learning_rate": 3.718253968253968e-05, "loss": 0.4263, "step": 464 }, { "epoch": 0.9957173447537473, "grad_norm": 0.23725530062162245, "learning_rate": 3.7142857142857143e-05, "loss": 0.4364, "step": 465 }, { "epoch": 0.9978586723768736, "grad_norm": 0.30914413313910843, "learning_rate": 3.7103174603174604e-05, "loss": 0.4261, "step": 466 }, { "epoch": 1.0, "grad_norm": 0.23586300310874772, "learning_rate": 3.7063492063492065e-05, "loss": 0.4211, "step": 467 }, { "epoch": 1.0021413276231264, "grad_norm": 0.3369816971461586, "learning_rate": 3.7023809523809526e-05, "loss": 0.3762, "step": 468 }, { "epoch": 1.0042826552462527, "grad_norm": 0.22809858744604314, "learning_rate": 3.6984126984126986e-05, "loss": 0.3646, "step": 469 }, { "epoch": 1.006423982869379, "grad_norm": 0.266143350315859, "learning_rate": 3.694444444444445e-05, "loss": 0.3746, "step": 470 }, { "epoch": 1.0085653104925054, "grad_norm": 0.2884518377103039, "learning_rate": 3.690476190476191e-05, "loss": 0.3666, "step": 471 }, { "epoch": 1.0107066381156318, "grad_norm": 0.28068641149516393, "learning_rate": 3.686507936507937e-05, "loss": 0.3743, "step": 472 }, { "epoch": 1.0128479657387581, "grad_norm": 0.29435491983752216, "learning_rate": 3.682539682539683e-05, "loss": 0.3678, "step": 473 }, { "epoch": 1.0149892933618843, "grad_norm": 0.29581454893570663, "learning_rate": 3.678571428571429e-05, "loss": 0.382, "step": 474 }, { "epoch": 1.0171306209850106, "grad_norm": 0.2567395711302861, "learning_rate": 3.674603174603175e-05, "loss": 0.3781, "step": 475 }, { "epoch": 1.019271948608137, "grad_norm": 0.30179003911878, "learning_rate": 3.6706349206349205e-05, "loss": 0.3815, "step": 476 }, { "epoch": 1.0214132762312633, "grad_norm": 0.279742858383842, "learning_rate": 3.6666666666666666e-05, "loss": 0.3777, "step": 477 }, { "epoch": 1.0235546038543897, "grad_norm": 0.28030505005478956, "learning_rate": 3.6626984126984126e-05, "loss": 0.3656, "step": 478 }, { "epoch": 1.025695931477516, "grad_norm": 0.2713443901873829, "learning_rate": 3.658730158730159e-05, "loss": 0.374, "step": 479 }, { "epoch": 1.0278372591006424, "grad_norm": 0.2643392830096214, "learning_rate": 3.654761904761905e-05, "loss": 0.3649, "step": 480 }, { "epoch": 1.0299785867237687, "grad_norm": 0.3008739451104665, "learning_rate": 3.650793650793651e-05, "loss": 0.3822, "step": 481 }, { "epoch": 1.032119914346895, "grad_norm": 0.23862820873191215, "learning_rate": 3.646825396825397e-05, "loss": 0.3632, "step": 482 }, { "epoch": 1.0342612419700214, "grad_norm": 0.27869337578181386, "learning_rate": 3.642857142857143e-05, "loss": 0.3636, "step": 483 }, { "epoch": 1.0364025695931478, "grad_norm": 0.2533397054393399, "learning_rate": 3.638888888888889e-05, "loss": 0.3666, "step": 484 }, { "epoch": 1.0385438972162742, "grad_norm": 0.21014394289523325, "learning_rate": 3.634920634920635e-05, "loss": 0.3609, "step": 485 }, { "epoch": 1.0406852248394005, "grad_norm": 0.3283463606383022, "learning_rate": 3.630952380952381e-05, "loss": 0.3939, "step": 486 }, { "epoch": 1.0428265524625269, "grad_norm": 0.22387699439354838, "learning_rate": 3.626984126984127e-05, "loss": 0.3611, "step": 487 }, { "epoch": 1.044967880085653, "grad_norm": 0.2690411087151262, "learning_rate": 3.6230158730158734e-05, "loss": 0.3869, "step": 488 }, { "epoch": 1.0471092077087794, "grad_norm": 0.3053411429357243, "learning_rate": 3.619047619047619e-05, "loss": 0.3845, "step": 489 }, { "epoch": 1.0492505353319057, "grad_norm": 0.2631289951026371, "learning_rate": 3.615079365079365e-05, "loss": 0.3698, "step": 490 }, { "epoch": 1.051391862955032, "grad_norm": 0.2670111664926503, "learning_rate": 3.611111111111111e-05, "loss": 0.3692, "step": 491 }, { "epoch": 1.0535331905781584, "grad_norm": 0.2485103124351668, "learning_rate": 3.607142857142857e-05, "loss": 0.3538, "step": 492 }, { "epoch": 1.0556745182012848, "grad_norm": 0.23420232693366563, "learning_rate": 3.603174603174603e-05, "loss": 0.3756, "step": 493 }, { "epoch": 1.0578158458244111, "grad_norm": 0.23665666643547903, "learning_rate": 3.599206349206349e-05, "loss": 0.3577, "step": 494 }, { "epoch": 1.0599571734475375, "grad_norm": 0.2681669094655506, "learning_rate": 3.595238095238095e-05, "loss": 0.3785, "step": 495 }, { "epoch": 1.0620985010706638, "grad_norm": 0.22646726688527194, "learning_rate": 3.591269841269841e-05, "loss": 0.3657, "step": 496 }, { "epoch": 1.0642398286937902, "grad_norm": 0.26622378410440317, "learning_rate": 3.5873015873015874e-05, "loss": 0.3879, "step": 497 }, { "epoch": 1.0663811563169165, "grad_norm": 0.24041744536037002, "learning_rate": 3.5833333333333335e-05, "loss": 0.3799, "step": 498 }, { "epoch": 1.068522483940043, "grad_norm": 0.27516627734803006, "learning_rate": 3.5793650793650795e-05, "loss": 0.3689, "step": 499 }, { "epoch": 1.0706638115631693, "grad_norm": 0.23109318989193225, "learning_rate": 3.5753968253968256e-05, "loss": 0.3662, "step": 500 }, { "epoch": 1.0728051391862956, "grad_norm": 0.2499488422329624, "learning_rate": 3.571428571428572e-05, "loss": 0.3703, "step": 501 }, { "epoch": 1.0749464668094217, "grad_norm": 0.2706738165446803, "learning_rate": 3.567460317460318e-05, "loss": 0.3641, "step": 502 }, { "epoch": 1.077087794432548, "grad_norm": 0.22012676483900637, "learning_rate": 3.563492063492064e-05, "loss": 0.3736, "step": 503 }, { "epoch": 1.0792291220556745, "grad_norm": 0.2650545240192106, "learning_rate": 3.55952380952381e-05, "loss": 0.398, "step": 504 }, { "epoch": 1.0813704496788008, "grad_norm": 0.2520843476038801, "learning_rate": 3.555555555555556e-05, "loss": 0.3597, "step": 505 }, { "epoch": 1.0835117773019272, "grad_norm": 0.24197390108439093, "learning_rate": 3.551587301587302e-05, "loss": 0.3764, "step": 506 }, { "epoch": 1.0856531049250535, "grad_norm": 0.24538696032664498, "learning_rate": 3.547619047619048e-05, "loss": 0.3825, "step": 507 }, { "epoch": 1.0877944325481799, "grad_norm": 0.22888284257244346, "learning_rate": 3.543650793650794e-05, "loss": 0.3745, "step": 508 }, { "epoch": 1.0899357601713062, "grad_norm": 0.24869463423729066, "learning_rate": 3.53968253968254e-05, "loss": 0.3493, "step": 509 }, { "epoch": 1.0920770877944326, "grad_norm": 0.23993057946273721, "learning_rate": 3.5357142857142864e-05, "loss": 0.3663, "step": 510 }, { "epoch": 1.094218415417559, "grad_norm": 0.2657625184616568, "learning_rate": 3.5317460317460324e-05, "loss": 0.3668, "step": 511 }, { "epoch": 1.0963597430406853, "grad_norm": 0.2309814403862246, "learning_rate": 3.527777777777778e-05, "loss": 0.3787, "step": 512 }, { "epoch": 1.0985010706638116, "grad_norm": 0.24234237491113314, "learning_rate": 3.523809523809524e-05, "loss": 0.3493, "step": 513 }, { "epoch": 1.100642398286938, "grad_norm": 0.23974579149766023, "learning_rate": 3.51984126984127e-05, "loss": 0.3641, "step": 514 }, { "epoch": 1.1027837259100641, "grad_norm": 0.24724340356035635, "learning_rate": 3.515873015873016e-05, "loss": 0.3702, "step": 515 }, { "epoch": 1.1049250535331905, "grad_norm": 0.28135285517273, "learning_rate": 3.511904761904762e-05, "loss": 0.3695, "step": 516 }, { "epoch": 1.1070663811563168, "grad_norm": 0.243348659130255, "learning_rate": 3.5079365079365075e-05, "loss": 0.3714, "step": 517 }, { "epoch": 1.1092077087794432, "grad_norm": 0.24745510283381558, "learning_rate": 3.5039682539682536e-05, "loss": 0.3565, "step": 518 }, { "epoch": 1.1113490364025695, "grad_norm": 0.2563919456656394, "learning_rate": 3.5e-05, "loss": 0.3766, "step": 519 }, { "epoch": 1.113490364025696, "grad_norm": 0.22577716624985744, "learning_rate": 3.496031746031746e-05, "loss": 0.3697, "step": 520 }, { "epoch": 1.1156316916488223, "grad_norm": 0.263849669081119, "learning_rate": 3.492063492063492e-05, "loss": 0.3621, "step": 521 }, { "epoch": 1.1177730192719486, "grad_norm": 0.2578266915441593, "learning_rate": 3.488095238095238e-05, "loss": 0.3781, "step": 522 }, { "epoch": 1.119914346895075, "grad_norm": 0.267410790093275, "learning_rate": 3.484126984126984e-05, "loss": 0.3802, "step": 523 }, { "epoch": 1.1220556745182013, "grad_norm": 0.26720246872107734, "learning_rate": 3.48015873015873e-05, "loss": 0.3686, "step": 524 }, { "epoch": 1.1241970021413277, "grad_norm": 0.22844067281982589, "learning_rate": 3.476190476190476e-05, "loss": 0.3892, "step": 525 }, { "epoch": 1.126338329764454, "grad_norm": 0.2522676842739371, "learning_rate": 3.472222222222222e-05, "loss": 0.3695, "step": 526 }, { "epoch": 1.1284796573875804, "grad_norm": 0.20988616282501774, "learning_rate": 3.468253968253968e-05, "loss": 0.3705, "step": 527 }, { "epoch": 1.1306209850107067, "grad_norm": 0.2452290096455659, "learning_rate": 3.4642857142857144e-05, "loss": 0.3643, "step": 528 }, { "epoch": 1.132762312633833, "grad_norm": 0.20376413452570455, "learning_rate": 3.4603174603174604e-05, "loss": 0.3621, "step": 529 }, { "epoch": 1.1349036402569592, "grad_norm": 0.22961373150459624, "learning_rate": 3.4563492063492065e-05, "loss": 0.355, "step": 530 }, { "epoch": 1.1370449678800856, "grad_norm": 0.2430831935717076, "learning_rate": 3.4523809523809526e-05, "loss": 0.3689, "step": 531 }, { "epoch": 1.139186295503212, "grad_norm": 0.2017982939822363, "learning_rate": 3.448412698412699e-05, "loss": 0.3508, "step": 532 }, { "epoch": 1.1413276231263383, "grad_norm": 0.26746451866780874, "learning_rate": 3.444444444444445e-05, "loss": 0.3695, "step": 533 }, { "epoch": 1.1434689507494646, "grad_norm": 0.24736040419305078, "learning_rate": 3.440476190476191e-05, "loss": 0.3604, "step": 534 }, { "epoch": 1.145610278372591, "grad_norm": 0.23139512163454942, "learning_rate": 3.436507936507937e-05, "loss": 0.3597, "step": 535 }, { "epoch": 1.1477516059957173, "grad_norm": 0.24151228114318973, "learning_rate": 3.432539682539683e-05, "loss": 0.3722, "step": 536 }, { "epoch": 1.1498929336188437, "grad_norm": 0.225767578077691, "learning_rate": 3.428571428571429e-05, "loss": 0.3852, "step": 537 }, { "epoch": 1.15203426124197, "grad_norm": 0.2379372088035776, "learning_rate": 3.424603174603175e-05, "loss": 0.377, "step": 538 }, { "epoch": 1.1541755888650964, "grad_norm": 0.22031199802513296, "learning_rate": 3.420634920634921e-05, "loss": 0.3767, "step": 539 }, { "epoch": 1.1563169164882228, "grad_norm": 0.23132051552867036, "learning_rate": 3.4166666666666666e-05, "loss": 0.3528, "step": 540 }, { "epoch": 1.1584582441113491, "grad_norm": 0.24257151344704514, "learning_rate": 3.412698412698413e-05, "loss": 0.3595, "step": 541 }, { "epoch": 1.1605995717344753, "grad_norm": 0.2261792555412017, "learning_rate": 3.408730158730159e-05, "loss": 0.3633, "step": 542 }, { "epoch": 1.1627408993576016, "grad_norm": 0.2829561917904997, "learning_rate": 3.404761904761905e-05, "loss": 0.3648, "step": 543 }, { "epoch": 1.164882226980728, "grad_norm": 0.25098056566365634, "learning_rate": 3.400793650793651e-05, "loss": 0.3702, "step": 544 }, { "epoch": 1.1670235546038543, "grad_norm": 0.24644300251595483, "learning_rate": 3.396825396825397e-05, "loss": 0.3641, "step": 545 }, { "epoch": 1.1691648822269807, "grad_norm": 0.23310081121807147, "learning_rate": 3.392857142857143e-05, "loss": 0.3724, "step": 546 }, { "epoch": 1.171306209850107, "grad_norm": 0.2396918335180637, "learning_rate": 3.388888888888889e-05, "loss": 0.3724, "step": 547 }, { "epoch": 1.1734475374732334, "grad_norm": 0.26305119086996215, "learning_rate": 3.384920634920635e-05, "loss": 0.3663, "step": 548 }, { "epoch": 1.1755888650963597, "grad_norm": 0.23766315816254158, "learning_rate": 3.380952380952381e-05, "loss": 0.3543, "step": 549 }, { "epoch": 1.177730192719486, "grad_norm": 0.24791689978321688, "learning_rate": 3.3769841269841273e-05, "loss": 0.3858, "step": 550 }, { "epoch": 1.1798715203426124, "grad_norm": 0.2427021752634518, "learning_rate": 3.3730158730158734e-05, "loss": 0.3544, "step": 551 }, { "epoch": 1.1820128479657388, "grad_norm": 0.2524229855590501, "learning_rate": 3.3690476190476195e-05, "loss": 0.3932, "step": 552 }, { "epoch": 1.1841541755888652, "grad_norm": 0.2208689079418904, "learning_rate": 3.3650793650793656e-05, "loss": 0.3638, "step": 553 }, { "epoch": 1.1862955032119915, "grad_norm": 0.24091697133765969, "learning_rate": 3.3611111111111116e-05, "loss": 0.351, "step": 554 }, { "epoch": 1.1884368308351179, "grad_norm": 0.24197830327776518, "learning_rate": 3.357142857142857e-05, "loss": 0.3596, "step": 555 }, { "epoch": 1.1905781584582442, "grad_norm": 0.19888129082318382, "learning_rate": 3.353174603174603e-05, "loss": 0.3643, "step": 556 }, { "epoch": 1.1927194860813706, "grad_norm": 0.26369169905142914, "learning_rate": 3.349206349206349e-05, "loss": 0.3605, "step": 557 }, { "epoch": 1.1948608137044967, "grad_norm": 0.2532299216339328, "learning_rate": 3.345238095238095e-05, "loss": 0.3878, "step": 558 }, { "epoch": 1.197002141327623, "grad_norm": 0.23397198291097293, "learning_rate": 3.3412698412698413e-05, "loss": 0.3892, "step": 559 }, { "epoch": 1.1991434689507494, "grad_norm": 0.3051071879009391, "learning_rate": 3.3373015873015874e-05, "loss": 0.3857, "step": 560 }, { "epoch": 1.2012847965738758, "grad_norm": 0.2415676269852608, "learning_rate": 3.3333333333333335e-05, "loss": 0.3529, "step": 561 }, { "epoch": 1.2034261241970021, "grad_norm": 0.2156997194979025, "learning_rate": 3.3293650793650796e-05, "loss": 0.3618, "step": 562 }, { "epoch": 1.2055674518201285, "grad_norm": 0.24244616134298602, "learning_rate": 3.3253968253968256e-05, "loss": 0.3528, "step": 563 }, { "epoch": 1.2077087794432548, "grad_norm": 0.2675996346388178, "learning_rate": 3.321428571428572e-05, "loss": 0.3716, "step": 564 }, { "epoch": 1.2098501070663812, "grad_norm": 0.22977064909622621, "learning_rate": 3.317460317460318e-05, "loss": 0.3651, "step": 565 }, { "epoch": 1.2119914346895075, "grad_norm": 0.22289828933891506, "learning_rate": 3.313492063492064e-05, "loss": 0.3578, "step": 566 }, { "epoch": 1.214132762312634, "grad_norm": 0.24907945525188446, "learning_rate": 3.309523809523809e-05, "loss": 0.3803, "step": 567 }, { "epoch": 1.2162740899357602, "grad_norm": 0.21260828983218635, "learning_rate": 3.3055555555555553e-05, "loss": 0.3823, "step": 568 }, { "epoch": 1.2184154175588866, "grad_norm": 0.2444036303892165, "learning_rate": 3.3015873015873014e-05, "loss": 0.3873, "step": 569 }, { "epoch": 1.2205567451820127, "grad_norm": 0.27044980470468927, "learning_rate": 3.2976190476190475e-05, "loss": 0.3796, "step": 570 }, { "epoch": 1.222698072805139, "grad_norm": 0.23806387274834903, "learning_rate": 3.2936507936507936e-05, "loss": 0.3606, "step": 571 }, { "epoch": 1.2248394004282654, "grad_norm": 0.2541644994496726, "learning_rate": 3.2896825396825396e-05, "loss": 0.3795, "step": 572 }, { "epoch": 1.2269807280513918, "grad_norm": 0.2726575491057458, "learning_rate": 3.285714285714286e-05, "loss": 0.3806, "step": 573 }, { "epoch": 1.2291220556745182, "grad_norm": 0.23105651670608096, "learning_rate": 3.281746031746032e-05, "loss": 0.3692, "step": 574 }, { "epoch": 1.2312633832976445, "grad_norm": 0.23853407830325385, "learning_rate": 3.277777777777778e-05, "loss": 0.3701, "step": 575 }, { "epoch": 1.2334047109207709, "grad_norm": 0.2550243654541794, "learning_rate": 3.273809523809524e-05, "loss": 0.3696, "step": 576 }, { "epoch": 1.2355460385438972, "grad_norm": 0.23511204104744063, "learning_rate": 3.26984126984127e-05, "loss": 0.3853, "step": 577 }, { "epoch": 1.2376873661670236, "grad_norm": 0.26214275864155384, "learning_rate": 3.265873015873016e-05, "loss": 0.3678, "step": 578 }, { "epoch": 1.23982869379015, "grad_norm": 0.23149724483459302, "learning_rate": 3.261904761904762e-05, "loss": 0.3584, "step": 579 }, { "epoch": 1.2419700214132763, "grad_norm": 0.2085157601025498, "learning_rate": 3.257936507936508e-05, "loss": 0.3497, "step": 580 }, { "epoch": 1.2441113490364026, "grad_norm": 0.24198910705782214, "learning_rate": 3.253968253968254e-05, "loss": 0.3712, "step": 581 }, { "epoch": 1.246252676659529, "grad_norm": 0.23194060619444556, "learning_rate": 3.2500000000000004e-05, "loss": 0.3653, "step": 582 }, { "epoch": 1.2483940042826553, "grad_norm": 0.21932550329816566, "learning_rate": 3.2460317460317465e-05, "loss": 0.36, "step": 583 }, { "epoch": 1.2505353319057817, "grad_norm": 0.2485536260156264, "learning_rate": 3.2420634920634925e-05, "loss": 0.3557, "step": 584 }, { "epoch": 1.252676659528908, "grad_norm": 0.21298281815390674, "learning_rate": 3.2380952380952386e-05, "loss": 0.3768, "step": 585 }, { "epoch": 1.2548179871520342, "grad_norm": 0.2308383054301672, "learning_rate": 3.234126984126985e-05, "loss": 0.3613, "step": 586 }, { "epoch": 1.2569593147751605, "grad_norm": 0.2500705573168694, "learning_rate": 3.230158730158731e-05, "loss": 0.3672, "step": 587 }, { "epoch": 1.259100642398287, "grad_norm": 0.24932919856158853, "learning_rate": 3.226190476190477e-05, "loss": 0.3664, "step": 588 }, { "epoch": 1.2612419700214133, "grad_norm": 0.21114923566401123, "learning_rate": 3.222222222222223e-05, "loss": 0.3654, "step": 589 }, { "epoch": 1.2633832976445396, "grad_norm": 0.26655876159854425, "learning_rate": 3.218253968253968e-05, "loss": 0.376, "step": 590 }, { "epoch": 1.265524625267666, "grad_norm": 0.22494122561905386, "learning_rate": 3.2142857142857144e-05, "loss": 0.366, "step": 591 }, { "epoch": 1.2676659528907923, "grad_norm": 0.24850515269385678, "learning_rate": 3.2103174603174605e-05, "loss": 0.3814, "step": 592 }, { "epoch": 1.2698072805139187, "grad_norm": 0.2359476164199983, "learning_rate": 3.2063492063492065e-05, "loss": 0.3876, "step": 593 }, { "epoch": 1.271948608137045, "grad_norm": 0.2170060923253427, "learning_rate": 3.202380952380952e-05, "loss": 0.3662, "step": 594 }, { "epoch": 1.2740899357601714, "grad_norm": 0.2504062463730666, "learning_rate": 3.198412698412698e-05, "loss": 0.3753, "step": 595 }, { "epoch": 1.2762312633832975, "grad_norm": 0.20106817241986427, "learning_rate": 3.194444444444444e-05, "loss": 0.3592, "step": 596 }, { "epoch": 1.2783725910064239, "grad_norm": 0.21200714791632813, "learning_rate": 3.19047619047619e-05, "loss": 0.3509, "step": 597 }, { "epoch": 1.2805139186295502, "grad_norm": 0.21778732718260096, "learning_rate": 3.186507936507936e-05, "loss": 0.3584, "step": 598 }, { "epoch": 1.2826552462526766, "grad_norm": 0.2189934520948945, "learning_rate": 3.182539682539682e-05, "loss": 0.3549, "step": 599 }, { "epoch": 1.284796573875803, "grad_norm": 0.2208172986950504, "learning_rate": 3.1785714285714284e-05, "loss": 0.3571, "step": 600 }, { "epoch": 1.2869379014989293, "grad_norm": 0.22576417970064774, "learning_rate": 3.1746031746031745e-05, "loss": 0.3525, "step": 601 }, { "epoch": 1.2890792291220556, "grad_norm": 0.27692988742074415, "learning_rate": 3.1706349206349205e-05, "loss": 0.3764, "step": 602 }, { "epoch": 1.291220556745182, "grad_norm": 0.21396523695563535, "learning_rate": 3.1666666666666666e-05, "loss": 0.3733, "step": 603 }, { "epoch": 1.2933618843683083, "grad_norm": 0.23416094155920952, "learning_rate": 3.162698412698413e-05, "loss": 0.3559, "step": 604 }, { "epoch": 1.2955032119914347, "grad_norm": 0.2486860214774819, "learning_rate": 3.158730158730159e-05, "loss": 0.3562, "step": 605 }, { "epoch": 1.297644539614561, "grad_norm": 0.2201774327190655, "learning_rate": 3.154761904761905e-05, "loss": 0.3716, "step": 606 }, { "epoch": 1.2997858672376874, "grad_norm": 0.1964840066863156, "learning_rate": 3.150793650793651e-05, "loss": 0.3542, "step": 607 }, { "epoch": 1.3019271948608138, "grad_norm": 0.22767951673276937, "learning_rate": 3.146825396825397e-05, "loss": 0.3612, "step": 608 }, { "epoch": 1.3040685224839401, "grad_norm": 0.22557177823924723, "learning_rate": 3.142857142857143e-05, "loss": 0.3603, "step": 609 }, { "epoch": 1.3062098501070665, "grad_norm": 0.27527013455606647, "learning_rate": 3.138888888888889e-05, "loss": 0.3502, "step": 610 }, { "epoch": 1.3083511777301928, "grad_norm": 0.23325579262107884, "learning_rate": 3.134920634920635e-05, "loss": 0.3631, "step": 611 }, { "epoch": 1.3104925053533192, "grad_norm": 0.22097147014315746, "learning_rate": 3.130952380952381e-05, "loss": 0.3511, "step": 612 }, { "epoch": 1.3126338329764453, "grad_norm": 0.2174864072196349, "learning_rate": 3.1269841269841274e-05, "loss": 0.3672, "step": 613 }, { "epoch": 1.3147751605995717, "grad_norm": 0.2106746804771801, "learning_rate": 3.1230158730158734e-05, "loss": 0.3612, "step": 614 }, { "epoch": 1.316916488222698, "grad_norm": 0.21088050552854007, "learning_rate": 3.1190476190476195e-05, "loss": 0.3471, "step": 615 }, { "epoch": 1.3190578158458244, "grad_norm": 0.2580125309103289, "learning_rate": 3.1150793650793656e-05, "loss": 0.3741, "step": 616 }, { "epoch": 1.3211991434689507, "grad_norm": 0.21505898974697335, "learning_rate": 3.111111111111111e-05, "loss": 0.3763, "step": 617 }, { "epoch": 1.323340471092077, "grad_norm": 0.2465748563264946, "learning_rate": 3.107142857142857e-05, "loss": 0.3766, "step": 618 }, { "epoch": 1.3254817987152034, "grad_norm": 0.3321013586284405, "learning_rate": 3.103174603174603e-05, "loss": 0.3955, "step": 619 }, { "epoch": 1.3276231263383298, "grad_norm": 0.22495906786878164, "learning_rate": 3.099206349206349e-05, "loss": 0.3718, "step": 620 }, { "epoch": 1.3297644539614561, "grad_norm": 0.23719074429579814, "learning_rate": 3.095238095238095e-05, "loss": 0.3544, "step": 621 }, { "epoch": 1.3319057815845825, "grad_norm": 0.23831973310054141, "learning_rate": 3.0912698412698414e-05, "loss": 0.3795, "step": 622 }, { "epoch": 1.3340471092077089, "grad_norm": 0.2128688128559845, "learning_rate": 3.0873015873015874e-05, "loss": 0.3668, "step": 623 }, { "epoch": 1.336188436830835, "grad_norm": 0.22961487193589208, "learning_rate": 3.0833333333333335e-05, "loss": 0.366, "step": 624 }, { "epoch": 1.3383297644539613, "grad_norm": 0.2214674128035989, "learning_rate": 3.0793650793650796e-05, "loss": 0.365, "step": 625 }, { "epoch": 1.3404710920770877, "grad_norm": 0.20498199401783132, "learning_rate": 3.075396825396826e-05, "loss": 0.3556, "step": 626 }, { "epoch": 1.342612419700214, "grad_norm": 0.22359296981746604, "learning_rate": 3.071428571428572e-05, "loss": 0.3812, "step": 627 }, { "epoch": 1.3447537473233404, "grad_norm": 0.21653621158585087, "learning_rate": 3.067460317460318e-05, "loss": 0.3797, "step": 628 }, { "epoch": 1.3468950749464668, "grad_norm": 0.22549871106519903, "learning_rate": 3.063492063492064e-05, "loss": 0.3596, "step": 629 }, { "epoch": 1.3490364025695931, "grad_norm": 0.20475922109140837, "learning_rate": 3.05952380952381e-05, "loss": 0.3735, "step": 630 }, { "epoch": 1.3511777301927195, "grad_norm": 0.209763601459537, "learning_rate": 3.055555555555556e-05, "loss": 0.3709, "step": 631 }, { "epoch": 1.3533190578158458, "grad_norm": 0.2176748492097036, "learning_rate": 3.051587301587302e-05, "loss": 0.3808, "step": 632 }, { "epoch": 1.3554603854389722, "grad_norm": 0.21622906752623547, "learning_rate": 3.0476190476190482e-05, "loss": 0.3604, "step": 633 }, { "epoch": 1.3576017130620985, "grad_norm": 0.20063962917481207, "learning_rate": 3.0436507936507936e-05, "loss": 0.3546, "step": 634 }, { "epoch": 1.359743040685225, "grad_norm": 0.20999519152589322, "learning_rate": 3.0396825396825397e-05, "loss": 0.3597, "step": 635 }, { "epoch": 1.3618843683083512, "grad_norm": 0.20927897537878964, "learning_rate": 3.0357142857142857e-05, "loss": 0.3729, "step": 636 }, { "epoch": 1.3640256959314776, "grad_norm": 0.21686713042567732, "learning_rate": 3.0317460317460318e-05, "loss": 0.3653, "step": 637 }, { "epoch": 1.366167023554604, "grad_norm": 0.21754952121551016, "learning_rate": 3.0277777777777776e-05, "loss": 0.3677, "step": 638 }, { "epoch": 1.3683083511777303, "grad_norm": 0.20450902968020848, "learning_rate": 3.0238095238095236e-05, "loss": 0.3355, "step": 639 }, { "epoch": 1.3704496788008567, "grad_norm": 0.21623695248881408, "learning_rate": 3.0198412698412697e-05, "loss": 0.3806, "step": 640 }, { "epoch": 1.3725910064239828, "grad_norm": 0.21537383605412733, "learning_rate": 3.0158730158730158e-05, "loss": 0.3681, "step": 641 }, { "epoch": 1.3747323340471092, "grad_norm": 0.19837937617514323, "learning_rate": 3.011904761904762e-05, "loss": 0.3622, "step": 642 }, { "epoch": 1.3768736616702355, "grad_norm": 0.209108602319792, "learning_rate": 3.007936507936508e-05, "loss": 0.3484, "step": 643 }, { "epoch": 1.3790149892933619, "grad_norm": 0.2062194560604121, "learning_rate": 3.003968253968254e-05, "loss": 0.3702, "step": 644 }, { "epoch": 1.3811563169164882, "grad_norm": 0.2080771057237384, "learning_rate": 3e-05, "loss": 0.362, "step": 645 }, { "epoch": 1.3832976445396146, "grad_norm": 0.2227326233506996, "learning_rate": 2.996031746031746e-05, "loss": 0.3701, "step": 646 }, { "epoch": 1.385438972162741, "grad_norm": 0.2089744724523155, "learning_rate": 2.9920634920634922e-05, "loss": 0.361, "step": 647 }, { "epoch": 1.3875802997858673, "grad_norm": 0.25404638727035117, "learning_rate": 2.9880952380952383e-05, "loss": 0.3629, "step": 648 }, { "epoch": 1.3897216274089936, "grad_norm": 0.21442147488950894, "learning_rate": 2.9841269841269844e-05, "loss": 0.3513, "step": 649 }, { "epoch": 1.39186295503212, "grad_norm": 0.2481654405500989, "learning_rate": 2.98015873015873e-05, "loss": 0.3662, "step": 650 }, { "epoch": 1.3940042826552461, "grad_norm": 0.22382398124617614, "learning_rate": 2.9761904761904762e-05, "loss": 0.3687, "step": 651 }, { "epoch": 1.3961456102783725, "grad_norm": 0.31356212838373204, "learning_rate": 2.9722222222222223e-05, "loss": 0.3795, "step": 652 }, { "epoch": 1.3982869379014988, "grad_norm": 0.2429332621043318, "learning_rate": 2.9682539682539683e-05, "loss": 0.373, "step": 653 }, { "epoch": 1.4004282655246252, "grad_norm": 0.2249979699253431, "learning_rate": 2.9642857142857144e-05, "loss": 0.3676, "step": 654 }, { "epoch": 1.4025695931477515, "grad_norm": 0.2560531887036681, "learning_rate": 2.9603174603174605e-05, "loss": 0.3637, "step": 655 }, { "epoch": 1.404710920770878, "grad_norm": 0.24116508821199556, "learning_rate": 2.9563492063492066e-05, "loss": 0.3707, "step": 656 }, { "epoch": 1.4068522483940042, "grad_norm": 0.2598274205324474, "learning_rate": 2.9523809523809526e-05, "loss": 0.379, "step": 657 }, { "epoch": 1.4089935760171306, "grad_norm": 0.21575521660148642, "learning_rate": 2.9484126984126987e-05, "loss": 0.376, "step": 658 }, { "epoch": 1.411134903640257, "grad_norm": 0.22041851374967822, "learning_rate": 2.9444444444444448e-05, "loss": 0.3607, "step": 659 }, { "epoch": 1.4132762312633833, "grad_norm": 0.2564088225056767, "learning_rate": 2.940476190476191e-05, "loss": 0.3599, "step": 660 }, { "epoch": 1.4154175588865097, "grad_norm": 0.20549406527051656, "learning_rate": 2.9365079365079366e-05, "loss": 0.3657, "step": 661 }, { "epoch": 1.417558886509636, "grad_norm": 0.2540118149516082, "learning_rate": 2.9325396825396827e-05, "loss": 0.368, "step": 662 }, { "epoch": 1.4197002141327624, "grad_norm": 0.2648103833326966, "learning_rate": 2.9285714285714288e-05, "loss": 0.3689, "step": 663 }, { "epoch": 1.4218415417558887, "grad_norm": 0.24077539771530324, "learning_rate": 2.9246031746031748e-05, "loss": 0.351, "step": 664 }, { "epoch": 1.423982869379015, "grad_norm": 0.24569673366814643, "learning_rate": 2.920634920634921e-05, "loss": 0.3595, "step": 665 }, { "epoch": 1.4261241970021414, "grad_norm": 0.22436619156977872, "learning_rate": 2.916666666666667e-05, "loss": 0.3636, "step": 666 }, { "epoch": 1.4282655246252678, "grad_norm": 0.22702096393924773, "learning_rate": 2.912698412698413e-05, "loss": 0.3577, "step": 667 }, { "epoch": 1.430406852248394, "grad_norm": 0.21223624922722298, "learning_rate": 2.908730158730159e-05, "loss": 0.3677, "step": 668 }, { "epoch": 1.4325481798715203, "grad_norm": 0.2474621671117468, "learning_rate": 2.9047619047619052e-05, "loss": 0.3632, "step": 669 }, { "epoch": 1.4346895074946466, "grad_norm": 0.25684655588636496, "learning_rate": 2.9007936507936513e-05, "loss": 0.3592, "step": 670 }, { "epoch": 1.436830835117773, "grad_norm": 0.25072866337855965, "learning_rate": 2.8968253968253974e-05, "loss": 0.3598, "step": 671 }, { "epoch": 1.4389721627408993, "grad_norm": 0.22582476526482667, "learning_rate": 2.8928571428571434e-05, "loss": 0.3577, "step": 672 }, { "epoch": 1.4411134903640257, "grad_norm": 0.28658549817719403, "learning_rate": 2.8888888888888888e-05, "loss": 0.3839, "step": 673 }, { "epoch": 1.443254817987152, "grad_norm": 0.25430283807134485, "learning_rate": 2.884920634920635e-05, "loss": 0.3643, "step": 674 }, { "epoch": 1.4453961456102784, "grad_norm": 0.22951249670225976, "learning_rate": 2.880952380952381e-05, "loss": 0.3692, "step": 675 }, { "epoch": 1.4475374732334048, "grad_norm": 0.23388364802048373, "learning_rate": 2.876984126984127e-05, "loss": 0.3708, "step": 676 }, { "epoch": 1.4496788008565311, "grad_norm": 0.21976733689801523, "learning_rate": 2.8730158730158728e-05, "loss": 0.3802, "step": 677 }, { "epoch": 1.4518201284796575, "grad_norm": 0.21027240355399024, "learning_rate": 2.869047619047619e-05, "loss": 0.3773, "step": 678 }, { "epoch": 1.4539614561027836, "grad_norm": 0.20199987883594428, "learning_rate": 2.865079365079365e-05, "loss": 0.3637, "step": 679 }, { "epoch": 1.45610278372591, "grad_norm": 0.21923422597696604, "learning_rate": 2.861111111111111e-05, "loss": 0.3619, "step": 680 }, { "epoch": 1.4582441113490363, "grad_norm": 0.21000907335559735, "learning_rate": 2.857142857142857e-05, "loss": 0.3536, "step": 681 }, { "epoch": 1.4603854389721627, "grad_norm": 0.21711620369351062, "learning_rate": 2.853174603174603e-05, "loss": 0.3738, "step": 682 }, { "epoch": 1.462526766595289, "grad_norm": 0.22565854928185133, "learning_rate": 2.8492063492063492e-05, "loss": 0.3608, "step": 683 }, { "epoch": 1.4646680942184154, "grad_norm": 0.24796414653671942, "learning_rate": 2.8452380952380953e-05, "loss": 0.3859, "step": 684 }, { "epoch": 1.4668094218415417, "grad_norm": 0.19936447407500923, "learning_rate": 2.8412698412698414e-05, "loss": 0.3772, "step": 685 }, { "epoch": 1.468950749464668, "grad_norm": 0.2659759151283577, "learning_rate": 2.8373015873015875e-05, "loss": 0.3754, "step": 686 }, { "epoch": 1.4710920770877944, "grad_norm": 0.2142679975770448, "learning_rate": 2.8333333333333335e-05, "loss": 0.3705, "step": 687 }, { "epoch": 1.4732334047109208, "grad_norm": 0.21126669286906707, "learning_rate": 2.8293650793650793e-05, "loss": 0.3831, "step": 688 }, { "epoch": 1.4753747323340471, "grad_norm": 0.24217810422670202, "learning_rate": 2.8253968253968253e-05, "loss": 0.3549, "step": 689 }, { "epoch": 1.4775160599571735, "grad_norm": 0.21568420422411547, "learning_rate": 2.8214285714285714e-05, "loss": 0.3693, "step": 690 }, { "epoch": 1.4796573875802999, "grad_norm": 0.2312299790821551, "learning_rate": 2.8174603174603175e-05, "loss": 0.3764, "step": 691 }, { "epoch": 1.4817987152034262, "grad_norm": 0.233754289817078, "learning_rate": 2.8134920634920636e-05, "loss": 0.3805, "step": 692 }, { "epoch": 1.4839400428265526, "grad_norm": 0.2423944727011651, "learning_rate": 2.8095238095238096e-05, "loss": 0.3636, "step": 693 }, { "epoch": 1.486081370449679, "grad_norm": 0.22905460391929347, "learning_rate": 2.8055555555555557e-05, "loss": 0.3895, "step": 694 }, { "epoch": 1.4882226980728053, "grad_norm": 0.23214001332154588, "learning_rate": 2.8015873015873018e-05, "loss": 0.3605, "step": 695 }, { "epoch": 1.4903640256959314, "grad_norm": 0.19690116100293087, "learning_rate": 2.797619047619048e-05, "loss": 0.3582, "step": 696 }, { "epoch": 1.4925053533190578, "grad_norm": 0.24573029483590722, "learning_rate": 2.793650793650794e-05, "loss": 0.3712, "step": 697 }, { "epoch": 1.4946466809421841, "grad_norm": 0.19231425648369746, "learning_rate": 2.78968253968254e-05, "loss": 0.3497, "step": 698 }, { "epoch": 1.4967880085653105, "grad_norm": 0.22612270059401912, "learning_rate": 2.785714285714286e-05, "loss": 0.3417, "step": 699 }, { "epoch": 1.4989293361884368, "grad_norm": 0.22960092397135454, "learning_rate": 2.781746031746032e-05, "loss": 0.3575, "step": 700 }, { "epoch": 1.5010706638115632, "grad_norm": 0.23684421828804095, "learning_rate": 2.777777777777778e-05, "loss": 0.3786, "step": 701 }, { "epoch": 1.5032119914346895, "grad_norm": 0.2191044054358468, "learning_rate": 2.773809523809524e-05, "loss": 0.3549, "step": 702 }, { "epoch": 1.5053533190578159, "grad_norm": 0.24248023653906947, "learning_rate": 2.76984126984127e-05, "loss": 0.3558, "step": 703 }, { "epoch": 1.507494646680942, "grad_norm": 0.24648400748932908, "learning_rate": 2.765873015873016e-05, "loss": 0.3657, "step": 704 }, { "epoch": 1.5096359743040684, "grad_norm": 0.23645047152874282, "learning_rate": 2.7619047619047622e-05, "loss": 0.3704, "step": 705 }, { "epoch": 1.5117773019271947, "grad_norm": 0.25487174415303726, "learning_rate": 2.7579365079365083e-05, "loss": 0.3582, "step": 706 }, { "epoch": 1.513918629550321, "grad_norm": 0.2331120266364236, "learning_rate": 2.7539682539682544e-05, "loss": 0.3699, "step": 707 }, { "epoch": 1.5160599571734474, "grad_norm": 0.2181485815951784, "learning_rate": 2.7500000000000004e-05, "loss": 0.3751, "step": 708 }, { "epoch": 1.5182012847965738, "grad_norm": 0.24072328359741854, "learning_rate": 2.7460317460317465e-05, "loss": 0.3775, "step": 709 }, { "epoch": 1.5203426124197001, "grad_norm": 0.22869430663467058, "learning_rate": 2.7420634920634926e-05, "loss": 0.3724, "step": 710 }, { "epoch": 1.5224839400428265, "grad_norm": 0.21918246335255903, "learning_rate": 2.7380952380952383e-05, "loss": 0.3547, "step": 711 }, { "epoch": 1.5246252676659529, "grad_norm": 0.2189822768505496, "learning_rate": 2.734126984126984e-05, "loss": 0.3535, "step": 712 }, { "epoch": 1.5267665952890792, "grad_norm": 0.2526005237695246, "learning_rate": 2.73015873015873e-05, "loss": 0.3713, "step": 713 }, { "epoch": 1.5289079229122056, "grad_norm": 0.22601185946675514, "learning_rate": 2.7261904761904762e-05, "loss": 0.3675, "step": 714 }, { "epoch": 1.531049250535332, "grad_norm": 0.20669359644643262, "learning_rate": 2.7222222222222223e-05, "loss": 0.3659, "step": 715 }, { "epoch": 1.5331905781584583, "grad_norm": 0.24031352892120264, "learning_rate": 2.718253968253968e-05, "loss": 0.3602, "step": 716 }, { "epoch": 1.5353319057815846, "grad_norm": 0.22795507587171882, "learning_rate": 2.714285714285714e-05, "loss": 0.3709, "step": 717 }, { "epoch": 1.537473233404711, "grad_norm": 0.20536455549527732, "learning_rate": 2.7103174603174602e-05, "loss": 0.3799, "step": 718 }, { "epoch": 1.5396145610278373, "grad_norm": 0.2943404251055241, "learning_rate": 2.7063492063492062e-05, "loss": 0.3836, "step": 719 }, { "epoch": 1.5417558886509637, "grad_norm": 0.22205668356720148, "learning_rate": 2.7023809523809523e-05, "loss": 0.3664, "step": 720 }, { "epoch": 1.54389721627409, "grad_norm": 0.21954728948498192, "learning_rate": 2.6984126984126984e-05, "loss": 0.3647, "step": 721 }, { "epoch": 1.5460385438972164, "grad_norm": 0.2280345060624478, "learning_rate": 2.6944444444444445e-05, "loss": 0.3713, "step": 722 }, { "epoch": 1.5481798715203428, "grad_norm": 0.22861539448823806, "learning_rate": 2.6904761904761905e-05, "loss": 0.3888, "step": 723 }, { "epoch": 1.550321199143469, "grad_norm": 0.19425559114407048, "learning_rate": 2.6865079365079366e-05, "loss": 0.3739, "step": 724 }, { "epoch": 1.5524625267665952, "grad_norm": 0.24288323051735908, "learning_rate": 2.6825396825396827e-05, "loss": 0.355, "step": 725 }, { "epoch": 1.5546038543897216, "grad_norm": 0.22443141683817264, "learning_rate": 2.6785714285714288e-05, "loss": 0.3888, "step": 726 }, { "epoch": 1.556745182012848, "grad_norm": 0.2182399943573036, "learning_rate": 2.6746031746031745e-05, "loss": 0.3624, "step": 727 }, { "epoch": 1.5588865096359743, "grad_norm": 0.23168062600285244, "learning_rate": 2.6706349206349206e-05, "loss": 0.3585, "step": 728 }, { "epoch": 1.5610278372591007, "grad_norm": 0.21591657004920606, "learning_rate": 2.6666666666666667e-05, "loss": 0.3543, "step": 729 }, { "epoch": 1.563169164882227, "grad_norm": 0.21509414063611806, "learning_rate": 2.6626984126984127e-05, "loss": 0.369, "step": 730 }, { "epoch": 1.5653104925053534, "grad_norm": 0.21543923122674843, "learning_rate": 2.6587301587301588e-05, "loss": 0.3582, "step": 731 }, { "epoch": 1.5674518201284795, "grad_norm": 0.20613757156954682, "learning_rate": 2.654761904761905e-05, "loss": 0.3564, "step": 732 }, { "epoch": 1.5695931477516059, "grad_norm": 0.2068022698111163, "learning_rate": 2.650793650793651e-05, "loss": 0.3714, "step": 733 }, { "epoch": 1.5717344753747322, "grad_norm": 0.19705404333823645, "learning_rate": 2.646825396825397e-05, "loss": 0.3648, "step": 734 }, { "epoch": 1.5738758029978586, "grad_norm": 0.20492416067065145, "learning_rate": 2.642857142857143e-05, "loss": 0.36, "step": 735 }, { "epoch": 1.576017130620985, "grad_norm": 0.20508644827871644, "learning_rate": 2.6388888888888892e-05, "loss": 0.3874, "step": 736 }, { "epoch": 1.5781584582441113, "grad_norm": 0.20727086230283992, "learning_rate": 2.6349206349206353e-05, "loss": 0.3641, "step": 737 }, { "epoch": 1.5802997858672376, "grad_norm": 0.21066080929705655, "learning_rate": 2.6309523809523813e-05, "loss": 0.3775, "step": 738 }, { "epoch": 1.582441113490364, "grad_norm": 0.1858511474413401, "learning_rate": 2.626984126984127e-05, "loss": 0.3493, "step": 739 }, { "epoch": 1.5845824411134903, "grad_norm": 0.1948727545134536, "learning_rate": 2.623015873015873e-05, "loss": 0.3604, "step": 740 }, { "epoch": 1.5867237687366167, "grad_norm": 0.18377772006229867, "learning_rate": 2.6190476190476192e-05, "loss": 0.3656, "step": 741 }, { "epoch": 1.588865096359743, "grad_norm": 0.20759171733907542, "learning_rate": 2.6150793650793653e-05, "loss": 0.36, "step": 742 }, { "epoch": 1.5910064239828694, "grad_norm": 0.20021552648301658, "learning_rate": 2.6111111111111114e-05, "loss": 0.3672, "step": 743 }, { "epoch": 1.5931477516059958, "grad_norm": 0.1987003787234111, "learning_rate": 2.6071428571428574e-05, "loss": 0.3697, "step": 744 }, { "epoch": 1.595289079229122, "grad_norm": 0.1992569458099437, "learning_rate": 2.6031746031746035e-05, "loss": 0.3722, "step": 745 }, { "epoch": 1.5974304068522485, "grad_norm": 0.19490857264680375, "learning_rate": 2.5992063492063496e-05, "loss": 0.3487, "step": 746 }, { "epoch": 1.5995717344753748, "grad_norm": 0.20992568333120643, "learning_rate": 2.5952380952380957e-05, "loss": 0.3721, "step": 747 }, { "epoch": 1.6017130620985012, "grad_norm": 0.20903627954075446, "learning_rate": 2.5912698412698417e-05, "loss": 0.3645, "step": 748 }, { "epoch": 1.6038543897216275, "grad_norm": 0.2206708938047425, "learning_rate": 2.5873015873015878e-05, "loss": 0.3596, "step": 749 }, { "epoch": 1.6059957173447539, "grad_norm": 0.2315828550558367, "learning_rate": 2.5833333333333336e-05, "loss": 0.3833, "step": 750 }, { "epoch": 1.6081370449678802, "grad_norm": 0.20729043706506806, "learning_rate": 2.5793650793650796e-05, "loss": 0.3676, "step": 751 }, { "epoch": 1.6102783725910066, "grad_norm": 0.21304496616203045, "learning_rate": 2.5753968253968254e-05, "loss": 0.3692, "step": 752 }, { "epoch": 1.6124197002141327, "grad_norm": 0.2066888063019116, "learning_rate": 2.5714285714285714e-05, "loss": 0.3513, "step": 753 }, { "epoch": 1.614561027837259, "grad_norm": 0.20035669627587474, "learning_rate": 2.5674603174603172e-05, "loss": 0.3625, "step": 754 }, { "epoch": 1.6167023554603854, "grad_norm": 0.21327760276750352, "learning_rate": 2.5634920634920633e-05, "loss": 0.3642, "step": 755 }, { "epoch": 1.6188436830835118, "grad_norm": 0.2144247994926133, "learning_rate": 2.5595238095238093e-05, "loss": 0.378, "step": 756 }, { "epoch": 1.6209850107066381, "grad_norm": 0.21826292313986448, "learning_rate": 2.5555555555555554e-05, "loss": 0.3682, "step": 757 }, { "epoch": 1.6231263383297645, "grad_norm": 0.2112350938263051, "learning_rate": 2.5515873015873015e-05, "loss": 0.3737, "step": 758 }, { "epoch": 1.6252676659528906, "grad_norm": 0.22455328732151775, "learning_rate": 2.5476190476190476e-05, "loss": 0.3654, "step": 759 }, { "epoch": 1.627408993576017, "grad_norm": 0.18501213734273322, "learning_rate": 2.5436507936507936e-05, "loss": 0.3383, "step": 760 }, { "epoch": 1.6295503211991433, "grad_norm": 0.20942821521960064, "learning_rate": 2.5396825396825397e-05, "loss": 0.3583, "step": 761 }, { "epoch": 1.6316916488222697, "grad_norm": 0.22203677151968088, "learning_rate": 2.5357142857142858e-05, "loss": 0.37, "step": 762 }, { "epoch": 1.633832976445396, "grad_norm": 0.22074729844028526, "learning_rate": 2.531746031746032e-05, "loss": 0.3555, "step": 763 }, { "epoch": 1.6359743040685224, "grad_norm": 0.24745475269473763, "learning_rate": 2.527777777777778e-05, "loss": 0.3913, "step": 764 }, { "epoch": 1.6381156316916488, "grad_norm": 0.21980493117452848, "learning_rate": 2.523809523809524e-05, "loss": 0.3684, "step": 765 }, { "epoch": 1.640256959314775, "grad_norm": 0.21850689842540688, "learning_rate": 2.5198412698412697e-05, "loss": 0.3727, "step": 766 }, { "epoch": 1.6423982869379015, "grad_norm": 0.21446663216573078, "learning_rate": 2.5158730158730158e-05, "loss": 0.3782, "step": 767 }, { "epoch": 1.6445396145610278, "grad_norm": 0.21778149384624332, "learning_rate": 2.511904761904762e-05, "loss": 0.3683, "step": 768 }, { "epoch": 1.6466809421841542, "grad_norm": 0.20468677302613195, "learning_rate": 2.507936507936508e-05, "loss": 0.3771, "step": 769 }, { "epoch": 1.6488222698072805, "grad_norm": 0.23332787991529938, "learning_rate": 2.503968253968254e-05, "loss": 0.367, "step": 770 }, { "epoch": 1.6509635974304069, "grad_norm": 0.21897816586893398, "learning_rate": 2.5e-05, "loss": 0.3783, "step": 771 }, { "epoch": 1.6531049250535332, "grad_norm": 0.2030029868201131, "learning_rate": 2.4960317460317462e-05, "loss": 0.3745, "step": 772 }, { "epoch": 1.6552462526766596, "grad_norm": 0.21310414937938124, "learning_rate": 2.4920634920634923e-05, "loss": 0.3405, "step": 773 }, { "epoch": 1.657387580299786, "grad_norm": 0.22195298457726914, "learning_rate": 2.4880952380952383e-05, "loss": 0.3874, "step": 774 }, { "epoch": 1.6595289079229123, "grad_norm": 0.20300473974521194, "learning_rate": 2.4841269841269844e-05, "loss": 0.3663, "step": 775 }, { "epoch": 1.6616702355460387, "grad_norm": 0.22067636878573646, "learning_rate": 2.4801587301587305e-05, "loss": 0.3887, "step": 776 }, { "epoch": 1.663811563169165, "grad_norm": 0.21073789338832113, "learning_rate": 2.4761904761904762e-05, "loss": 0.3743, "step": 777 }, { "epoch": 1.6659528907922914, "grad_norm": 0.22786185959131758, "learning_rate": 2.4722222222222223e-05, "loss": 0.3743, "step": 778 }, { "epoch": 1.6680942184154177, "grad_norm": 0.20200585214379246, "learning_rate": 2.4682539682539684e-05, "loss": 0.3725, "step": 779 }, { "epoch": 1.6702355460385439, "grad_norm": 0.21952221945022432, "learning_rate": 2.4642857142857145e-05, "loss": 0.3678, "step": 780 }, { "epoch": 1.6723768736616702, "grad_norm": 0.49934368068454904, "learning_rate": 2.4603174603174602e-05, "loss": 0.3964, "step": 781 }, { "epoch": 1.6745182012847966, "grad_norm": 0.19757194754481436, "learning_rate": 2.4563492063492063e-05, "loss": 0.3763, "step": 782 }, { "epoch": 1.676659528907923, "grad_norm": 0.19767924902990838, "learning_rate": 2.4523809523809523e-05, "loss": 0.3543, "step": 783 }, { "epoch": 1.6788008565310493, "grad_norm": 0.20919876927515002, "learning_rate": 2.4484126984126984e-05, "loss": 0.3735, "step": 784 }, { "epoch": 1.6809421841541756, "grad_norm": 0.22040578372419578, "learning_rate": 2.4444444444444445e-05, "loss": 0.3829, "step": 785 }, { "epoch": 1.683083511777302, "grad_norm": 0.22430773595492015, "learning_rate": 2.4404761904761906e-05, "loss": 0.3656, "step": 786 }, { "epoch": 1.685224839400428, "grad_norm": 0.20259143808355987, "learning_rate": 2.4365079365079366e-05, "loss": 0.3647, "step": 787 }, { "epoch": 1.6873661670235545, "grad_norm": 0.21894252518737073, "learning_rate": 2.4325396825396827e-05, "loss": 0.3601, "step": 788 }, { "epoch": 1.6895074946466808, "grad_norm": 0.21462981545836501, "learning_rate": 2.4285714285714288e-05, "loss": 0.3629, "step": 789 }, { "epoch": 1.6916488222698072, "grad_norm": 0.18869575445100586, "learning_rate": 2.424603174603175e-05, "loss": 0.3323, "step": 790 }, { "epoch": 1.6937901498929335, "grad_norm": 0.20241689753494335, "learning_rate": 2.4206349206349206e-05, "loss": 0.3464, "step": 791 }, { "epoch": 1.6959314775160599, "grad_norm": 0.19953981094595588, "learning_rate": 2.4166666666666667e-05, "loss": 0.3458, "step": 792 }, { "epoch": 1.6980728051391862, "grad_norm": 0.19961349385460256, "learning_rate": 2.4126984126984128e-05, "loss": 0.3527, "step": 793 }, { "epoch": 1.7002141327623126, "grad_norm": 0.21973818696015726, "learning_rate": 2.408730158730159e-05, "loss": 0.3611, "step": 794 }, { "epoch": 1.702355460385439, "grad_norm": 0.1887578663893525, "learning_rate": 2.404761904761905e-05, "loss": 0.3624, "step": 795 }, { "epoch": 1.7044967880085653, "grad_norm": 0.19628057987549807, "learning_rate": 2.400793650793651e-05, "loss": 0.3717, "step": 796 }, { "epoch": 1.7066381156316917, "grad_norm": 0.20905187089247354, "learning_rate": 2.396825396825397e-05, "loss": 0.3797, "step": 797 }, { "epoch": 1.708779443254818, "grad_norm": 0.210161778786647, "learning_rate": 2.392857142857143e-05, "loss": 0.3648, "step": 798 }, { "epoch": 1.7109207708779444, "grad_norm": 0.18782590789097356, "learning_rate": 2.3888888888888892e-05, "loss": 0.3672, "step": 799 }, { "epoch": 1.7130620985010707, "grad_norm": 0.21536059285769518, "learning_rate": 2.3849206349206353e-05, "loss": 0.3556, "step": 800 }, { "epoch": 1.715203426124197, "grad_norm": 0.19872849614371818, "learning_rate": 2.380952380952381e-05, "loss": 0.3606, "step": 801 }, { "epoch": 1.7173447537473234, "grad_norm": 0.2190527880750777, "learning_rate": 2.376984126984127e-05, "loss": 0.3778, "step": 802 }, { "epoch": 1.7194860813704498, "grad_norm": 0.21010529253337107, "learning_rate": 2.373015873015873e-05, "loss": 0.355, "step": 803 }, { "epoch": 1.7216274089935761, "grad_norm": 0.20516661849890908, "learning_rate": 2.369047619047619e-05, "loss": 0.3595, "step": 804 }, { "epoch": 1.7237687366167025, "grad_norm": 0.21809159516111487, "learning_rate": 2.365079365079365e-05, "loss": 0.3774, "step": 805 }, { "epoch": 1.7259100642398288, "grad_norm": 0.21787730326337182, "learning_rate": 2.361111111111111e-05, "loss": 0.3716, "step": 806 }, { "epoch": 1.728051391862955, "grad_norm": 0.20722387140693752, "learning_rate": 2.357142857142857e-05, "loss": 0.3538, "step": 807 }, { "epoch": 1.7301927194860813, "grad_norm": 0.19038300413501802, "learning_rate": 2.3531746031746032e-05, "loss": 0.3487, "step": 808 }, { "epoch": 1.7323340471092077, "grad_norm": 0.21385692991090183, "learning_rate": 2.3492063492063493e-05, "loss": 0.3765, "step": 809 }, { "epoch": 1.734475374732334, "grad_norm": 0.20161843453069309, "learning_rate": 2.3452380952380954e-05, "loss": 0.3593, "step": 810 }, { "epoch": 1.7366167023554604, "grad_norm": 0.1991611902428487, "learning_rate": 2.3412698412698414e-05, "loss": 0.3667, "step": 811 }, { "epoch": 1.7387580299785867, "grad_norm": 0.19751266841983958, "learning_rate": 2.3373015873015875e-05, "loss": 0.3725, "step": 812 }, { "epoch": 1.740899357601713, "grad_norm": 0.18754293892801968, "learning_rate": 2.3333333333333336e-05, "loss": 0.3649, "step": 813 }, { "epoch": 1.7430406852248392, "grad_norm": 0.22386270344186163, "learning_rate": 2.3293650793650797e-05, "loss": 0.3685, "step": 814 }, { "epoch": 1.7451820128479656, "grad_norm": 0.1966840040343867, "learning_rate": 2.3253968253968257e-05, "loss": 0.3842, "step": 815 }, { "epoch": 1.747323340471092, "grad_norm": 0.21657779353698456, "learning_rate": 2.3214285714285715e-05, "loss": 0.3667, "step": 816 }, { "epoch": 1.7494646680942183, "grad_norm": 0.2288614122016469, "learning_rate": 2.3174603174603175e-05, "loss": 0.3843, "step": 817 }, { "epoch": 1.7516059957173447, "grad_norm": 0.1836444137030942, "learning_rate": 2.3134920634920636e-05, "loss": 0.3583, "step": 818 }, { "epoch": 1.753747323340471, "grad_norm": 0.23767773436120954, "learning_rate": 2.3095238095238097e-05, "loss": 0.3863, "step": 819 }, { "epoch": 1.7558886509635974, "grad_norm": 0.2041079485593283, "learning_rate": 2.3055555555555558e-05, "loss": 0.3522, "step": 820 }, { "epoch": 1.7580299785867237, "grad_norm": 0.20093188325290554, "learning_rate": 2.3015873015873015e-05, "loss": 0.3795, "step": 821 }, { "epoch": 1.76017130620985, "grad_norm": 0.19995939365331822, "learning_rate": 2.2976190476190476e-05, "loss": 0.3721, "step": 822 }, { "epoch": 1.7623126338329764, "grad_norm": 0.2166910316666174, "learning_rate": 2.2936507936507937e-05, "loss": 0.3634, "step": 823 }, { "epoch": 1.7644539614561028, "grad_norm": 0.18670680995845637, "learning_rate": 2.2896825396825397e-05, "loss": 0.344, "step": 824 }, { "epoch": 1.7665952890792291, "grad_norm": 0.21173006300417144, "learning_rate": 2.2857142857142858e-05, "loss": 0.356, "step": 825 }, { "epoch": 1.7687366167023555, "grad_norm": 0.21040949951223306, "learning_rate": 2.281746031746032e-05, "loss": 0.3816, "step": 826 }, { "epoch": 1.7708779443254818, "grad_norm": 0.20778505679750192, "learning_rate": 2.277777777777778e-05, "loss": 0.3618, "step": 827 }, { "epoch": 1.7730192719486082, "grad_norm": 0.213799456066487, "learning_rate": 2.273809523809524e-05, "loss": 0.3589, "step": 828 }, { "epoch": 1.7751605995717346, "grad_norm": 0.2043033051385102, "learning_rate": 2.2698412698412698e-05, "loss": 0.366, "step": 829 }, { "epoch": 1.777301927194861, "grad_norm": 0.19754414108874052, "learning_rate": 2.265873015873016e-05, "loss": 0.347, "step": 830 }, { "epoch": 1.7794432548179873, "grad_norm": 0.2158296684757847, "learning_rate": 2.261904761904762e-05, "loss": 0.3915, "step": 831 }, { "epoch": 1.7815845824411136, "grad_norm": 0.22243362581646817, "learning_rate": 2.257936507936508e-05, "loss": 0.3854, "step": 832 }, { "epoch": 1.78372591006424, "grad_norm": 0.21571356659346066, "learning_rate": 2.253968253968254e-05, "loss": 0.3409, "step": 833 }, { "epoch": 1.7858672376873663, "grad_norm": 0.22447191028202343, "learning_rate": 2.25e-05, "loss": 0.3749, "step": 834 }, { "epoch": 1.7880085653104925, "grad_norm": 0.215504263290714, "learning_rate": 2.2460317460317462e-05, "loss": 0.3586, "step": 835 }, { "epoch": 1.7901498929336188, "grad_norm": 0.20688541961124565, "learning_rate": 2.2420634920634923e-05, "loss": 0.3723, "step": 836 }, { "epoch": 1.7922912205567452, "grad_norm": 0.20199140517623562, "learning_rate": 2.2380952380952384e-05, "loss": 0.371, "step": 837 }, { "epoch": 1.7944325481798715, "grad_norm": 0.22478296242958862, "learning_rate": 2.2341269841269844e-05, "loss": 0.3634, "step": 838 }, { "epoch": 1.7965738758029979, "grad_norm": 0.19888513887055395, "learning_rate": 2.2301587301587305e-05, "loss": 0.3733, "step": 839 }, { "epoch": 1.7987152034261242, "grad_norm": 0.19421047575082181, "learning_rate": 2.2261904761904763e-05, "loss": 0.3597, "step": 840 }, { "epoch": 1.8008565310492506, "grad_norm": 0.20475219316843102, "learning_rate": 2.2222222222222223e-05, "loss": 0.3708, "step": 841 }, { "epoch": 1.8029978586723767, "grad_norm": 0.2114777102870167, "learning_rate": 2.2182539682539684e-05, "loss": 0.3677, "step": 842 }, { "epoch": 1.805139186295503, "grad_norm": 0.19598571633723222, "learning_rate": 2.214285714285714e-05, "loss": 0.3661, "step": 843 }, { "epoch": 1.8072805139186294, "grad_norm": 0.20252036775324378, "learning_rate": 2.2103174603174602e-05, "loss": 0.3645, "step": 844 }, { "epoch": 1.8094218415417558, "grad_norm": 0.20511403057320743, "learning_rate": 2.2063492063492063e-05, "loss": 0.3633, "step": 845 }, { "epoch": 1.8115631691648821, "grad_norm": 0.20558945002253082, "learning_rate": 2.2023809523809524e-05, "loss": 0.3752, "step": 846 }, { "epoch": 1.8137044967880085, "grad_norm": 0.21428205634679462, "learning_rate": 2.1984126984126984e-05, "loss": 0.3683, "step": 847 }, { "epoch": 1.8158458244111348, "grad_norm": 0.23331079602832885, "learning_rate": 2.1944444444444445e-05, "loss": 0.3671, "step": 848 }, { "epoch": 1.8179871520342612, "grad_norm": 0.21486622134643, "learning_rate": 2.1904761904761906e-05, "loss": 0.3652, "step": 849 }, { "epoch": 1.8201284796573876, "grad_norm": 0.19969930097292724, "learning_rate": 2.1865079365079367e-05, "loss": 0.3689, "step": 850 }, { "epoch": 1.822269807280514, "grad_norm": 0.21534324934924104, "learning_rate": 2.1825396825396827e-05, "loss": 0.3774, "step": 851 }, { "epoch": 1.8244111349036403, "grad_norm": 0.21397843196592883, "learning_rate": 2.1785714285714288e-05, "loss": 0.3694, "step": 852 }, { "epoch": 1.8265524625267666, "grad_norm": 0.23443278557392533, "learning_rate": 2.174603174603175e-05, "loss": 0.3699, "step": 853 }, { "epoch": 1.828693790149893, "grad_norm": 0.19730634328807958, "learning_rate": 2.170634920634921e-05, "loss": 0.3478, "step": 854 }, { "epoch": 1.8308351177730193, "grad_norm": 0.21913953376852294, "learning_rate": 2.1666666666666667e-05, "loss": 0.3567, "step": 855 }, { "epoch": 1.8329764453961457, "grad_norm": 0.20501578279293847, "learning_rate": 2.1626984126984128e-05, "loss": 0.3497, "step": 856 }, { "epoch": 1.835117773019272, "grad_norm": 0.20935735217003343, "learning_rate": 2.158730158730159e-05, "loss": 0.3551, "step": 857 }, { "epoch": 1.8372591006423984, "grad_norm": 0.19034194770904342, "learning_rate": 2.154761904761905e-05, "loss": 0.3628, "step": 858 }, { "epoch": 1.8394004282655247, "grad_norm": 0.18870437952631242, "learning_rate": 2.150793650793651e-05, "loss": 0.3513, "step": 859 }, { "epoch": 1.841541755888651, "grad_norm": 0.1919743307796265, "learning_rate": 2.1468253968253967e-05, "loss": 0.3732, "step": 860 }, { "epoch": 1.8436830835117775, "grad_norm": 0.20417690639096803, "learning_rate": 2.1428571428571428e-05, "loss": 0.3554, "step": 861 }, { "epoch": 1.8458244111349036, "grad_norm": 0.21048046032387568, "learning_rate": 2.138888888888889e-05, "loss": 0.3632, "step": 862 }, { "epoch": 1.84796573875803, "grad_norm": 0.19340209108489914, "learning_rate": 2.134920634920635e-05, "loss": 0.3691, "step": 863 }, { "epoch": 1.8501070663811563, "grad_norm": 0.2147750143951101, "learning_rate": 2.130952380952381e-05, "loss": 0.3691, "step": 864 }, { "epoch": 1.8522483940042827, "grad_norm": 0.19393590878356723, "learning_rate": 2.126984126984127e-05, "loss": 0.3531, "step": 865 }, { "epoch": 1.854389721627409, "grad_norm": 0.18438206629844184, "learning_rate": 2.1230158730158732e-05, "loss": 0.3529, "step": 866 }, { "epoch": 1.8565310492505354, "grad_norm": 0.23087737411403764, "learning_rate": 2.1190476190476193e-05, "loss": 0.369, "step": 867 }, { "epoch": 1.8586723768736617, "grad_norm": 0.20076795586544216, "learning_rate": 2.115079365079365e-05, "loss": 0.3523, "step": 868 }, { "epoch": 1.8608137044967878, "grad_norm": 0.20026389882619192, "learning_rate": 2.111111111111111e-05, "loss": 0.3623, "step": 869 }, { "epoch": 1.8629550321199142, "grad_norm": 0.2187940555605095, "learning_rate": 2.107142857142857e-05, "loss": 0.3579, "step": 870 }, { "epoch": 1.8650963597430406, "grad_norm": 0.22989363140509406, "learning_rate": 2.1031746031746032e-05, "loss": 0.3537, "step": 871 }, { "epoch": 1.867237687366167, "grad_norm": 0.187165767075903, "learning_rate": 2.0992063492063493e-05, "loss": 0.3734, "step": 872 }, { "epoch": 1.8693790149892933, "grad_norm": 0.2107266950762665, "learning_rate": 2.0952380952380954e-05, "loss": 0.3672, "step": 873 }, { "epoch": 1.8715203426124196, "grad_norm": 0.20955760948663935, "learning_rate": 2.0912698412698415e-05, "loss": 0.3584, "step": 874 }, { "epoch": 1.873661670235546, "grad_norm": 0.20510030528233222, "learning_rate": 2.0873015873015875e-05, "loss": 0.3564, "step": 875 }, { "epoch": 1.8758029978586723, "grad_norm": 0.22155578745771093, "learning_rate": 2.0833333333333336e-05, "loss": 0.3543, "step": 876 }, { "epoch": 1.8779443254817987, "grad_norm": 0.21075973711020585, "learning_rate": 2.0793650793650797e-05, "loss": 0.344, "step": 877 }, { "epoch": 1.880085653104925, "grad_norm": 0.21005937923832468, "learning_rate": 2.0753968253968258e-05, "loss": 0.3672, "step": 878 }, { "epoch": 1.8822269807280514, "grad_norm": 0.20105451656897086, "learning_rate": 2.0714285714285718e-05, "loss": 0.3672, "step": 879 }, { "epoch": 1.8843683083511777, "grad_norm": 0.245253064751467, "learning_rate": 2.0674603174603176e-05, "loss": 0.3551, "step": 880 }, { "epoch": 1.886509635974304, "grad_norm": 0.21136163316257608, "learning_rate": 2.0634920634920636e-05, "loss": 0.363, "step": 881 }, { "epoch": 1.8886509635974305, "grad_norm": 0.20602573266640592, "learning_rate": 2.0595238095238094e-05, "loss": 0.3537, "step": 882 }, { "epoch": 1.8907922912205568, "grad_norm": 0.20197617146682256, "learning_rate": 2.0555555555555555e-05, "loss": 0.3525, "step": 883 }, { "epoch": 1.8929336188436832, "grad_norm": 0.20854372113696887, "learning_rate": 2.0515873015873015e-05, "loss": 0.3514, "step": 884 }, { "epoch": 1.8950749464668095, "grad_norm": 0.4220973103981105, "learning_rate": 2.0476190476190476e-05, "loss": 0.3666, "step": 885 }, { "epoch": 1.8972162740899359, "grad_norm": 0.19662405811862105, "learning_rate": 2.0436507936507937e-05, "loss": 0.3557, "step": 886 }, { "epoch": 1.8993576017130622, "grad_norm": 0.21424028404578552, "learning_rate": 2.0396825396825398e-05, "loss": 0.3705, "step": 887 }, { "epoch": 1.9014989293361886, "grad_norm": 0.20444704514356857, "learning_rate": 2.0357142857142858e-05, "loss": 0.3671, "step": 888 }, { "epoch": 1.903640256959315, "grad_norm": 0.209872146829905, "learning_rate": 2.031746031746032e-05, "loss": 0.3675, "step": 889 }, { "epoch": 1.905781584582441, "grad_norm": 0.2128263431128925, "learning_rate": 2.027777777777778e-05, "loss": 0.3578, "step": 890 }, { "epoch": 1.9079229122055674, "grad_norm": 0.21350097793924863, "learning_rate": 2.023809523809524e-05, "loss": 0.3579, "step": 891 }, { "epoch": 1.9100642398286938, "grad_norm": 0.19676311955725848, "learning_rate": 2.01984126984127e-05, "loss": 0.3453, "step": 892 }, { "epoch": 1.9122055674518201, "grad_norm": 0.18886161439157978, "learning_rate": 2.015873015873016e-05, "loss": 0.3576, "step": 893 }, { "epoch": 1.9143468950749465, "grad_norm": 0.19774803879409486, "learning_rate": 2.011904761904762e-05, "loss": 0.3534, "step": 894 }, { "epoch": 1.9164882226980728, "grad_norm": 0.23099750551614034, "learning_rate": 2.007936507936508e-05, "loss": 0.3699, "step": 895 }, { "epoch": 1.9186295503211992, "grad_norm": 0.2045355403156652, "learning_rate": 2.003968253968254e-05, "loss": 0.359, "step": 896 }, { "epoch": 1.9207708779443253, "grad_norm": 0.1971397275342457, "learning_rate": 2e-05, "loss": 0.3635, "step": 897 }, { "epoch": 1.9229122055674517, "grad_norm": 0.21049030917982176, "learning_rate": 1.9960317460317462e-05, "loss": 0.3745, "step": 898 }, { "epoch": 1.925053533190578, "grad_norm": 0.222721408427582, "learning_rate": 1.992063492063492e-05, "loss": 0.3848, "step": 899 }, { "epoch": 1.9271948608137044, "grad_norm": 0.21296826592149146, "learning_rate": 1.988095238095238e-05, "loss": 0.3845, "step": 900 }, { "epoch": 1.9293361884368307, "grad_norm": 0.19916011701018338, "learning_rate": 1.984126984126984e-05, "loss": 0.3653, "step": 901 }, { "epoch": 1.931477516059957, "grad_norm": 0.2010065237557244, "learning_rate": 1.9801587301587302e-05, "loss": 0.346, "step": 902 }, { "epoch": 1.9336188436830835, "grad_norm": 0.22109341655275905, "learning_rate": 1.9761904761904763e-05, "loss": 0.3811, "step": 903 }, { "epoch": 1.9357601713062098, "grad_norm": 0.21704603464265765, "learning_rate": 1.9722222222222224e-05, "loss": 0.3697, "step": 904 }, { "epoch": 1.9379014989293362, "grad_norm": 0.19589016044717736, "learning_rate": 1.9682539682539684e-05, "loss": 0.3561, "step": 905 }, { "epoch": 1.9400428265524625, "grad_norm": 0.1844953806039601, "learning_rate": 1.9642857142857145e-05, "loss": 0.355, "step": 906 }, { "epoch": 1.9421841541755889, "grad_norm": 0.2450452225863573, "learning_rate": 1.9603174603174602e-05, "loss": 0.34, "step": 907 }, { "epoch": 1.9443254817987152, "grad_norm": 0.1985635379433612, "learning_rate": 1.9563492063492063e-05, "loss": 0.3724, "step": 908 }, { "epoch": 1.9464668094218416, "grad_norm": 0.20873253034957434, "learning_rate": 1.9523809523809524e-05, "loss": 0.3693, "step": 909 }, { "epoch": 1.948608137044968, "grad_norm": 0.22144116581790702, "learning_rate": 1.9484126984126985e-05, "loss": 0.3729, "step": 910 }, { "epoch": 1.9507494646680943, "grad_norm": 0.23269442568625717, "learning_rate": 1.9444444444444445e-05, "loss": 0.3807, "step": 911 }, { "epoch": 1.9528907922912206, "grad_norm": 0.21675592494303747, "learning_rate": 1.9404761904761906e-05, "loss": 0.3533, "step": 912 }, { "epoch": 1.955032119914347, "grad_norm": 0.2197742949597075, "learning_rate": 1.9365079365079367e-05, "loss": 0.3607, "step": 913 }, { "epoch": 1.9571734475374734, "grad_norm": 0.21883406348238188, "learning_rate": 1.9325396825396828e-05, "loss": 0.3635, "step": 914 }, { "epoch": 1.9593147751605997, "grad_norm": 0.2026715665088903, "learning_rate": 1.928571428571429e-05, "loss": 0.358, "step": 915 }, { "epoch": 1.961456102783726, "grad_norm": 0.1974253749656061, "learning_rate": 1.924603174603175e-05, "loss": 0.3474, "step": 916 }, { "epoch": 1.9635974304068522, "grad_norm": 0.19294611529574526, "learning_rate": 1.920634920634921e-05, "loss": 0.3625, "step": 917 }, { "epoch": 1.9657387580299786, "grad_norm": 0.22729056002121276, "learning_rate": 1.9166666666666667e-05, "loss": 0.3548, "step": 918 }, { "epoch": 1.967880085653105, "grad_norm": 0.20766953098408916, "learning_rate": 1.9126984126984128e-05, "loss": 0.3599, "step": 919 }, { "epoch": 1.9700214132762313, "grad_norm": 0.23817301013936817, "learning_rate": 1.9087301587301585e-05, "loss": 0.3787, "step": 920 }, { "epoch": 1.9721627408993576, "grad_norm": 0.21184335304060803, "learning_rate": 1.9047619047619046e-05, "loss": 0.356, "step": 921 }, { "epoch": 1.974304068522484, "grad_norm": 0.19810583708611373, "learning_rate": 1.9007936507936507e-05, "loss": 0.373, "step": 922 }, { "epoch": 1.9764453961456103, "grad_norm": 0.217549480144923, "learning_rate": 1.8968253968253968e-05, "loss": 0.3611, "step": 923 }, { "epoch": 1.9785867237687365, "grad_norm": 0.2376544391246724, "learning_rate": 1.892857142857143e-05, "loss": 0.3867, "step": 924 }, { "epoch": 1.9807280513918628, "grad_norm": 0.19811936382123432, "learning_rate": 1.888888888888889e-05, "loss": 0.3706, "step": 925 }, { "epoch": 1.9828693790149892, "grad_norm": 0.196442826279356, "learning_rate": 1.884920634920635e-05, "loss": 0.3766, "step": 926 }, { "epoch": 1.9850107066381155, "grad_norm": 0.21591468021679908, "learning_rate": 1.880952380952381e-05, "loss": 0.3573, "step": 927 }, { "epoch": 1.9871520342612419, "grad_norm": 0.20004992781857317, "learning_rate": 1.876984126984127e-05, "loss": 0.3573, "step": 928 }, { "epoch": 1.9892933618843682, "grad_norm": 0.18155989142218865, "learning_rate": 1.8730158730158732e-05, "loss": 0.3508, "step": 929 }, { "epoch": 1.9914346895074946, "grad_norm": 0.19220975795928547, "learning_rate": 1.8690476190476193e-05, "loss": 0.3665, "step": 930 }, { "epoch": 1.993576017130621, "grad_norm": 0.20680633105790955, "learning_rate": 1.8650793650793654e-05, "loss": 0.3709, "step": 931 }, { "epoch": 1.9957173447537473, "grad_norm": 0.19718351617490748, "learning_rate": 1.861111111111111e-05, "loss": 0.3674, "step": 932 }, { "epoch": 1.9978586723768736, "grad_norm": 0.20385992307338152, "learning_rate": 1.8571428571428572e-05, "loss": 0.3647, "step": 933 }, { "epoch": 2.0, "grad_norm": 0.19952173716760616, "learning_rate": 1.8531746031746032e-05, "loss": 0.3498, "step": 934 }, { "epoch": 2.0021413276231264, "grad_norm": 0.2887390533219721, "learning_rate": 1.8492063492063493e-05, "loss": 0.2978, "step": 935 }, { "epoch": 2.0042826552462527, "grad_norm": 0.2261310029497388, "learning_rate": 1.8452380952380954e-05, "loss": 0.2753, "step": 936 }, { "epoch": 2.006423982869379, "grad_norm": 0.19608695195304876, "learning_rate": 1.8412698412698415e-05, "loss": 0.2894, "step": 937 }, { "epoch": 2.0085653104925054, "grad_norm": 0.2670758274924303, "learning_rate": 1.8373015873015875e-05, "loss": 0.2855, "step": 938 }, { "epoch": 2.0107066381156318, "grad_norm": 0.2729391112548352, "learning_rate": 1.8333333333333333e-05, "loss": 0.2766, "step": 939 }, { "epoch": 2.012847965738758, "grad_norm": 0.2268053076448433, "learning_rate": 1.8293650793650794e-05, "loss": 0.2825, "step": 940 }, { "epoch": 2.0149892933618845, "grad_norm": 0.22891152322973052, "learning_rate": 1.8253968253968254e-05, "loss": 0.2704, "step": 941 }, { "epoch": 2.017130620985011, "grad_norm": 0.24951718116860663, "learning_rate": 1.8214285714285715e-05, "loss": 0.2925, "step": 942 }, { "epoch": 2.019271948608137, "grad_norm": 0.24317478809263185, "learning_rate": 1.8174603174603176e-05, "loss": 0.2852, "step": 943 }, { "epoch": 2.0214132762312635, "grad_norm": 0.22035333150669048, "learning_rate": 1.8134920634920637e-05, "loss": 0.28, "step": 944 }, { "epoch": 2.02355460385439, "grad_norm": 0.22762363253514034, "learning_rate": 1.8095238095238094e-05, "loss": 0.2894, "step": 945 }, { "epoch": 2.0256959314775163, "grad_norm": 0.22788105608795867, "learning_rate": 1.8055555555555555e-05, "loss": 0.2743, "step": 946 }, { "epoch": 2.0278372591006426, "grad_norm": 0.20269423040955745, "learning_rate": 1.8015873015873015e-05, "loss": 0.2663, "step": 947 }, { "epoch": 2.0299785867237685, "grad_norm": 0.22523427656669212, "learning_rate": 1.7976190476190476e-05, "loss": 0.2781, "step": 948 }, { "epoch": 2.032119914346895, "grad_norm": 0.21588221204986352, "learning_rate": 1.7936507936507937e-05, "loss": 0.2788, "step": 949 }, { "epoch": 2.0342612419700212, "grad_norm": 0.195728356650988, "learning_rate": 1.7896825396825398e-05, "loss": 0.2769, "step": 950 }, { "epoch": 2.0364025695931476, "grad_norm": 0.2161839738558612, "learning_rate": 1.785714285714286e-05, "loss": 0.2941, "step": 951 }, { "epoch": 2.038543897216274, "grad_norm": 0.23767256005231727, "learning_rate": 1.781746031746032e-05, "loss": 0.2895, "step": 952 }, { "epoch": 2.0406852248394003, "grad_norm": 0.20171938363284947, "learning_rate": 1.777777777777778e-05, "loss": 0.2781, "step": 953 }, { "epoch": 2.0428265524625266, "grad_norm": 0.1856341413030586, "learning_rate": 1.773809523809524e-05, "loss": 0.2685, "step": 954 }, { "epoch": 2.044967880085653, "grad_norm": 0.2145432452323137, "learning_rate": 1.76984126984127e-05, "loss": 0.281, "step": 955 }, { "epoch": 2.0471092077087794, "grad_norm": 0.2112321163664146, "learning_rate": 1.7658730158730162e-05, "loss": 0.2977, "step": 956 }, { "epoch": 2.0492505353319057, "grad_norm": 0.2067934311888714, "learning_rate": 1.761904761904762e-05, "loss": 0.2793, "step": 957 }, { "epoch": 2.051391862955032, "grad_norm": 0.21063958513546374, "learning_rate": 1.757936507936508e-05, "loss": 0.2807, "step": 958 }, { "epoch": 2.0535331905781584, "grad_norm": 0.19678228876621853, "learning_rate": 1.7539682539682538e-05, "loss": 0.2773, "step": 959 }, { "epoch": 2.0556745182012848, "grad_norm": 0.1926373898265207, "learning_rate": 1.75e-05, "loss": 0.2747, "step": 960 }, { "epoch": 2.057815845824411, "grad_norm": 0.20704097224369902, "learning_rate": 1.746031746031746e-05, "loss": 0.2722, "step": 961 }, { "epoch": 2.0599571734475375, "grad_norm": 0.19808242466728723, "learning_rate": 1.742063492063492e-05, "loss": 0.271, "step": 962 }, { "epoch": 2.062098501070664, "grad_norm": 0.19665606509062533, "learning_rate": 1.738095238095238e-05, "loss": 0.2769, "step": 963 }, { "epoch": 2.06423982869379, "grad_norm": 0.20159555272227964, "learning_rate": 1.734126984126984e-05, "loss": 0.2754, "step": 964 }, { "epoch": 2.0663811563169165, "grad_norm": 0.20715083834741455, "learning_rate": 1.7301587301587302e-05, "loss": 0.2819, "step": 965 }, { "epoch": 2.068522483940043, "grad_norm": 0.1994028177934581, "learning_rate": 1.7261904761904763e-05, "loss": 0.2737, "step": 966 }, { "epoch": 2.0706638115631693, "grad_norm": 0.21226238723876503, "learning_rate": 1.7222222222222224e-05, "loss": 0.2755, "step": 967 }, { "epoch": 2.0728051391862956, "grad_norm": 0.21621507021709063, "learning_rate": 1.7182539682539684e-05, "loss": 0.2871, "step": 968 }, { "epoch": 2.074946466809422, "grad_norm": 0.2054891028222431, "learning_rate": 1.7142857142857145e-05, "loss": 0.281, "step": 969 }, { "epoch": 2.0770877944325483, "grad_norm": 0.20802135509652517, "learning_rate": 1.7103174603174606e-05, "loss": 0.2706, "step": 970 }, { "epoch": 2.0792291220556747, "grad_norm": 0.19737703811731966, "learning_rate": 1.7063492063492063e-05, "loss": 0.2825, "step": 971 }, { "epoch": 2.081370449678801, "grad_norm": 0.18858696479602974, "learning_rate": 1.7023809523809524e-05, "loss": 0.2631, "step": 972 }, { "epoch": 2.0835117773019274, "grad_norm": 0.19571849751196396, "learning_rate": 1.6984126984126985e-05, "loss": 0.2878, "step": 973 }, { "epoch": 2.0856531049250537, "grad_norm": 0.19389617623346578, "learning_rate": 1.6944444444444446e-05, "loss": 0.2821, "step": 974 }, { "epoch": 2.08779443254818, "grad_norm": 0.19768747519412655, "learning_rate": 1.6904761904761906e-05, "loss": 0.2836, "step": 975 }, { "epoch": 2.089935760171306, "grad_norm": 0.19370599491498322, "learning_rate": 1.6865079365079367e-05, "loss": 0.2712, "step": 976 }, { "epoch": 2.0920770877944324, "grad_norm": 0.20240036491722738, "learning_rate": 1.6825396825396828e-05, "loss": 0.287, "step": 977 }, { "epoch": 2.0942184154175587, "grad_norm": 0.2036262743697464, "learning_rate": 1.6785714285714285e-05, "loss": 0.297, "step": 978 }, { "epoch": 2.096359743040685, "grad_norm": 0.20250473360666518, "learning_rate": 1.6746031746031746e-05, "loss": 0.2777, "step": 979 }, { "epoch": 2.0985010706638114, "grad_norm": 0.20192268740605415, "learning_rate": 1.6706349206349207e-05, "loss": 0.2864, "step": 980 }, { "epoch": 2.1006423982869378, "grad_norm": 0.19075170756571364, "learning_rate": 1.6666666666666667e-05, "loss": 0.2701, "step": 981 }, { "epoch": 2.102783725910064, "grad_norm": 0.19697486168056122, "learning_rate": 1.6626984126984128e-05, "loss": 0.2811, "step": 982 }, { "epoch": 2.1049250535331905, "grad_norm": 0.19096670479918926, "learning_rate": 1.658730158730159e-05, "loss": 0.2769, "step": 983 }, { "epoch": 2.107066381156317, "grad_norm": 0.19574292129874354, "learning_rate": 1.6547619047619046e-05, "loss": 0.2701, "step": 984 }, { "epoch": 2.109207708779443, "grad_norm": 0.21404348512022472, "learning_rate": 1.6507936507936507e-05, "loss": 0.2849, "step": 985 }, { "epoch": 2.1113490364025695, "grad_norm": 0.19567445704922734, "learning_rate": 1.6468253968253968e-05, "loss": 0.2753, "step": 986 }, { "epoch": 2.113490364025696, "grad_norm": 0.2069043774274054, "learning_rate": 1.642857142857143e-05, "loss": 0.2776, "step": 987 }, { "epoch": 2.1156316916488223, "grad_norm": 0.22126156756781362, "learning_rate": 1.638888888888889e-05, "loss": 0.2979, "step": 988 }, { "epoch": 2.1177730192719486, "grad_norm": 0.21047914160745446, "learning_rate": 1.634920634920635e-05, "loss": 0.2759, "step": 989 }, { "epoch": 2.119914346895075, "grad_norm": 0.1866961799813759, "learning_rate": 1.630952380952381e-05, "loss": 0.2734, "step": 990 }, { "epoch": 2.1220556745182013, "grad_norm": 0.19697846429602314, "learning_rate": 1.626984126984127e-05, "loss": 0.2932, "step": 991 }, { "epoch": 2.1241970021413277, "grad_norm": 0.1907591280617684, "learning_rate": 1.6230158730158732e-05, "loss": 0.2743, "step": 992 }, { "epoch": 2.126338329764454, "grad_norm": 0.18785957871532322, "learning_rate": 1.6190476190476193e-05, "loss": 0.2797, "step": 993 }, { "epoch": 2.1284796573875804, "grad_norm": 0.1881059477765473, "learning_rate": 1.6150793650793654e-05, "loss": 0.2795, "step": 994 }, { "epoch": 2.1306209850107067, "grad_norm": 0.1895829464393248, "learning_rate": 1.6111111111111115e-05, "loss": 0.2699, "step": 995 }, { "epoch": 2.132762312633833, "grad_norm": 0.1894894322187353, "learning_rate": 1.6071428571428572e-05, "loss": 0.2839, "step": 996 }, { "epoch": 2.1349036402569594, "grad_norm": 0.20942109541121104, "learning_rate": 1.6031746031746033e-05, "loss": 0.276, "step": 997 }, { "epoch": 2.137044967880086, "grad_norm": 0.1929206423206803, "learning_rate": 1.599206349206349e-05, "loss": 0.2781, "step": 998 }, { "epoch": 2.139186295503212, "grad_norm": 0.20059791418349765, "learning_rate": 1.595238095238095e-05, "loss": 0.2731, "step": 999 }, { "epoch": 2.1413276231263385, "grad_norm": 0.19734665993628578, "learning_rate": 1.591269841269841e-05, "loss": 0.2841, "step": 1000 }, { "epoch": 2.143468950749465, "grad_norm": 0.19634335711712297, "learning_rate": 1.5873015873015872e-05, "loss": 0.2861, "step": 1001 }, { "epoch": 2.145610278372591, "grad_norm": 0.18641688183989846, "learning_rate": 1.5833333333333333e-05, "loss": 0.2699, "step": 1002 }, { "epoch": 2.147751605995717, "grad_norm": 0.19839241096997418, "learning_rate": 1.5793650793650794e-05, "loss": 0.2746, "step": 1003 }, { "epoch": 2.1498929336188435, "grad_norm": 0.1903736062839147, "learning_rate": 1.5753968253968255e-05, "loss": 0.2649, "step": 1004 }, { "epoch": 2.15203426124197, "grad_norm": 0.19577687838702312, "learning_rate": 1.5714285714285715e-05, "loss": 0.2885, "step": 1005 }, { "epoch": 2.154175588865096, "grad_norm": 0.20084961494956982, "learning_rate": 1.5674603174603176e-05, "loss": 0.277, "step": 1006 }, { "epoch": 2.1563169164882225, "grad_norm": 0.19044578659203962, "learning_rate": 1.5634920634920637e-05, "loss": 0.2759, "step": 1007 }, { "epoch": 2.158458244111349, "grad_norm": 0.20168687457620502, "learning_rate": 1.5595238095238098e-05, "loss": 0.2883, "step": 1008 }, { "epoch": 2.1605995717344753, "grad_norm": 0.1917192379674667, "learning_rate": 1.5555555555555555e-05, "loss": 0.2822, "step": 1009 }, { "epoch": 2.1627408993576016, "grad_norm": 0.21720443733726488, "learning_rate": 1.5515873015873016e-05, "loss": 0.287, "step": 1010 }, { "epoch": 2.164882226980728, "grad_norm": 0.19059089571719792, "learning_rate": 1.5476190476190476e-05, "loss": 0.2827, "step": 1011 }, { "epoch": 2.1670235546038543, "grad_norm": 0.20266248908092088, "learning_rate": 1.5436507936507937e-05, "loss": 0.2944, "step": 1012 }, { "epoch": 2.1691648822269807, "grad_norm": 0.20424509394375523, "learning_rate": 1.5396825396825398e-05, "loss": 0.2736, "step": 1013 }, { "epoch": 2.171306209850107, "grad_norm": 0.19086220079548602, "learning_rate": 1.535714285714286e-05, "loss": 0.2772, "step": 1014 }, { "epoch": 2.1734475374732334, "grad_norm": 0.19412331734644836, "learning_rate": 1.531746031746032e-05, "loss": 0.2748, "step": 1015 }, { "epoch": 2.1755888650963597, "grad_norm": 0.20630378133521415, "learning_rate": 1.527777777777778e-05, "loss": 0.2938, "step": 1016 }, { "epoch": 2.177730192719486, "grad_norm": 0.18492977798415206, "learning_rate": 1.5238095238095241e-05, "loss": 0.2745, "step": 1017 }, { "epoch": 2.1798715203426124, "grad_norm": 0.2039798887458119, "learning_rate": 1.5198412698412698e-05, "loss": 0.2755, "step": 1018 }, { "epoch": 2.182012847965739, "grad_norm": 0.20121459942322972, "learning_rate": 1.5158730158730159e-05, "loss": 0.2862, "step": 1019 }, { "epoch": 2.184154175588865, "grad_norm": 0.1827134805798544, "learning_rate": 1.5119047619047618e-05, "loss": 0.2718, "step": 1020 }, { "epoch": 2.1862955032119915, "grad_norm": 0.17917411734145225, "learning_rate": 1.5079365079365079e-05, "loss": 0.2695, "step": 1021 }, { "epoch": 2.188436830835118, "grad_norm": 0.18767988072287312, "learning_rate": 1.503968253968254e-05, "loss": 0.2684, "step": 1022 }, { "epoch": 2.190578158458244, "grad_norm": 0.1942652275305181, "learning_rate": 1.5e-05, "loss": 0.2803, "step": 1023 }, { "epoch": 2.1927194860813706, "grad_norm": 0.20926677447772102, "learning_rate": 1.4960317460317461e-05, "loss": 0.2843, "step": 1024 }, { "epoch": 2.194860813704497, "grad_norm": 0.1991623873969263, "learning_rate": 1.4920634920634922e-05, "loss": 0.2874, "step": 1025 }, { "epoch": 2.1970021413276233, "grad_norm": 0.18616156346828455, "learning_rate": 1.4880952380952381e-05, "loss": 0.2836, "step": 1026 }, { "epoch": 2.1991434689507496, "grad_norm": 0.2001544304384233, "learning_rate": 1.4841269841269842e-05, "loss": 0.292, "step": 1027 }, { "epoch": 2.201284796573876, "grad_norm": 0.19807876601861757, "learning_rate": 1.4801587301587302e-05, "loss": 0.2815, "step": 1028 }, { "epoch": 2.2034261241970023, "grad_norm": 0.18403372703681253, "learning_rate": 1.4761904761904763e-05, "loss": 0.2733, "step": 1029 }, { "epoch": 2.2055674518201283, "grad_norm": 0.1886899672546482, "learning_rate": 1.4722222222222224e-05, "loss": 0.2664, "step": 1030 }, { "epoch": 2.207708779443255, "grad_norm": 0.19490843580765388, "learning_rate": 1.4682539682539683e-05, "loss": 0.2899, "step": 1031 }, { "epoch": 2.209850107066381, "grad_norm": 0.18904649297785464, "learning_rate": 1.4642857142857144e-05, "loss": 0.2805, "step": 1032 }, { "epoch": 2.2119914346895073, "grad_norm": 0.18795150882002612, "learning_rate": 1.4603174603174605e-05, "loss": 0.2855, "step": 1033 }, { "epoch": 2.2141327623126337, "grad_norm": 0.19796365275830283, "learning_rate": 1.4563492063492065e-05, "loss": 0.2818, "step": 1034 }, { "epoch": 2.21627408993576, "grad_norm": 0.18219404726440058, "learning_rate": 1.4523809523809526e-05, "loss": 0.287, "step": 1035 }, { "epoch": 2.2184154175588864, "grad_norm": 0.19237435500214886, "learning_rate": 1.4484126984126987e-05, "loss": 0.2821, "step": 1036 }, { "epoch": 2.2205567451820127, "grad_norm": 0.21247677887677272, "learning_rate": 1.4444444444444444e-05, "loss": 0.2806, "step": 1037 }, { "epoch": 2.222698072805139, "grad_norm": 0.19613559228915892, "learning_rate": 1.4404761904761905e-05, "loss": 0.2705, "step": 1038 }, { "epoch": 2.2248394004282654, "grad_norm": 0.20206602453648073, "learning_rate": 1.4365079365079364e-05, "loss": 0.2936, "step": 1039 }, { "epoch": 2.226980728051392, "grad_norm": 0.21986971551175186, "learning_rate": 1.4325396825396825e-05, "loss": 0.281, "step": 1040 }, { "epoch": 2.229122055674518, "grad_norm": 0.19544790141801302, "learning_rate": 1.4285714285714285e-05, "loss": 0.2772, "step": 1041 }, { "epoch": 2.2312633832976445, "grad_norm": 0.187999178943079, "learning_rate": 1.4246031746031746e-05, "loss": 0.2712, "step": 1042 }, { "epoch": 2.233404710920771, "grad_norm": 0.20155673411774347, "learning_rate": 1.4206349206349207e-05, "loss": 0.2796, "step": 1043 }, { "epoch": 2.235546038543897, "grad_norm": 0.24201763616546484, "learning_rate": 1.4166666666666668e-05, "loss": 0.2821, "step": 1044 }, { "epoch": 2.2376873661670236, "grad_norm": 0.20108574451228317, "learning_rate": 1.4126984126984127e-05, "loss": 0.2886, "step": 1045 }, { "epoch": 2.23982869379015, "grad_norm": 0.19434223451617216, "learning_rate": 1.4087301587301587e-05, "loss": 0.2814, "step": 1046 }, { "epoch": 2.2419700214132763, "grad_norm": 0.19496558913111342, "learning_rate": 1.4047619047619048e-05, "loss": 0.2809, "step": 1047 }, { "epoch": 2.2441113490364026, "grad_norm": 0.20072045682891357, "learning_rate": 1.4007936507936509e-05, "loss": 0.2791, "step": 1048 }, { "epoch": 2.246252676659529, "grad_norm": 0.19738112557289852, "learning_rate": 1.396825396825397e-05, "loss": 0.2756, "step": 1049 }, { "epoch": 2.2483940042826553, "grad_norm": 0.18581301381293844, "learning_rate": 1.392857142857143e-05, "loss": 0.2696, "step": 1050 }, { "epoch": 2.2505353319057817, "grad_norm": 0.19262455905174403, "learning_rate": 1.388888888888889e-05, "loss": 0.2758, "step": 1051 }, { "epoch": 2.252676659528908, "grad_norm": 0.2044076909216322, "learning_rate": 1.384920634920635e-05, "loss": 0.2759, "step": 1052 }, { "epoch": 2.2548179871520344, "grad_norm": 0.20744550401581244, "learning_rate": 1.3809523809523811e-05, "loss": 0.2952, "step": 1053 }, { "epoch": 2.2569593147751608, "grad_norm": 0.19946431730972056, "learning_rate": 1.3769841269841272e-05, "loss": 0.2776, "step": 1054 }, { "epoch": 2.259100642398287, "grad_norm": 0.2061227664972564, "learning_rate": 1.3730158730158733e-05, "loss": 0.273, "step": 1055 }, { "epoch": 2.2612419700214135, "grad_norm": 0.1958943903490819, "learning_rate": 1.3690476190476192e-05, "loss": 0.281, "step": 1056 }, { "epoch": 2.2633832976445394, "grad_norm": 0.17986444789632078, "learning_rate": 1.365079365079365e-05, "loss": 0.282, "step": 1057 }, { "epoch": 2.265524625267666, "grad_norm": 0.20331838773796787, "learning_rate": 1.3611111111111111e-05, "loss": 0.2822, "step": 1058 }, { "epoch": 2.267665952890792, "grad_norm": 0.1854886814182152, "learning_rate": 1.357142857142857e-05, "loss": 0.2871, "step": 1059 }, { "epoch": 2.2698072805139184, "grad_norm": 0.19038399977635145, "learning_rate": 1.3531746031746031e-05, "loss": 0.2705, "step": 1060 }, { "epoch": 2.271948608137045, "grad_norm": 0.18815130656204335, "learning_rate": 1.3492063492063492e-05, "loss": 0.2756, "step": 1061 }, { "epoch": 2.274089935760171, "grad_norm": 0.19455215664990538, "learning_rate": 1.3452380952380953e-05, "loss": 0.283, "step": 1062 }, { "epoch": 2.2762312633832975, "grad_norm": 0.20059126614368925, "learning_rate": 1.3412698412698413e-05, "loss": 0.2785, "step": 1063 }, { "epoch": 2.278372591006424, "grad_norm": 0.20535076737597538, "learning_rate": 1.3373015873015873e-05, "loss": 0.2827, "step": 1064 }, { "epoch": 2.28051391862955, "grad_norm": 0.19133006544393558, "learning_rate": 1.3333333333333333e-05, "loss": 0.2876, "step": 1065 }, { "epoch": 2.2826552462526766, "grad_norm": 0.18259775238692044, "learning_rate": 1.3293650793650794e-05, "loss": 0.2778, "step": 1066 }, { "epoch": 2.284796573875803, "grad_norm": 0.18347983126035391, "learning_rate": 1.3253968253968255e-05, "loss": 0.2761, "step": 1067 }, { "epoch": 2.2869379014989293, "grad_norm": 0.18777852570980433, "learning_rate": 1.3214285714285716e-05, "loss": 0.2818, "step": 1068 }, { "epoch": 2.2890792291220556, "grad_norm": 0.18410525589405727, "learning_rate": 1.3174603174603176e-05, "loss": 0.2656, "step": 1069 }, { "epoch": 2.291220556745182, "grad_norm": 0.19035078229116137, "learning_rate": 1.3134920634920635e-05, "loss": 0.2816, "step": 1070 }, { "epoch": 2.2933618843683083, "grad_norm": 0.20015240192900935, "learning_rate": 1.3095238095238096e-05, "loss": 0.3031, "step": 1071 }, { "epoch": 2.2955032119914347, "grad_norm": 0.18184078914671584, "learning_rate": 1.3055555555555557e-05, "loss": 0.2784, "step": 1072 }, { "epoch": 2.297644539614561, "grad_norm": 0.19404174251026468, "learning_rate": 1.3015873015873018e-05, "loss": 0.2761, "step": 1073 }, { "epoch": 2.2997858672376874, "grad_norm": 0.18586683492531947, "learning_rate": 1.2976190476190478e-05, "loss": 0.2781, "step": 1074 }, { "epoch": 2.3019271948608138, "grad_norm": 0.18361259801329616, "learning_rate": 1.2936507936507939e-05, "loss": 0.2588, "step": 1075 }, { "epoch": 2.30406852248394, "grad_norm": 0.19747669576089893, "learning_rate": 1.2896825396825398e-05, "loss": 0.2819, "step": 1076 }, { "epoch": 2.3062098501070665, "grad_norm": 0.1962675355280158, "learning_rate": 1.2857142857142857e-05, "loss": 0.2755, "step": 1077 }, { "epoch": 2.308351177730193, "grad_norm": 0.19942980028292878, "learning_rate": 1.2817460317460316e-05, "loss": 0.2818, "step": 1078 }, { "epoch": 2.310492505353319, "grad_norm": 0.19501376392056294, "learning_rate": 1.2777777777777777e-05, "loss": 0.2882, "step": 1079 }, { "epoch": 2.3126338329764455, "grad_norm": 0.18868853481419254, "learning_rate": 1.2738095238095238e-05, "loss": 0.2762, "step": 1080 }, { "epoch": 2.314775160599572, "grad_norm": 0.18259220767781897, "learning_rate": 1.2698412698412699e-05, "loss": 0.2862, "step": 1081 }, { "epoch": 2.3169164882226982, "grad_norm": 0.20057460578986203, "learning_rate": 1.265873015873016e-05, "loss": 0.289, "step": 1082 }, { "epoch": 2.3190578158458246, "grad_norm": 0.20457960878239465, "learning_rate": 1.261904761904762e-05, "loss": 0.2855, "step": 1083 }, { "epoch": 2.3211991434689505, "grad_norm": 0.19996856442993224, "learning_rate": 1.2579365079365079e-05, "loss": 0.2831, "step": 1084 }, { "epoch": 2.3233404710920773, "grad_norm": 0.19193162471753858, "learning_rate": 1.253968253968254e-05, "loss": 0.2788, "step": 1085 }, { "epoch": 2.325481798715203, "grad_norm": 0.20129531906043394, "learning_rate": 1.25e-05, "loss": 0.2653, "step": 1086 }, { "epoch": 2.3276231263383296, "grad_norm": 0.19714902241567792, "learning_rate": 1.2460317460317461e-05, "loss": 0.2898, "step": 1087 }, { "epoch": 2.329764453961456, "grad_norm": 0.20515273593194386, "learning_rate": 1.2420634920634922e-05, "loss": 0.2977, "step": 1088 }, { "epoch": 2.3319057815845823, "grad_norm": 0.19580178817727573, "learning_rate": 1.2380952380952381e-05, "loss": 0.2802, "step": 1089 }, { "epoch": 2.3340471092077086, "grad_norm": 0.17768572531273108, "learning_rate": 1.2341269841269842e-05, "loss": 0.2703, "step": 1090 }, { "epoch": 2.336188436830835, "grad_norm": 0.1956542549620933, "learning_rate": 1.2301587301587301e-05, "loss": 0.2757, "step": 1091 }, { "epoch": 2.3383297644539613, "grad_norm": 0.20127910379978708, "learning_rate": 1.2261904761904762e-05, "loss": 0.2837, "step": 1092 }, { "epoch": 2.3404710920770877, "grad_norm": 0.20060227683373613, "learning_rate": 1.2222222222222222e-05, "loss": 0.2802, "step": 1093 }, { "epoch": 2.342612419700214, "grad_norm": 0.19431031021153464, "learning_rate": 1.2182539682539683e-05, "loss": 0.291, "step": 1094 }, { "epoch": 2.3447537473233404, "grad_norm": 0.19260339789181197, "learning_rate": 1.2142857142857144e-05, "loss": 0.2802, "step": 1095 }, { "epoch": 2.3468950749464668, "grad_norm": 0.20821596374744575, "learning_rate": 1.2103174603174603e-05, "loss": 0.2866, "step": 1096 }, { "epoch": 2.349036402569593, "grad_norm": 0.20675441129158606, "learning_rate": 1.2063492063492064e-05, "loss": 0.2926, "step": 1097 }, { "epoch": 2.3511777301927195, "grad_norm": 0.1968034926711773, "learning_rate": 1.2023809523809525e-05, "loss": 0.2757, "step": 1098 }, { "epoch": 2.353319057815846, "grad_norm": 0.18488353953473421, "learning_rate": 1.1984126984126985e-05, "loss": 0.2868, "step": 1099 }, { "epoch": 2.355460385438972, "grad_norm": 0.20338110721794142, "learning_rate": 1.1944444444444446e-05, "loss": 0.2882, "step": 1100 }, { "epoch": 2.3576017130620985, "grad_norm": 0.19166668181356114, "learning_rate": 1.1904761904761905e-05, "loss": 0.292, "step": 1101 }, { "epoch": 2.359743040685225, "grad_norm": 0.1927455443301824, "learning_rate": 1.1865079365079366e-05, "loss": 0.2649, "step": 1102 }, { "epoch": 2.3618843683083512, "grad_norm": 0.17477212497660993, "learning_rate": 1.1825396825396825e-05, "loss": 0.2752, "step": 1103 }, { "epoch": 2.3640256959314776, "grad_norm": 0.19987372705216191, "learning_rate": 1.1785714285714286e-05, "loss": 0.2819, "step": 1104 }, { "epoch": 2.366167023554604, "grad_norm": 0.1922206507729967, "learning_rate": 1.1746031746031746e-05, "loss": 0.2823, "step": 1105 }, { "epoch": 2.3683083511777303, "grad_norm": 0.18451313459925092, "learning_rate": 1.1706349206349207e-05, "loss": 0.2858, "step": 1106 }, { "epoch": 2.3704496788008567, "grad_norm": 0.194347680132915, "learning_rate": 1.1666666666666668e-05, "loss": 0.2885, "step": 1107 }, { "epoch": 2.372591006423983, "grad_norm": 0.19502504685509406, "learning_rate": 1.1626984126984129e-05, "loss": 0.2818, "step": 1108 }, { "epoch": 2.3747323340471094, "grad_norm": 0.18898431759295656, "learning_rate": 1.1587301587301588e-05, "loss": 0.2904, "step": 1109 }, { "epoch": 2.3768736616702357, "grad_norm": 0.19153164662259894, "learning_rate": 1.1547619047619048e-05, "loss": 0.2726, "step": 1110 }, { "epoch": 2.3790149892933616, "grad_norm": 0.18268604277337688, "learning_rate": 1.1507936507936508e-05, "loss": 0.279, "step": 1111 }, { "epoch": 2.3811563169164884, "grad_norm": 0.17533082071419925, "learning_rate": 1.1468253968253968e-05, "loss": 0.2732, "step": 1112 }, { "epoch": 2.3832976445396143, "grad_norm": 0.2003790102740343, "learning_rate": 1.1428571428571429e-05, "loss": 0.2951, "step": 1113 }, { "epoch": 2.385438972162741, "grad_norm": 0.18705857015717448, "learning_rate": 1.138888888888889e-05, "loss": 0.2793, "step": 1114 }, { "epoch": 2.387580299785867, "grad_norm": 0.1868482525264962, "learning_rate": 1.1349206349206349e-05, "loss": 0.2747, "step": 1115 }, { "epoch": 2.3897216274089934, "grad_norm": 0.18021169330552053, "learning_rate": 1.130952380952381e-05, "loss": 0.2812, "step": 1116 }, { "epoch": 2.3918629550321198, "grad_norm": 0.19301415468138103, "learning_rate": 1.126984126984127e-05, "loss": 0.2886, "step": 1117 }, { "epoch": 2.394004282655246, "grad_norm": 0.19072636775125612, "learning_rate": 1.1230158730158731e-05, "loss": 0.2806, "step": 1118 }, { "epoch": 2.3961456102783725, "grad_norm": 0.1840802920208615, "learning_rate": 1.1190476190476192e-05, "loss": 0.2879, "step": 1119 }, { "epoch": 2.398286937901499, "grad_norm": 0.18258129644157975, "learning_rate": 1.1150793650793653e-05, "loss": 0.2803, "step": 1120 }, { "epoch": 2.400428265524625, "grad_norm": 0.1827848542873828, "learning_rate": 1.1111111111111112e-05, "loss": 0.2881, "step": 1121 }, { "epoch": 2.4025695931477515, "grad_norm": 0.18900064907660674, "learning_rate": 1.107142857142857e-05, "loss": 0.2938, "step": 1122 }, { "epoch": 2.404710920770878, "grad_norm": 0.18804850484657887, "learning_rate": 1.1031746031746031e-05, "loss": 0.2858, "step": 1123 }, { "epoch": 2.4068522483940042, "grad_norm": 0.2006000850638827, "learning_rate": 1.0992063492063492e-05, "loss": 0.2876, "step": 1124 }, { "epoch": 2.4089935760171306, "grad_norm": 0.18714684164420253, "learning_rate": 1.0952380952380953e-05, "loss": 0.2803, "step": 1125 }, { "epoch": 2.411134903640257, "grad_norm": 0.19792928506355778, "learning_rate": 1.0912698412698414e-05, "loss": 0.2936, "step": 1126 }, { "epoch": 2.4132762312633833, "grad_norm": 0.18865247304598853, "learning_rate": 1.0873015873015874e-05, "loss": 0.2778, "step": 1127 }, { "epoch": 2.4154175588865097, "grad_norm": 0.1929275031551357, "learning_rate": 1.0833333333333334e-05, "loss": 0.2777, "step": 1128 }, { "epoch": 2.417558886509636, "grad_norm": 0.1806973386093782, "learning_rate": 1.0793650793650794e-05, "loss": 0.2763, "step": 1129 }, { "epoch": 2.4197002141327624, "grad_norm": 0.19706536820552648, "learning_rate": 1.0753968253968255e-05, "loss": 0.2883, "step": 1130 }, { "epoch": 2.4218415417558887, "grad_norm": 0.1910372018652057, "learning_rate": 1.0714285714285714e-05, "loss": 0.2837, "step": 1131 }, { "epoch": 2.423982869379015, "grad_norm": 0.1788916292232767, "learning_rate": 1.0674603174603175e-05, "loss": 0.2841, "step": 1132 }, { "epoch": 2.4261241970021414, "grad_norm": 0.18659149187977309, "learning_rate": 1.0634920634920636e-05, "loss": 0.2894, "step": 1133 }, { "epoch": 2.428265524625268, "grad_norm": 0.1925123938840746, "learning_rate": 1.0595238095238096e-05, "loss": 0.2838, "step": 1134 }, { "epoch": 2.430406852248394, "grad_norm": 0.1776215701785684, "learning_rate": 1.0555555555555555e-05, "loss": 0.2707, "step": 1135 }, { "epoch": 2.4325481798715205, "grad_norm": 0.20843365977457115, "learning_rate": 1.0515873015873016e-05, "loss": 0.2892, "step": 1136 }, { "epoch": 2.434689507494647, "grad_norm": 0.19764138815197937, "learning_rate": 1.0476190476190477e-05, "loss": 0.2648, "step": 1137 }, { "epoch": 2.436830835117773, "grad_norm": 0.19252613410306155, "learning_rate": 1.0436507936507938e-05, "loss": 0.2911, "step": 1138 }, { "epoch": 2.4389721627408996, "grad_norm": 0.20085600997800981, "learning_rate": 1.0396825396825398e-05, "loss": 0.2826, "step": 1139 }, { "epoch": 2.4411134903640255, "grad_norm": 0.188049018268479, "learning_rate": 1.0357142857142859e-05, "loss": 0.2911, "step": 1140 }, { "epoch": 2.4432548179871523, "grad_norm": 0.19493824847548077, "learning_rate": 1.0317460317460318e-05, "loss": 0.286, "step": 1141 }, { "epoch": 2.445396145610278, "grad_norm": 0.191504777727128, "learning_rate": 1.0277777777777777e-05, "loss": 0.2933, "step": 1142 }, { "epoch": 2.4475374732334045, "grad_norm": 0.19108814910860578, "learning_rate": 1.0238095238095238e-05, "loss": 0.2785, "step": 1143 }, { "epoch": 2.449678800856531, "grad_norm": 0.20799246077044664, "learning_rate": 1.0198412698412699e-05, "loss": 0.2837, "step": 1144 }, { "epoch": 2.4518201284796572, "grad_norm": 0.1824764184302531, "learning_rate": 1.015873015873016e-05, "loss": 0.2652, "step": 1145 }, { "epoch": 2.4539614561027836, "grad_norm": 0.1824775079826765, "learning_rate": 1.011904761904762e-05, "loss": 0.2716, "step": 1146 }, { "epoch": 2.45610278372591, "grad_norm": 0.1932203896888414, "learning_rate": 1.007936507936508e-05, "loss": 0.276, "step": 1147 }, { "epoch": 2.4582441113490363, "grad_norm": 0.2007871128283347, "learning_rate": 1.003968253968254e-05, "loss": 0.264, "step": 1148 }, { "epoch": 2.4603854389721627, "grad_norm": 0.21083133706351928, "learning_rate": 1e-05, "loss": 0.2816, "step": 1149 }, { "epoch": 2.462526766595289, "grad_norm": 0.19278202499501207, "learning_rate": 9.96031746031746e-06, "loss": 0.2823, "step": 1150 }, { "epoch": 2.4646680942184154, "grad_norm": 0.18797686590775003, "learning_rate": 9.92063492063492e-06, "loss": 0.2879, "step": 1151 }, { "epoch": 2.4668094218415417, "grad_norm": 0.20454699746081098, "learning_rate": 9.880952380952381e-06, "loss": 0.2672, "step": 1152 }, { "epoch": 2.468950749464668, "grad_norm": 0.18367484130511327, "learning_rate": 9.841269841269842e-06, "loss": 0.2806, "step": 1153 }, { "epoch": 2.4710920770877944, "grad_norm": 0.1831686319686948, "learning_rate": 9.801587301587301e-06, "loss": 0.273, "step": 1154 }, { "epoch": 2.473233404710921, "grad_norm": 0.19694402234544842, "learning_rate": 9.761904761904762e-06, "loss": 0.2878, "step": 1155 }, { "epoch": 2.475374732334047, "grad_norm": 0.19408589237490334, "learning_rate": 9.722222222222223e-06, "loss": 0.2943, "step": 1156 }, { "epoch": 2.4775160599571735, "grad_norm": 0.19445588500969652, "learning_rate": 9.682539682539683e-06, "loss": 0.2784, "step": 1157 }, { "epoch": 2.4796573875803, "grad_norm": 0.17963888726600663, "learning_rate": 9.642857142857144e-06, "loss": 0.2752, "step": 1158 }, { "epoch": 2.481798715203426, "grad_norm": 0.19347132870813594, "learning_rate": 9.603174603174605e-06, "loss": 0.2867, "step": 1159 }, { "epoch": 2.4839400428265526, "grad_norm": 0.18730325675923679, "learning_rate": 9.563492063492064e-06, "loss": 0.2762, "step": 1160 }, { "epoch": 2.486081370449679, "grad_norm": 0.18460049863260458, "learning_rate": 9.523809523809523e-06, "loss": 0.2872, "step": 1161 }, { "epoch": 2.4882226980728053, "grad_norm": 0.1907698896483033, "learning_rate": 9.484126984126984e-06, "loss": 0.287, "step": 1162 }, { "epoch": 2.4903640256959316, "grad_norm": 0.18810661893439526, "learning_rate": 9.444444444444445e-06, "loss": 0.2797, "step": 1163 }, { "epoch": 2.492505353319058, "grad_norm": 0.19192431651136277, "learning_rate": 9.404761904761905e-06, "loss": 0.2779, "step": 1164 }, { "epoch": 2.4946466809421843, "grad_norm": 0.1791506541350854, "learning_rate": 9.365079365079366e-06, "loss": 0.278, "step": 1165 }, { "epoch": 2.4967880085653107, "grad_norm": 0.18069237791730375, "learning_rate": 9.325396825396827e-06, "loss": 0.2792, "step": 1166 }, { "epoch": 2.4989293361884366, "grad_norm": 0.18412348076309296, "learning_rate": 9.285714285714286e-06, "loss": 0.2771, "step": 1167 }, { "epoch": 2.5010706638115634, "grad_norm": 0.1813880522758016, "learning_rate": 9.246031746031747e-06, "loss": 0.2835, "step": 1168 }, { "epoch": 2.5032119914346893, "grad_norm": 0.19273095158091466, "learning_rate": 9.206349206349207e-06, "loss": 0.2923, "step": 1169 }, { "epoch": 2.505353319057816, "grad_norm": 0.18337449464589062, "learning_rate": 9.166666666666666e-06, "loss": 0.2884, "step": 1170 }, { "epoch": 2.507494646680942, "grad_norm": 0.19024259739381238, "learning_rate": 9.126984126984127e-06, "loss": 0.2956, "step": 1171 }, { "epoch": 2.5096359743040684, "grad_norm": 0.19901572512340374, "learning_rate": 9.087301587301588e-06, "loss": 0.2737, "step": 1172 }, { "epoch": 2.5117773019271947, "grad_norm": 0.194998468534562, "learning_rate": 9.047619047619047e-06, "loss": 0.28, "step": 1173 }, { "epoch": 2.513918629550321, "grad_norm": 0.18412933977392476, "learning_rate": 9.007936507936508e-06, "loss": 0.2819, "step": 1174 }, { "epoch": 2.5160599571734474, "grad_norm": 0.18374938776839936, "learning_rate": 8.968253968253968e-06, "loss": 0.2813, "step": 1175 }, { "epoch": 2.518201284796574, "grad_norm": 0.18751207889857766, "learning_rate": 8.92857142857143e-06, "loss": 0.2831, "step": 1176 }, { "epoch": 2.5203426124197, "grad_norm": 0.17923076174542152, "learning_rate": 8.88888888888889e-06, "loss": 0.2682, "step": 1177 }, { "epoch": 2.5224839400428265, "grad_norm": 0.18035308954107676, "learning_rate": 8.84920634920635e-06, "loss": 0.2677, "step": 1178 }, { "epoch": 2.524625267665953, "grad_norm": 0.22973038098447218, "learning_rate": 8.80952380952381e-06, "loss": 0.2874, "step": 1179 }, { "epoch": 2.526766595289079, "grad_norm": 0.18325732719083584, "learning_rate": 8.769841269841269e-06, "loss": 0.271, "step": 1180 }, { "epoch": 2.5289079229122056, "grad_norm": 0.1838482554089651, "learning_rate": 8.73015873015873e-06, "loss": 0.2743, "step": 1181 }, { "epoch": 2.531049250535332, "grad_norm": 0.18469422748009917, "learning_rate": 8.69047619047619e-06, "loss": 0.2809, "step": 1182 }, { "epoch": 2.5331905781584583, "grad_norm": 0.19824918591422117, "learning_rate": 8.650793650793651e-06, "loss": 0.2956, "step": 1183 }, { "epoch": 2.5353319057815846, "grad_norm": 0.18477009428383825, "learning_rate": 8.611111111111112e-06, "loss": 0.2746, "step": 1184 }, { "epoch": 2.537473233404711, "grad_norm": 0.18367643368070655, "learning_rate": 8.571428571428573e-06, "loss": 0.2779, "step": 1185 }, { "epoch": 2.5396145610278373, "grad_norm": 0.18552020064619668, "learning_rate": 8.531746031746032e-06, "loss": 0.2726, "step": 1186 }, { "epoch": 2.5417558886509637, "grad_norm": 0.18823754582760785, "learning_rate": 8.492063492063492e-06, "loss": 0.2936, "step": 1187 }, { "epoch": 2.54389721627409, "grad_norm": 0.1838357043381974, "learning_rate": 8.452380952380953e-06, "loss": 0.2754, "step": 1188 }, { "epoch": 2.5460385438972164, "grad_norm": 0.1930931282449352, "learning_rate": 8.412698412698414e-06, "loss": 0.2848, "step": 1189 }, { "epoch": 2.5481798715203428, "grad_norm": 0.18519908919348946, "learning_rate": 8.373015873015873e-06, "loss": 0.2776, "step": 1190 }, { "epoch": 2.550321199143469, "grad_norm": 0.18754273175059186, "learning_rate": 8.333333333333334e-06, "loss": 0.2799, "step": 1191 }, { "epoch": 2.552462526766595, "grad_norm": 0.18545398781411668, "learning_rate": 8.293650793650794e-06, "loss": 0.2651, "step": 1192 }, { "epoch": 2.554603854389722, "grad_norm": 0.1847349009972924, "learning_rate": 8.253968253968254e-06, "loss": 0.2714, "step": 1193 }, { "epoch": 2.5567451820128477, "grad_norm": 0.19268643145654266, "learning_rate": 8.214285714285714e-06, "loss": 0.279, "step": 1194 }, { "epoch": 2.5588865096359745, "grad_norm": 0.19778277420764503, "learning_rate": 8.174603174603175e-06, "loss": 0.2778, "step": 1195 }, { "epoch": 2.5610278372591004, "grad_norm": 0.1998660395641265, "learning_rate": 8.134920634920636e-06, "loss": 0.2864, "step": 1196 }, { "epoch": 2.5631691648822272, "grad_norm": 0.19294158536685677, "learning_rate": 8.095238095238097e-06, "loss": 0.2696, "step": 1197 }, { "epoch": 2.565310492505353, "grad_norm": 0.18786548806483286, "learning_rate": 8.055555555555557e-06, "loss": 0.2692, "step": 1198 }, { "epoch": 2.5674518201284795, "grad_norm": 0.19390591817924888, "learning_rate": 8.015873015873016e-06, "loss": 0.278, "step": 1199 }, { "epoch": 2.569593147751606, "grad_norm": 0.18907409735630998, "learning_rate": 7.976190476190475e-06, "loss": 0.2885, "step": 1200 }, { "epoch": 2.571734475374732, "grad_norm": 0.20258943279374564, "learning_rate": 7.936507936507936e-06, "loss": 0.303, "step": 1201 }, { "epoch": 2.5738758029978586, "grad_norm": 0.19454350482759478, "learning_rate": 7.896825396825397e-06, "loss": 0.2739, "step": 1202 }, { "epoch": 2.576017130620985, "grad_norm": 0.18958864960792593, "learning_rate": 7.857142857142858e-06, "loss": 0.2804, "step": 1203 }, { "epoch": 2.5781584582441113, "grad_norm": 0.1982183376800514, "learning_rate": 7.817460317460318e-06, "loss": 0.2991, "step": 1204 }, { "epoch": 2.5802997858672376, "grad_norm": 0.1913275931410259, "learning_rate": 7.777777777777777e-06, "loss": 0.2888, "step": 1205 }, { "epoch": 2.582441113490364, "grad_norm": 0.21019225544474518, "learning_rate": 7.738095238095238e-06, "loss": 0.2887, "step": 1206 }, { "epoch": 2.5845824411134903, "grad_norm": 0.2042220303342691, "learning_rate": 7.698412698412699e-06, "loss": 0.3031, "step": 1207 }, { "epoch": 2.5867237687366167, "grad_norm": 0.19819365750871445, "learning_rate": 7.65873015873016e-06, "loss": 0.2896, "step": 1208 }, { "epoch": 2.588865096359743, "grad_norm": 0.18837665341574075, "learning_rate": 7.6190476190476205e-06, "loss": 0.2775, "step": 1209 }, { "epoch": 2.5910064239828694, "grad_norm": 0.18886359577025771, "learning_rate": 7.5793650793650795e-06, "loss": 0.2718, "step": 1210 }, { "epoch": 2.5931477516059958, "grad_norm": 0.19164944758078695, "learning_rate": 7.5396825396825394e-06, "loss": 0.2705, "step": 1211 }, { "epoch": 2.595289079229122, "grad_norm": 0.1946688680184309, "learning_rate": 7.5e-06, "loss": 0.2817, "step": 1212 }, { "epoch": 2.5974304068522485, "grad_norm": 0.17995484096025638, "learning_rate": 7.460317460317461e-06, "loss": 0.2763, "step": 1213 }, { "epoch": 2.599571734475375, "grad_norm": 0.18827882235831994, "learning_rate": 7.420634920634921e-06, "loss": 0.2649, "step": 1214 }, { "epoch": 2.601713062098501, "grad_norm": 0.1911172359855126, "learning_rate": 7.380952380952382e-06, "loss": 0.2785, "step": 1215 }, { "epoch": 2.6038543897216275, "grad_norm": 0.20732711252193384, "learning_rate": 7.3412698412698415e-06, "loss": 0.2956, "step": 1216 }, { "epoch": 2.605995717344754, "grad_norm": 0.19615616622963164, "learning_rate": 7.301587301587302e-06, "loss": 0.2961, "step": 1217 }, { "epoch": 2.6081370449678802, "grad_norm": 0.18728761641057254, "learning_rate": 7.261904761904763e-06, "loss": 0.2884, "step": 1218 }, { "epoch": 2.6102783725910066, "grad_norm": 0.18873348095093828, "learning_rate": 7.222222222222222e-06, "loss": 0.2788, "step": 1219 }, { "epoch": 2.612419700214133, "grad_norm": 0.1895538111432152, "learning_rate": 7.182539682539682e-06, "loss": 0.2874, "step": 1220 }, { "epoch": 2.614561027837259, "grad_norm": 0.17912222034312614, "learning_rate": 7.142857142857143e-06, "loss": 0.2712, "step": 1221 }, { "epoch": 2.6167023554603857, "grad_norm": 0.18895587187642374, "learning_rate": 7.1031746031746035e-06, "loss": 0.2777, "step": 1222 }, { "epoch": 2.6188436830835116, "grad_norm": 0.18078738265388186, "learning_rate": 7.063492063492063e-06, "loss": 0.282, "step": 1223 }, { "epoch": 2.6209850107066384, "grad_norm": 0.1761591551782532, "learning_rate": 7.023809523809524e-06, "loss": 0.2746, "step": 1224 }, { "epoch": 2.6231263383297643, "grad_norm": 0.18492923876395373, "learning_rate": 6.984126984126985e-06, "loss": 0.2758, "step": 1225 }, { "epoch": 2.6252676659528906, "grad_norm": 0.18920352772469154, "learning_rate": 6.944444444444445e-06, "loss": 0.2711, "step": 1226 }, { "epoch": 2.627408993576017, "grad_norm": 0.18550091010732825, "learning_rate": 6.9047619047619055e-06, "loss": 0.2697, "step": 1227 }, { "epoch": 2.6295503211991433, "grad_norm": 0.1785541730706682, "learning_rate": 6.865079365079366e-06, "loss": 0.2763, "step": 1228 }, { "epoch": 2.6316916488222697, "grad_norm": 0.17857492350218257, "learning_rate": 6.825396825396825e-06, "loss": 0.2691, "step": 1229 }, { "epoch": 2.633832976445396, "grad_norm": 0.1753222130027031, "learning_rate": 6.785714285714285e-06, "loss": 0.2717, "step": 1230 }, { "epoch": 2.6359743040685224, "grad_norm": 0.19278231524559503, "learning_rate": 6.746031746031746e-06, "loss": 0.2784, "step": 1231 }, { "epoch": 2.6381156316916488, "grad_norm": 0.20070264553264905, "learning_rate": 6.706349206349207e-06, "loss": 0.2987, "step": 1232 }, { "epoch": 2.640256959314775, "grad_norm": 0.2010595496902971, "learning_rate": 6.666666666666667e-06, "loss": 0.3015, "step": 1233 }, { "epoch": 2.6423982869379015, "grad_norm": 0.18108631640949424, "learning_rate": 6.626984126984127e-06, "loss": 0.2814, "step": 1234 }, { "epoch": 2.644539614561028, "grad_norm": 0.18555134865906892, "learning_rate": 6.587301587301588e-06, "loss": 0.2677, "step": 1235 }, { "epoch": 2.646680942184154, "grad_norm": 0.18862946658278595, "learning_rate": 6.547619047619048e-06, "loss": 0.2738, "step": 1236 }, { "epoch": 2.6488222698072805, "grad_norm": 0.1859370117993973, "learning_rate": 6.507936507936509e-06, "loss": 0.2845, "step": 1237 }, { "epoch": 2.650963597430407, "grad_norm": 0.18716469379849815, "learning_rate": 6.4682539682539696e-06, "loss": 0.2716, "step": 1238 }, { "epoch": 2.6531049250535332, "grad_norm": 0.18552514906798132, "learning_rate": 6.428571428571429e-06, "loss": 0.2742, "step": 1239 }, { "epoch": 2.6552462526766596, "grad_norm": 0.19953825231892838, "learning_rate": 6.3888888888888885e-06, "loss": 0.2964, "step": 1240 }, { "epoch": 2.657387580299786, "grad_norm": 0.18682163268591973, "learning_rate": 6.349206349206349e-06, "loss": 0.2692, "step": 1241 }, { "epoch": 2.6595289079229123, "grad_norm": 0.1794385339174016, "learning_rate": 6.30952380952381e-06, "loss": 0.2703, "step": 1242 }, { "epoch": 2.6616702355460387, "grad_norm": 0.1887882853913073, "learning_rate": 6.26984126984127e-06, "loss": 0.2874, "step": 1243 }, { "epoch": 2.663811563169165, "grad_norm": 0.1872385299129515, "learning_rate": 6.230158730158731e-06, "loss": 0.2863, "step": 1244 }, { "epoch": 2.6659528907922914, "grad_norm": 0.1819046482781277, "learning_rate": 6.190476190476191e-06, "loss": 0.2705, "step": 1245 }, { "epoch": 2.6680942184154177, "grad_norm": 0.19114242870168724, "learning_rate": 6.1507936507936505e-06, "loss": 0.2782, "step": 1246 }, { "epoch": 2.670235546038544, "grad_norm": 0.19244904517702394, "learning_rate": 6.111111111111111e-06, "loss": 0.2683, "step": 1247 }, { "epoch": 2.67237687366167, "grad_norm": 0.18118655863654376, "learning_rate": 6.071428571428572e-06, "loss": 0.2776, "step": 1248 }, { "epoch": 2.674518201284797, "grad_norm": 0.1895477899880136, "learning_rate": 6.031746031746032e-06, "loss": 0.2955, "step": 1249 }, { "epoch": 2.6766595289079227, "grad_norm": 0.2037032863019117, "learning_rate": 5.992063492063493e-06, "loss": 0.278, "step": 1250 }, { "epoch": 2.6788008565310495, "grad_norm": 0.215090909915821, "learning_rate": 5.9523809523809525e-06, "loss": 0.2779, "step": 1251 }, { "epoch": 2.6809421841541754, "grad_norm": 0.1806317361792395, "learning_rate": 5.9126984126984124e-06, "loss": 0.279, "step": 1252 }, { "epoch": 2.683083511777302, "grad_norm": 0.17940601877117468, "learning_rate": 5.873015873015873e-06, "loss": 0.2834, "step": 1253 }, { "epoch": 2.685224839400428, "grad_norm": 0.181650657819111, "learning_rate": 5.833333333333334e-06, "loss": 0.2867, "step": 1254 }, { "epoch": 2.6873661670235545, "grad_norm": 0.19030036855437318, "learning_rate": 5.793650793650794e-06, "loss": 0.2742, "step": 1255 }, { "epoch": 2.689507494646681, "grad_norm": 0.18217728969404254, "learning_rate": 5.753968253968254e-06, "loss": 0.2798, "step": 1256 }, { "epoch": 2.691648822269807, "grad_norm": 0.17476001635638663, "learning_rate": 5.7142857142857145e-06, "loss": 0.2736, "step": 1257 }, { "epoch": 2.6937901498929335, "grad_norm": 0.16928638733834248, "learning_rate": 5.674603174603174e-06, "loss": 0.2695, "step": 1258 }, { "epoch": 2.69593147751606, "grad_norm": 0.17143125336122542, "learning_rate": 5.634920634920635e-06, "loss": 0.2746, "step": 1259 }, { "epoch": 2.6980728051391862, "grad_norm": 0.17538867599551303, "learning_rate": 5.595238095238096e-06, "loss": 0.2829, "step": 1260 }, { "epoch": 2.7002141327623126, "grad_norm": 0.17983290126938117, "learning_rate": 5.555555555555556e-06, "loss": 0.2702, "step": 1261 }, { "epoch": 2.702355460385439, "grad_norm": 0.17677089972588253, "learning_rate": 5.515873015873016e-06, "loss": 0.2843, "step": 1262 }, { "epoch": 2.7044967880085653, "grad_norm": 0.1844485018306519, "learning_rate": 5.4761904761904765e-06, "loss": 0.2813, "step": 1263 }, { "epoch": 2.7066381156316917, "grad_norm": 0.18659859307800958, "learning_rate": 5.436507936507937e-06, "loss": 0.2795, "step": 1264 }, { "epoch": 2.708779443254818, "grad_norm": 0.17342140616127946, "learning_rate": 5.396825396825397e-06, "loss": 0.2905, "step": 1265 }, { "epoch": 2.7109207708779444, "grad_norm": 0.1751230736265568, "learning_rate": 5.357142857142857e-06, "loss": 0.2769, "step": 1266 }, { "epoch": 2.7130620985010707, "grad_norm": 0.18568051901207977, "learning_rate": 5.317460317460318e-06, "loss": 0.2744, "step": 1267 }, { "epoch": 2.715203426124197, "grad_norm": 0.18188150911305687, "learning_rate": 5.277777777777778e-06, "loss": 0.2811, "step": 1268 }, { "epoch": 2.7173447537473234, "grad_norm": 0.18160082816537443, "learning_rate": 5.2380952380952384e-06, "loss": 0.2776, "step": 1269 }, { "epoch": 2.71948608137045, "grad_norm": 0.178758160602488, "learning_rate": 5.198412698412699e-06, "loss": 0.2651, "step": 1270 }, { "epoch": 2.721627408993576, "grad_norm": 0.17965972231368907, "learning_rate": 5.158730158730159e-06, "loss": 0.2818, "step": 1271 }, { "epoch": 2.7237687366167025, "grad_norm": 0.19233800455601202, "learning_rate": 5.119047619047619e-06, "loss": 0.2899, "step": 1272 }, { "epoch": 2.725910064239829, "grad_norm": 0.2193361858358134, "learning_rate": 5.07936507936508e-06, "loss": 0.3, "step": 1273 }, { "epoch": 2.728051391862955, "grad_norm": 0.19832192608275423, "learning_rate": 5.03968253968254e-06, "loss": 0.2809, "step": 1274 }, { "epoch": 2.730192719486081, "grad_norm": 0.18205252478668657, "learning_rate": 5e-06, "loss": 0.276, "step": 1275 }, { "epoch": 2.732334047109208, "grad_norm": 0.18256411369477107, "learning_rate": 4.96031746031746e-06, "loss": 0.2827, "step": 1276 }, { "epoch": 2.734475374732334, "grad_norm": 0.174235840356806, "learning_rate": 4.920634920634921e-06, "loss": 0.2824, "step": 1277 }, { "epoch": 2.7366167023554606, "grad_norm": 0.18107414207263575, "learning_rate": 4.880952380952381e-06, "loss": 0.287, "step": 1278 }, { "epoch": 2.7387580299785865, "grad_norm": 0.1843871245288681, "learning_rate": 4.841269841269842e-06, "loss": 0.2948, "step": 1279 }, { "epoch": 2.7408993576017133, "grad_norm": 0.18283457450751056, "learning_rate": 4.8015873015873025e-06, "loss": 0.2657, "step": 1280 }, { "epoch": 2.7430406852248392, "grad_norm": 0.1802921251741436, "learning_rate": 4.7619047619047615e-06, "loss": 0.2813, "step": 1281 }, { "epoch": 2.7451820128479656, "grad_norm": 0.17894436649475992, "learning_rate": 4.722222222222222e-06, "loss": 0.2775, "step": 1282 }, { "epoch": 2.747323340471092, "grad_norm": 0.1744670375408823, "learning_rate": 4.682539682539683e-06, "loss": 0.2629, "step": 1283 }, { "epoch": 2.7494646680942183, "grad_norm": 0.18257816801514468, "learning_rate": 4.642857142857143e-06, "loss": 0.2803, "step": 1284 }, { "epoch": 2.7516059957173447, "grad_norm": 0.19392465112933904, "learning_rate": 4.603174603174604e-06, "loss": 0.2836, "step": 1285 }, { "epoch": 2.753747323340471, "grad_norm": 0.18598309828859388, "learning_rate": 4.563492063492064e-06, "loss": 0.2703, "step": 1286 }, { "epoch": 2.7558886509635974, "grad_norm": 0.1844599683925985, "learning_rate": 4.5238095238095235e-06, "loss": 0.2928, "step": 1287 }, { "epoch": 2.7580299785867237, "grad_norm": 0.17453928950018807, "learning_rate": 4.484126984126984e-06, "loss": 0.2772, "step": 1288 }, { "epoch": 2.76017130620985, "grad_norm": 0.1825967298341021, "learning_rate": 4.444444444444445e-06, "loss": 0.2837, "step": 1289 }, { "epoch": 2.7623126338329764, "grad_norm": 0.17501549592815271, "learning_rate": 4.404761904761905e-06, "loss": 0.2708, "step": 1290 }, { "epoch": 2.764453961456103, "grad_norm": 0.17613103586860301, "learning_rate": 4.365079365079365e-06, "loss": 0.2812, "step": 1291 }, { "epoch": 2.766595289079229, "grad_norm": 0.18330117899392398, "learning_rate": 4.3253968253968256e-06, "loss": 0.2606, "step": 1292 }, { "epoch": 2.7687366167023555, "grad_norm": 0.18120028818225953, "learning_rate": 4.285714285714286e-06, "loss": 0.284, "step": 1293 }, { "epoch": 2.770877944325482, "grad_norm": 0.17679996406713563, "learning_rate": 4.246031746031746e-06, "loss": 0.2772, "step": 1294 }, { "epoch": 2.773019271948608, "grad_norm": 0.17472312513614138, "learning_rate": 4.206349206349207e-06, "loss": 0.281, "step": 1295 }, { "epoch": 2.7751605995717346, "grad_norm": 0.17058631787650447, "learning_rate": 4.166666666666667e-06, "loss": 0.2867, "step": 1296 }, { "epoch": 2.777301927194861, "grad_norm": 0.1724204672936674, "learning_rate": 4.126984126984127e-06, "loss": 0.2753, "step": 1297 }, { "epoch": 2.7794432548179873, "grad_norm": 0.1783817683336482, "learning_rate": 4.0873015873015875e-06, "loss": 0.2734, "step": 1298 }, { "epoch": 2.7815845824411136, "grad_norm": 0.1768963927750415, "learning_rate": 4.047619047619048e-06, "loss": 0.286, "step": 1299 }, { "epoch": 2.78372591006424, "grad_norm": 0.1785831360758202, "learning_rate": 4.007936507936508e-06, "loss": 0.288, "step": 1300 }, { "epoch": 2.7858672376873663, "grad_norm": 0.1973578170792123, "learning_rate": 3.968253968253968e-06, "loss": 0.2851, "step": 1301 }, { "epoch": 2.7880085653104922, "grad_norm": 0.17966086659509484, "learning_rate": 3.928571428571429e-06, "loss": 0.2764, "step": 1302 }, { "epoch": 2.790149892933619, "grad_norm": 0.18832071036234518, "learning_rate": 3.888888888888889e-06, "loss": 0.2722, "step": 1303 }, { "epoch": 2.792291220556745, "grad_norm": 0.17616870935040177, "learning_rate": 3.8492063492063495e-06, "loss": 0.2799, "step": 1304 }, { "epoch": 2.7944325481798717, "grad_norm": 0.18136072174228499, "learning_rate": 3.8095238095238102e-06, "loss": 0.28, "step": 1305 }, { "epoch": 2.7965738758029977, "grad_norm": 0.19260217036977118, "learning_rate": 3.7698412698412697e-06, "loss": 0.2795, "step": 1306 }, { "epoch": 2.7987152034261245, "grad_norm": 0.17281606530646676, "learning_rate": 3.7301587301587305e-06, "loss": 0.2694, "step": 1307 }, { "epoch": 2.8008565310492504, "grad_norm": 0.17227995573571817, "learning_rate": 3.690476190476191e-06, "loss": 0.2642, "step": 1308 }, { "epoch": 2.8029978586723767, "grad_norm": 0.17354690029249242, "learning_rate": 3.650793650793651e-06, "loss": 0.2743, "step": 1309 }, { "epoch": 2.805139186295503, "grad_norm": 0.18596920138748899, "learning_rate": 3.611111111111111e-06, "loss": 0.2666, "step": 1310 }, { "epoch": 2.8072805139186294, "grad_norm": 0.19988273525925654, "learning_rate": 3.5714285714285714e-06, "loss": 0.2939, "step": 1311 }, { "epoch": 2.809421841541756, "grad_norm": 0.18393678561262797, "learning_rate": 3.5317460317460317e-06, "loss": 0.2927, "step": 1312 }, { "epoch": 2.811563169164882, "grad_norm": 0.18018691120657057, "learning_rate": 3.4920634920634924e-06, "loss": 0.2664, "step": 1313 }, { "epoch": 2.8137044967880085, "grad_norm": 0.18530712590206283, "learning_rate": 3.4523809523809528e-06, "loss": 0.277, "step": 1314 }, { "epoch": 2.815845824411135, "grad_norm": 0.18226232243626736, "learning_rate": 3.4126984126984127e-06, "loss": 0.281, "step": 1315 }, { "epoch": 2.817987152034261, "grad_norm": 0.19031200668984619, "learning_rate": 3.373015873015873e-06, "loss": 0.2866, "step": 1316 }, { "epoch": 2.8201284796573876, "grad_norm": 0.1746390900856267, "learning_rate": 3.3333333333333333e-06, "loss": 0.2805, "step": 1317 }, { "epoch": 2.822269807280514, "grad_norm": 0.17409052183537008, "learning_rate": 3.293650793650794e-06, "loss": 0.2748, "step": 1318 }, { "epoch": 2.8244111349036403, "grad_norm": 0.1782862674279465, "learning_rate": 3.2539682539682544e-06, "loss": 0.271, "step": 1319 }, { "epoch": 2.8265524625267666, "grad_norm": 0.18225874553079588, "learning_rate": 3.2142857142857143e-06, "loss": 0.2602, "step": 1320 }, { "epoch": 2.828693790149893, "grad_norm": 0.17783389559399063, "learning_rate": 3.1746031746031746e-06, "loss": 0.2762, "step": 1321 }, { "epoch": 2.8308351177730193, "grad_norm": 0.17855207309001997, "learning_rate": 3.134920634920635e-06, "loss": 0.2872, "step": 1322 }, { "epoch": 2.8329764453961457, "grad_norm": 0.17935885821626021, "learning_rate": 3.0952380952380953e-06, "loss": 0.2794, "step": 1323 }, { "epoch": 2.835117773019272, "grad_norm": 0.18232652024256538, "learning_rate": 3.0555555555555556e-06, "loss": 0.2831, "step": 1324 }, { "epoch": 2.8372591006423984, "grad_norm": 0.17948962840127805, "learning_rate": 3.015873015873016e-06, "loss": 0.2776, "step": 1325 }, { "epoch": 2.8394004282655247, "grad_norm": 0.18464576865432494, "learning_rate": 2.9761904761904763e-06, "loss": 0.2941, "step": 1326 }, { "epoch": 2.841541755888651, "grad_norm": 0.17028939672073803, "learning_rate": 2.9365079365079366e-06, "loss": 0.2672, "step": 1327 }, { "epoch": 2.8436830835117775, "grad_norm": 0.16916196038993978, "learning_rate": 2.896825396825397e-06, "loss": 0.2741, "step": 1328 }, { "epoch": 2.8458244111349034, "grad_norm": 0.175350985222415, "learning_rate": 2.8571428571428573e-06, "loss": 0.2796, "step": 1329 }, { "epoch": 2.84796573875803, "grad_norm": 0.17515671194072407, "learning_rate": 2.8174603174603176e-06, "loss": 0.2775, "step": 1330 }, { "epoch": 2.850107066381156, "grad_norm": 0.17259697407853658, "learning_rate": 2.777777777777778e-06, "loss": 0.2688, "step": 1331 }, { "epoch": 2.852248394004283, "grad_norm": 0.1739659486779704, "learning_rate": 2.7380952380952382e-06, "loss": 0.2842, "step": 1332 }, { "epoch": 2.854389721627409, "grad_norm": 0.1897848574714334, "learning_rate": 2.6984126984126986e-06, "loss": 0.2736, "step": 1333 }, { "epoch": 2.8565310492505356, "grad_norm": 0.17285981110921295, "learning_rate": 2.658730158730159e-06, "loss": 0.2689, "step": 1334 }, { "epoch": 2.8586723768736615, "grad_norm": 0.18566863960917998, "learning_rate": 2.6190476190476192e-06, "loss": 0.2656, "step": 1335 }, { "epoch": 2.860813704496788, "grad_norm": 0.17490830341679464, "learning_rate": 2.5793650793650795e-06, "loss": 0.282, "step": 1336 }, { "epoch": 2.862955032119914, "grad_norm": 0.16847205946436153, "learning_rate": 2.53968253968254e-06, "loss": 0.2686, "step": 1337 }, { "epoch": 2.8650963597430406, "grad_norm": 0.167671663902774, "learning_rate": 2.5e-06, "loss": 0.2723, "step": 1338 }, { "epoch": 2.867237687366167, "grad_norm": 0.17505967809467687, "learning_rate": 2.4603174603174605e-06, "loss": 0.2741, "step": 1339 }, { "epoch": 2.8693790149892933, "grad_norm": 0.18340786011909083, "learning_rate": 2.420634920634921e-06, "loss": 0.2929, "step": 1340 }, { "epoch": 2.8715203426124196, "grad_norm": 0.17940501576635812, "learning_rate": 2.3809523809523808e-06, "loss": 0.2714, "step": 1341 }, { "epoch": 2.873661670235546, "grad_norm": 0.182484237285869, "learning_rate": 2.3412698412698415e-06, "loss": 0.275, "step": 1342 }, { "epoch": 2.8758029978586723, "grad_norm": 0.18020814887479553, "learning_rate": 2.301587301587302e-06, "loss": 0.2825, "step": 1343 }, { "epoch": 2.8779443254817987, "grad_norm": 0.17660806936645942, "learning_rate": 2.2619047619047617e-06, "loss": 0.285, "step": 1344 }, { "epoch": 2.880085653104925, "grad_norm": 0.17617046995854874, "learning_rate": 2.2222222222222225e-06, "loss": 0.2754, "step": 1345 }, { "epoch": 2.8822269807280514, "grad_norm": 0.1744513421687539, "learning_rate": 2.1825396825396824e-06, "loss": 0.2951, "step": 1346 }, { "epoch": 2.8843683083511777, "grad_norm": 0.17176452515074214, "learning_rate": 2.142857142857143e-06, "loss": 0.2668, "step": 1347 }, { "epoch": 2.886509635974304, "grad_norm": 0.17904296673736972, "learning_rate": 2.1031746031746035e-06, "loss": 0.2701, "step": 1348 }, { "epoch": 2.8886509635974305, "grad_norm": 0.17647016428567208, "learning_rate": 2.0634920634920634e-06, "loss": 0.2716, "step": 1349 }, { "epoch": 2.890792291220557, "grad_norm": 0.17235816276714014, "learning_rate": 2.023809523809524e-06, "loss": 0.265, "step": 1350 }, { "epoch": 2.892933618843683, "grad_norm": 0.18658165461489193, "learning_rate": 1.984126984126984e-06, "loss": 0.2799, "step": 1351 }, { "epoch": 2.8950749464668095, "grad_norm": 0.17186186390295335, "learning_rate": 1.9444444444444444e-06, "loss": 0.2714, "step": 1352 }, { "epoch": 2.897216274089936, "grad_norm": 0.17654423828340407, "learning_rate": 1.9047619047619051e-06, "loss": 0.2748, "step": 1353 }, { "epoch": 2.8993576017130622, "grad_norm": 0.17779611321463967, "learning_rate": 1.8650793650793652e-06, "loss": 0.2817, "step": 1354 }, { "epoch": 2.9014989293361886, "grad_norm": 0.16906995082417278, "learning_rate": 1.8253968253968256e-06, "loss": 0.2779, "step": 1355 }, { "epoch": 2.903640256959315, "grad_norm": 0.1795175517079968, "learning_rate": 1.7857142857142857e-06, "loss": 0.275, "step": 1356 }, { "epoch": 2.9057815845824413, "grad_norm": 0.24069812689827963, "learning_rate": 1.7460317460317462e-06, "loss": 0.2842, "step": 1357 }, { "epoch": 2.907922912205567, "grad_norm": 0.18261102094972528, "learning_rate": 1.7063492063492063e-06, "loss": 0.2863, "step": 1358 }, { "epoch": 2.910064239828694, "grad_norm": 0.17916467521323348, "learning_rate": 1.6666666666666667e-06, "loss": 0.2703, "step": 1359 }, { "epoch": 2.91220556745182, "grad_norm": 0.1684177935210992, "learning_rate": 1.6269841269841272e-06, "loss": 0.2633, "step": 1360 }, { "epoch": 2.9143468950749467, "grad_norm": 0.17482795526886843, "learning_rate": 1.5873015873015873e-06, "loss": 0.2733, "step": 1361 }, { "epoch": 2.9164882226980726, "grad_norm": 0.17189677626587546, "learning_rate": 1.5476190476190476e-06, "loss": 0.2786, "step": 1362 }, { "epoch": 2.9186295503211994, "grad_norm": 0.1793311737015449, "learning_rate": 1.507936507936508e-06, "loss": 0.3015, "step": 1363 }, { "epoch": 2.9207708779443253, "grad_norm": 0.1751443843257153, "learning_rate": 1.4682539682539683e-06, "loss": 0.2701, "step": 1364 }, { "epoch": 2.9229122055674517, "grad_norm": 0.17885380423708533, "learning_rate": 1.4285714285714286e-06, "loss": 0.2732, "step": 1365 }, { "epoch": 2.925053533190578, "grad_norm": 0.17570565566220792, "learning_rate": 1.388888888888889e-06, "loss": 0.2797, "step": 1366 }, { "epoch": 2.9271948608137044, "grad_norm": 0.1665971208193023, "learning_rate": 1.3492063492063493e-06, "loss": 0.2726, "step": 1367 }, { "epoch": 2.9293361884368307, "grad_norm": 0.18707229378818965, "learning_rate": 1.3095238095238096e-06, "loss": 0.3001, "step": 1368 }, { "epoch": 2.931477516059957, "grad_norm": 0.172875108848975, "learning_rate": 1.26984126984127e-06, "loss": 0.2817, "step": 1369 }, { "epoch": 2.9336188436830835, "grad_norm": 0.17022686353646632, "learning_rate": 1.2301587301587303e-06, "loss": 0.2776, "step": 1370 }, { "epoch": 2.93576017130621, "grad_norm": 0.17967995503693934, "learning_rate": 1.1904761904761904e-06, "loss": 0.2683, "step": 1371 }, { "epoch": 2.937901498929336, "grad_norm": 0.17708372843749126, "learning_rate": 1.150793650793651e-06, "loss": 0.2908, "step": 1372 }, { "epoch": 2.9400428265524625, "grad_norm": 0.16830115974661627, "learning_rate": 1.1111111111111112e-06, "loss": 0.2561, "step": 1373 }, { "epoch": 2.942184154175589, "grad_norm": 0.17929164665415462, "learning_rate": 1.0714285714285716e-06, "loss": 0.2714, "step": 1374 }, { "epoch": 2.9443254817987152, "grad_norm": 0.16840343711745376, "learning_rate": 1.0317460317460317e-06, "loss": 0.2671, "step": 1375 }, { "epoch": 2.9464668094218416, "grad_norm": 0.17040496165809252, "learning_rate": 9.92063492063492e-07, "loss": 0.2711, "step": 1376 }, { "epoch": 2.948608137044968, "grad_norm": 0.17750648853336623, "learning_rate": 9.523809523809526e-07, "loss": 0.2661, "step": 1377 }, { "epoch": 2.9507494646680943, "grad_norm": 0.17641859608757207, "learning_rate": 9.126984126984128e-07, "loss": 0.282, "step": 1378 }, { "epoch": 2.9528907922912206, "grad_norm": 0.18027367970879757, "learning_rate": 8.730158730158731e-07, "loss": 0.2795, "step": 1379 }, { "epoch": 2.955032119914347, "grad_norm": 0.1770116936691827, "learning_rate": 8.333333333333333e-07, "loss": 0.2846, "step": 1380 }, { "epoch": 2.9571734475374734, "grad_norm": 0.1718006942921136, "learning_rate": 7.936507936507937e-07, "loss": 0.2812, "step": 1381 }, { "epoch": 2.9593147751605997, "grad_norm": 0.1770273339812149, "learning_rate": 7.53968253968254e-07, "loss": 0.2896, "step": 1382 }, { "epoch": 2.961456102783726, "grad_norm": 0.17902070118474797, "learning_rate": 7.142857142857143e-07, "loss": 0.2978, "step": 1383 }, { "epoch": 2.9635974304068524, "grad_norm": 0.17533054931108796, "learning_rate": 6.746031746031746e-07, "loss": 0.2769, "step": 1384 }, { "epoch": 2.9657387580299783, "grad_norm": 0.16993493071735802, "learning_rate": 6.34920634920635e-07, "loss": 0.2827, "step": 1385 }, { "epoch": 2.967880085653105, "grad_norm": 0.17684972171074492, "learning_rate": 5.952380952380952e-07, "loss": 0.2917, "step": 1386 }, { "epoch": 2.970021413276231, "grad_norm": 0.17897804806977222, "learning_rate": 5.555555555555556e-07, "loss": 0.2834, "step": 1387 }, { "epoch": 2.972162740899358, "grad_norm": 0.19557715172028603, "learning_rate": 5.158730158730158e-07, "loss": 0.3017, "step": 1388 }, { "epoch": 2.9743040685224837, "grad_norm": 0.17627182121205032, "learning_rate": 4.761904761904763e-07, "loss": 0.2767, "step": 1389 }, { "epoch": 2.9764453961456105, "grad_norm": 0.17369058443780255, "learning_rate": 4.3650793650793655e-07, "loss": 0.2798, "step": 1390 }, { "epoch": 2.9785867237687365, "grad_norm": 0.1740941045189046, "learning_rate": 3.9682539682539683e-07, "loss": 0.2683, "step": 1391 }, { "epoch": 2.980728051391863, "grad_norm": 0.17915684223215836, "learning_rate": 3.5714285714285716e-07, "loss": 0.2873, "step": 1392 }, { "epoch": 2.982869379014989, "grad_norm": 0.1679332636215031, "learning_rate": 3.174603174603175e-07, "loss": 0.2745, "step": 1393 }, { "epoch": 2.9850107066381155, "grad_norm": 0.17729248198099018, "learning_rate": 2.777777777777778e-07, "loss": 0.2756, "step": 1394 }, { "epoch": 2.987152034261242, "grad_norm": 0.16814830396556607, "learning_rate": 2.3809523809523814e-07, "loss": 0.2713, "step": 1395 }, { "epoch": 2.9892933618843682, "grad_norm": 0.1765432649703903, "learning_rate": 1.9841269841269841e-07, "loss": 0.2875, "step": 1396 }, { "epoch": 2.9914346895074946, "grad_norm": 0.1741654729465487, "learning_rate": 1.5873015873015874e-07, "loss": 0.2909, "step": 1397 }, { "epoch": 2.993576017130621, "grad_norm": 0.1709774041473495, "learning_rate": 1.1904761904761907e-07, "loss": 0.2796, "step": 1398 }, { "epoch": 2.9957173447537473, "grad_norm": 0.1642668435658554, "learning_rate": 7.936507936507937e-08, "loss": 0.267, "step": 1399 }, { "epoch": 2.9978586723768736, "grad_norm": 0.17312311494944055, "learning_rate": 3.9682539682539686e-08, "loss": 0.2641, "step": 1400 }, { "epoch": 3.0, "grad_norm": 0.17221087745505198, "learning_rate": 0.0, "loss": 0.2679, "step": 1401 }, { "epoch": 3.0, "step": 1401, "total_flos": 1.5578375880118895e+19, "train_loss": 0.39975321637486494, "train_runtime": 116840.5239, "train_samples_per_second": 0.192, "train_steps_per_second": 0.012 } ], "logging_steps": 1, "max_steps": 1401, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5578375880118895e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }