diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -11,9819 +11,9819 @@ "log_history": [ { "epoch": 0.0021413276231263384, - "grad_norm": 56.592826307068876, + "grad_norm": 57.48312185632039, "learning_rate": 3.546099290780142e-07, - "loss": 3.0396, + "loss": 11.0536, "step": 1 }, { "epoch": 0.004282655246252677, - "grad_norm": 60.92994210864957, + "grad_norm": 56.04665199470871, "learning_rate": 7.092198581560284e-07, - "loss": 3.4422, + "loss": 11.108, "step": 2 }, { "epoch": 0.006423982869379015, - "grad_norm": 58.66990590000166, + "grad_norm": 57.69353550561418, "learning_rate": 1.0638297872340427e-06, - "loss": 3.2284, + "loss": 11.118, "step": 3 }, { "epoch": 0.008565310492505354, - "grad_norm": 57.90854902052254, + "grad_norm": 57.098115231260124, "learning_rate": 1.4184397163120568e-06, - "loss": 3.1414, + "loss": 11.0464, "step": 4 }, { "epoch": 0.010706638115631691, - "grad_norm": 58.311165642100505, + "grad_norm": 58.67499851336777, "learning_rate": 1.7730496453900712e-06, - "loss": 3.2059, + "loss": 10.9625, "step": 5 }, { "epoch": 0.01284796573875803, - "grad_norm": 56.80687951003449, + "grad_norm": 58.414650577956415, "learning_rate": 2.1276595744680853e-06, - "loss": 3.1799, + "loss": 10.9616, "step": 6 }, { "epoch": 0.014989293361884369, - "grad_norm": 47.7995425239194, + "grad_norm": 64.47207593228225, "learning_rate": 2.4822695035460995e-06, - "loss": 2.7636, + "loss": 10.6887, "step": 7 }, { "epoch": 0.017130620985010708, - "grad_norm": 49.227409613395615, + "grad_norm": 64.98938276175704, "learning_rate": 2.8368794326241136e-06, - "loss": 2.8918, + "loss": 10.6541, "step": 8 }, { "epoch": 0.019271948608137045, - "grad_norm": 31.314007438717645, + "grad_norm": 96.47909270749008, "learning_rate": 3.1914893617021277e-06, - "loss": 2.1702, + "loss": 9.209, "step": 9 }, { "epoch": 0.021413276231263382, - "grad_norm": 24.736226056643922, + "grad_norm": 111.61956792060187, "learning_rate": 3.5460992907801423e-06, - "loss": 1.9433, + "loss": 8.6722, "step": 10 }, { "epoch": 0.023554603854389723, - "grad_norm": 5.183793675647891, + "grad_norm": 66.79651103290082, "learning_rate": 3.9007092198581565e-06, - "loss": 1.2843, + "loss": 3.7251, "step": 11 }, { "epoch": 0.02569593147751606, - "grad_norm": 4.360295224010352, + "grad_norm": 56.39679177509825, "learning_rate": 4.255319148936171e-06, - "loss": 1.2718, + "loss": 3.2238, "step": 12 }, { "epoch": 0.027837259100642397, - "grad_norm": 3.516837280046668, + "grad_norm": 76.3878582360963, "learning_rate": 4.609929078014184e-06, - "loss": 1.1784, + "loss": 2.5959, "step": 13 }, { "epoch": 0.029978586723768737, - "grad_norm": 2.9770123258300263, + "grad_norm": 31.950352388440905, "learning_rate": 4.964539007092199e-06, - "loss": 1.2087, + "loss": 2.3433, "step": 14 }, { "epoch": 0.032119914346895075, - "grad_norm": 3.371301698123799, + "grad_norm": 6.951146815684312, "learning_rate": 5.319148936170213e-06, - "loss": 1.1044, + "loss": 1.4451, "step": 15 }, { "epoch": 0.034261241970021415, - "grad_norm": 2.4929900637756104, + "grad_norm": 4.818010610834237, "learning_rate": 5.673758865248227e-06, - "loss": 1.0305, + "loss": 1.294, "step": 16 }, { "epoch": 0.03640256959314775, - "grad_norm": 2.3245385930370683, + "grad_norm": 3.6480447594140033, "learning_rate": 6.028368794326241e-06, - "loss": 1.0328, + "loss": 1.256, "step": 17 }, { "epoch": 0.03854389721627409, - "grad_norm": 2.1494749257738106, + "grad_norm": 2.720810833620355, "learning_rate": 6.3829787234042555e-06, - "loss": 0.9676, + "loss": 1.1491, "step": 18 }, { "epoch": 0.04068522483940043, - "grad_norm": 1.7068403808367762, + "grad_norm": 2.19419575616829, "learning_rate": 6.73758865248227e-06, - "loss": 0.9568, + "loss": 1.1041, "step": 19 }, { "epoch": 0.042826552462526764, - "grad_norm": 1.3157208379186793, + "grad_norm": 1.5787733821950154, "learning_rate": 7.092198581560285e-06, - "loss": 0.8919, + "loss": 0.9964, "step": 20 }, { "epoch": 0.044967880085653104, - "grad_norm": 1.5755739795069426, + "grad_norm": 6.086448551226785, "learning_rate": 7.446808510638298e-06, - "loss": 0.8786, + "loss": 0.9374, "step": 21 }, { "epoch": 0.047109207708779445, - "grad_norm": 1.4066443822922425, + "grad_norm": 1.508606587040093, "learning_rate": 7.801418439716313e-06, - "loss": 0.8728, + "loss": 0.9276, "step": 22 }, { "epoch": 0.04925053533190578, - "grad_norm": 1.232711682082585, + "grad_norm": 1.0854043801940743, "learning_rate": 8.156028368794328e-06, - "loss": 0.8089, + "loss": 0.8555, "step": 23 }, { "epoch": 0.05139186295503212, - "grad_norm": 1.0965349661030956, + "grad_norm": 0.9443347844565788, "learning_rate": 8.510638297872341e-06, - "loss": 0.8199, + "loss": 0.8648, "step": 24 }, { "epoch": 0.05353319057815846, - "grad_norm": 0.779920509833393, + "grad_norm": 0.8257150817721549, "learning_rate": 8.865248226950355e-06, - "loss": 0.8012, + "loss": 0.8433, "step": 25 }, { "epoch": 0.055674518201284794, - "grad_norm": 0.7514108454773875, + "grad_norm": 0.7611313593085876, "learning_rate": 9.219858156028368e-06, - "loss": 0.7586, + "loss": 0.7927, "step": 26 }, { "epoch": 0.057815845824411134, - "grad_norm": 0.8086762191971834, + "grad_norm": 0.9175929091877931, "learning_rate": 9.574468085106383e-06, - "loss": 0.7487, + "loss": 0.7758, "step": 27 }, { "epoch": 0.059957173447537475, - "grad_norm": 0.8878919709734377, + "grad_norm": 0.6622602227048174, "learning_rate": 9.929078014184398e-06, - "loss": 0.7034, + "loss": 0.724, "step": 28 }, { "epoch": 0.06209850107066381, - "grad_norm": 0.829679313012889, + "grad_norm": 0.673388358753726, "learning_rate": 1.0283687943262411e-05, - "loss": 0.7474, + "loss": 0.7647, "step": 29 }, { "epoch": 0.06423982869379015, - "grad_norm": 0.6626872942809816, + "grad_norm": 0.6658973555598119, "learning_rate": 1.0638297872340426e-05, - "loss": 0.7349, + "loss": 0.7529, "step": 30 }, { "epoch": 0.06638115631691649, - "grad_norm": 0.6753492576166917, + "grad_norm": 0.7382952955966445, "learning_rate": 1.0992907801418441e-05, - "loss": 0.7171, + "loss": 0.7324, "step": 31 }, { "epoch": 0.06852248394004283, - "grad_norm": 0.6704627449665963, + "grad_norm": 0.6870316045369611, "learning_rate": 1.1347517730496454e-05, - "loss": 0.7126, + "loss": 0.7258, "step": 32 }, { "epoch": 0.07066381156316917, - "grad_norm": 0.5562388268523989, + "grad_norm": 0.5409451023446604, "learning_rate": 1.170212765957447e-05, - "loss": 0.6868, + "loss": 0.6995, "step": 33 }, { "epoch": 0.0728051391862955, - "grad_norm": 0.6223987828107421, + "grad_norm": 0.4718286692000531, "learning_rate": 1.2056737588652483e-05, - "loss": 0.6845, + "loss": 0.6937, "step": 34 }, { "epoch": 0.07494646680942184, - "grad_norm": 0.5778135109852265, + "grad_norm": 0.5560095137091816, "learning_rate": 1.2411347517730498e-05, - "loss": 0.6766, + "loss": 0.6849, "step": 35 }, { "epoch": 0.07708779443254818, - "grad_norm": 0.4742088939687226, + "grad_norm": 0.5519607067166765, "learning_rate": 1.2765957446808511e-05, - "loss": 0.6522, + "loss": 0.6615, "step": 36 }, { "epoch": 0.07922912205567452, - "grad_norm": 0.4554617236825081, + "grad_norm": 0.4650149562699858, "learning_rate": 1.3120567375886524e-05, - "loss": 0.6551, + "loss": 0.6624, "step": 37 }, { "epoch": 0.08137044967880086, - "grad_norm": 0.49432958372427405, + "grad_norm": 0.4222769877707214, "learning_rate": 1.347517730496454e-05, - "loss": 0.6351, + "loss": 0.6429, "step": 38 }, { "epoch": 0.0835117773019272, - "grad_norm": 0.440896644761752, + "grad_norm": 0.41323190280074, "learning_rate": 1.3829787234042554e-05, - "loss": 0.62, + "loss": 0.6267, "step": 39 }, { "epoch": 0.08565310492505353, - "grad_norm": 0.3847657975966914, + "grad_norm": 0.45734022776906186, "learning_rate": 1.418439716312057e-05, - "loss": 0.6449, + "loss": 0.6516, "step": 40 }, { "epoch": 0.08779443254817987, - "grad_norm": 0.361109158517152, + "grad_norm": 0.42800824849390673, "learning_rate": 1.4539007092198581e-05, - "loss": 0.6603, + "loss": 0.6654, "step": 41 }, { "epoch": 0.08993576017130621, - "grad_norm": 0.35567536401833566, + "grad_norm": 0.33157687702952515, "learning_rate": 1.4893617021276596e-05, - "loss": 0.6005, + "loss": 0.6061, "step": 42 }, { "epoch": 0.09207708779443255, - "grad_norm": 0.3656052483807157, + "grad_norm": 0.34740038476735874, "learning_rate": 1.5248226950354611e-05, - "loss": 0.6126, + "loss": 0.6166, "step": 43 }, { "epoch": 0.09421841541755889, - "grad_norm": 0.33705396501786156, + "grad_norm": 0.34140646940139696, "learning_rate": 1.5602836879432626e-05, - "loss": 0.6066, + "loss": 0.6102, "step": 44 }, { "epoch": 0.09635974304068523, - "grad_norm": 0.30303119503903936, + "grad_norm": 0.3292213866559836, "learning_rate": 1.595744680851064e-05, - "loss": 0.6171, + "loss": 0.6227, "step": 45 }, { "epoch": 0.09850107066381156, - "grad_norm": 0.2967613905439793, + "grad_norm": 0.30198419319828534, "learning_rate": 1.6312056737588656e-05, - "loss": 0.608, + "loss": 0.6107, "step": 46 }, { "epoch": 0.1006423982869379, - "grad_norm": 0.3767161988103767, + "grad_norm": 0.31063047872860655, "learning_rate": 1.6666666666666667e-05, - "loss": 0.6171, + "loss": 0.6188, "step": 47 }, { "epoch": 0.10278372591006424, - "grad_norm": 0.314876767137896, + "grad_norm": 0.30350185379074784, "learning_rate": 1.7021276595744682e-05, - "loss": 0.5786, + "loss": 0.5828, "step": 48 }, { "epoch": 0.10492505353319058, - "grad_norm": 0.2767623351226321, + "grad_norm": 0.33592357937468176, "learning_rate": 1.7375886524822697e-05, - "loss": 0.6053, + "loss": 0.6074, "step": 49 }, { "epoch": 0.10706638115631692, - "grad_norm": 0.29425544708240703, + "grad_norm": 0.2977049583100526, "learning_rate": 1.773049645390071e-05, - "loss": 0.5932, + "loss": 0.5956, "step": 50 }, { "epoch": 0.10920770877944326, - "grad_norm": 0.2867315363666657, + "grad_norm": 0.29548881007123645, "learning_rate": 1.8085106382978724e-05, - "loss": 0.5842, + "loss": 0.5875, "step": 51 }, { "epoch": 0.11134903640256959, - "grad_norm": 0.2585858531022653, + "grad_norm": 0.3157281891885196, "learning_rate": 1.8439716312056736e-05, - "loss": 0.5853, + "loss": 0.5875, "step": 52 }, { "epoch": 0.11349036402569593, - "grad_norm": 0.2655264874976756, + "grad_norm": 0.2866517435357405, "learning_rate": 1.879432624113475e-05, - "loss": 0.58, + "loss": 0.5827, "step": 53 }, { "epoch": 0.11563169164882227, - "grad_norm": 0.30355375200055923, + "grad_norm": 0.2849703271418174, "learning_rate": 1.9148936170212766e-05, - "loss": 0.5891, + "loss": 0.5903, "step": 54 }, { "epoch": 0.11777301927194861, - "grad_norm": 0.30060619559855645, + "grad_norm": 0.2883104724475601, "learning_rate": 1.950354609929078e-05, - "loss": 0.5657, + "loss": 0.566, "step": 55 }, { "epoch": 0.11991434689507495, - "grad_norm": 0.2544001308153972, + "grad_norm": 0.27848766799340263, "learning_rate": 1.9858156028368796e-05, - "loss": 0.6008, + "loss": 0.6028, "step": 56 }, { "epoch": 0.12205567451820129, - "grad_norm": 0.25996965782644, + "grad_norm": 0.27054084886113955, "learning_rate": 2.0212765957446807e-05, - "loss": 0.5703, + "loss": 0.5738, "step": 57 }, { "epoch": 0.12419700214132762, - "grad_norm": 0.2895469648849053, + "grad_norm": 0.28790406613678704, "learning_rate": 2.0567375886524822e-05, - "loss": 0.5817, + "loss": 0.5835, "step": 58 }, { "epoch": 0.12633832976445397, - "grad_norm": 0.25922509784815095, + "grad_norm": 0.2609950876468609, "learning_rate": 2.0921985815602837e-05, - "loss": 0.5665, + "loss": 0.5679, "step": 59 }, { "epoch": 0.1284796573875803, - "grad_norm": 0.26935959494463335, + "grad_norm": 0.22708508929335028, "learning_rate": 2.1276595744680852e-05, - "loss": 0.5634, + "loss": 0.5644, "step": 60 }, { "epoch": 0.13062098501070663, - "grad_norm": 0.2472890335094329, + "grad_norm": 0.29273734573536586, "learning_rate": 2.1631205673758867e-05, - "loss": 0.5489, + "loss": 0.5506, "step": 61 }, { "epoch": 0.13276231263383298, - "grad_norm": 0.25220403853856466, + "grad_norm": 0.25373390652035177, "learning_rate": 2.1985815602836882e-05, - "loss": 0.5555, + "loss": 0.5574, "step": 62 }, { "epoch": 0.1349036402569593, - "grad_norm": 0.25227853159521985, + "grad_norm": 0.24510687261458605, "learning_rate": 2.2340425531914894e-05, "loss": 0.5977, "step": 63 }, { "epoch": 0.13704496788008566, - "grad_norm": 0.22662601243259486, + "grad_norm": 0.24945740897429344, "learning_rate": 2.269503546099291e-05, - "loss": 0.5481, + "loss": 0.55, "step": 64 }, { "epoch": 0.139186295503212, - "grad_norm": 0.2462259567482875, + "grad_norm": 0.2366238459163537, "learning_rate": 2.3049645390070924e-05, - "loss": 0.5628, + "loss": 0.5645, "step": 65 }, { "epoch": 0.14132762312633834, - "grad_norm": 0.24332397161016522, + "grad_norm": 0.2671806828673314, "learning_rate": 2.340425531914894e-05, - "loss": 0.5553, + "loss": 0.5575, "step": 66 }, { "epoch": 0.14346895074946467, - "grad_norm": 0.23477707463573774, + "grad_norm": 0.2693204831683393, "learning_rate": 2.3758865248226954e-05, - "loss": 0.5621, + "loss": 0.5644, "step": 67 }, { "epoch": 0.145610278372591, - "grad_norm": 0.2500380042146856, + "grad_norm": 0.27274356761854357, "learning_rate": 2.4113475177304965e-05, - "loss": 0.5661, + "loss": 0.569, "step": 68 }, { "epoch": 0.14775160599571735, - "grad_norm": 0.26380612632753875, + "grad_norm": 0.2834119787499386, "learning_rate": 2.446808510638298e-05, - "loss": 0.5461, + "loss": 0.5489, "step": 69 }, { "epoch": 0.14989293361884368, - "grad_norm": 0.2557903047322243, + "grad_norm": 0.28557474380682984, "learning_rate": 2.4822695035460995e-05, - "loss": 0.5506, + "loss": 0.5516, "step": 70 }, { "epoch": 0.15203426124197003, - "grad_norm": 0.25820215490499565, + "grad_norm": 0.28955467692470377, "learning_rate": 2.5177304964539007e-05, - "loss": 0.5478, + "loss": 0.5489, "step": 71 }, { "epoch": 0.15417558886509636, - "grad_norm": 0.24758155471819995, + "grad_norm": 0.28586765508193035, "learning_rate": 2.5531914893617022e-05, - "loss": 0.5518, + "loss": 0.5531, "step": 72 }, { "epoch": 0.15631691648822268, - "grad_norm": 0.2415443149878719, + "grad_norm": 0.26463443393592084, "learning_rate": 2.5886524822695034e-05, - "loss": 0.5505, + "loss": 0.5524, "step": 73 }, { "epoch": 0.15845824411134904, - "grad_norm": 0.24823560199460706, + "grad_norm": 0.27995499634374477, "learning_rate": 2.624113475177305e-05, - "loss": 0.5665, + "loss": 0.569, "step": 74 }, { "epoch": 0.16059957173447537, - "grad_norm": 0.23378147422826043, + "grad_norm": 0.263684223704656, "learning_rate": 2.6595744680851064e-05, - "loss": 0.527, + "loss": 0.5287, "step": 75 }, { "epoch": 0.16274089935760172, - "grad_norm": 0.23375799268334566, + "grad_norm": 0.2919043059115593, "learning_rate": 2.695035460992908e-05, - "loss": 0.5522, + "loss": 0.5541, "step": 76 }, { "epoch": 0.16488222698072805, - "grad_norm": 0.27046995095800896, + "grad_norm": 0.2805459610716478, "learning_rate": 2.7304964539007094e-05, - "loss": 0.5317, + "loss": 0.5328, "step": 77 }, { "epoch": 0.1670235546038544, - "grad_norm": 0.22869648794405983, + "grad_norm": 0.2704831415044607, "learning_rate": 2.765957446808511e-05, - "loss": 0.5488, + "loss": 0.5506, "step": 78 }, { "epoch": 0.16916488222698073, - "grad_norm": 0.2144302125474379, + "grad_norm": 0.26497156122554943, "learning_rate": 2.8014184397163124e-05, - "loss": 0.536, + "loss": 0.5383, "step": 79 }, { "epoch": 0.17130620985010706, - "grad_norm": 0.2573958870247804, + "grad_norm": 0.26473030453104546, "learning_rate": 2.836879432624114e-05, - "loss": 0.5261, + "loss": 0.5278, "step": 80 }, { "epoch": 0.1734475374732334, - "grad_norm": 0.2613906182039433, + "grad_norm": 0.26833166887012466, "learning_rate": 2.8723404255319154e-05, - "loss": 0.5418, + "loss": 0.543, "step": 81 }, { "epoch": 0.17558886509635974, - "grad_norm": 0.2314728596697455, + "grad_norm": 0.2669805636364855, "learning_rate": 2.9078014184397162e-05, - "loss": 0.5258, + "loss": 0.5282, "step": 82 }, { "epoch": 0.1777301927194861, - "grad_norm": 0.255469146374909, + "grad_norm": 0.3149827479787504, "learning_rate": 2.9432624113475177e-05, - "loss": 0.5358, + "loss": 0.538, "step": 83 }, { "epoch": 0.17987152034261242, - "grad_norm": 0.269758850025105, + "grad_norm": 0.27006814672073254, "learning_rate": 2.9787234042553192e-05, - "loss": 0.5467, + "loss": 0.5468, "step": 84 }, { "epoch": 0.18201284796573874, - "grad_norm": 0.22108427720048224, + "grad_norm": 0.2526636958624785, "learning_rate": 3.0141843971631207e-05, - "loss": 0.5138, + "loss": 0.515, "step": 85 }, { "epoch": 0.1841541755888651, - "grad_norm": 0.2918049155831516, + "grad_norm": 0.3066822441172635, "learning_rate": 3.0496453900709222e-05, - "loss": 0.5358, + "loss": 0.5363, "step": 86 }, { "epoch": 0.18629550321199143, - "grad_norm": 0.252878303302326, + "grad_norm": 0.24592969798097147, "learning_rate": 3.085106382978723e-05, - "loss": 0.5164, + "loss": 0.5172, "step": 87 }, { "epoch": 0.18843683083511778, - "grad_norm": 0.28867004146757574, + "grad_norm": 0.26341288237609334, "learning_rate": 3.120567375886525e-05, - "loss": 0.54, + "loss": 0.5406, "step": 88 }, { "epoch": 0.1905781584582441, - "grad_norm": 0.30841177860368885, + "grad_norm": 0.29569701405093984, "learning_rate": 3.156028368794326e-05, - "loss": 0.5271, + "loss": 0.5286, "step": 89 }, { "epoch": 0.19271948608137046, - "grad_norm": 0.29805101294833053, + "grad_norm": 0.26720546438232595, "learning_rate": 3.191489361702128e-05, - "loss": 0.5186, + "loss": 0.5191, "step": 90 }, { "epoch": 0.1948608137044968, - "grad_norm": 0.29287452022732885, + "grad_norm": 0.2861008299317861, "learning_rate": 3.226950354609929e-05, - "loss": 0.5288, + "loss": 0.5292, "step": 91 }, { "epoch": 0.19700214132762311, - "grad_norm": 0.2892573734733911, + "grad_norm": 0.3127809337444568, "learning_rate": 3.262411347517731e-05, - "loss": 0.5325, + "loss": 0.5379, "step": 92 }, { "epoch": 0.19914346895074947, - "grad_norm": 0.241170324930527, + "grad_norm": 0.2584764301110668, "learning_rate": 3.2978723404255317e-05, - "loss": 0.5069, + "loss": 0.5066, "step": 93 }, { "epoch": 0.2012847965738758, - "grad_norm": 0.28618356988432964, + "grad_norm": 0.2780596173019734, "learning_rate": 3.3333333333333335e-05, - "loss": 0.528, + "loss": 0.5286, "step": 94 }, { "epoch": 0.20342612419700215, - "grad_norm": 0.2706127494448774, + "grad_norm": 0.3273661550437294, "learning_rate": 3.3687943262411347e-05, - "loss": 0.5137, + "loss": 0.5155, "step": 95 }, { "epoch": 0.20556745182012848, - "grad_norm": 0.2920030135213235, + "grad_norm": 0.3036241503506882, "learning_rate": 3.4042553191489365e-05, - "loss": 0.5279, + "loss": 0.5282, "step": 96 }, { "epoch": 0.20770877944325483, - "grad_norm": 0.2481402426704902, + "grad_norm": 0.275255132741623, "learning_rate": 3.4397163120567377e-05, - "loss": 0.5001, + "loss": 0.5018, "step": 97 }, { "epoch": 0.20985010706638116, - "grad_norm": 0.27424390116478947, + "grad_norm": 0.28093670646107644, "learning_rate": 3.4751773049645395e-05, - "loss": 0.4932, + "loss": 0.4945, "step": 98 }, { "epoch": 0.21199143468950749, - "grad_norm": 0.29648670562970614, + "grad_norm": 0.27959022964420993, "learning_rate": 3.5106382978723407e-05, - "loss": 0.529, + "loss": 0.5293, "step": 99 }, { "epoch": 0.21413276231263384, - "grad_norm": 0.2457515803836918, + "grad_norm": 0.2987862837213484, "learning_rate": 3.546099290780142e-05, - "loss": 0.5178, + "loss": 0.5184, "step": 100 }, { "epoch": 0.21627408993576017, - "grad_norm": 0.29926928267286307, + "grad_norm": 0.2856449444610249, "learning_rate": 3.5815602836879437e-05, - "loss": 0.5224, + "loss": 0.5226, "step": 101 }, { "epoch": 0.21841541755888652, - "grad_norm": 0.24321396233390155, + "grad_norm": 0.2968896630112509, "learning_rate": 3.617021276595745e-05, - "loss": 0.5296, + "loss": 0.5314, "step": 102 }, { "epoch": 0.22055674518201285, - "grad_norm": 0.30921888959439514, + "grad_norm": 0.30586641123740416, "learning_rate": 3.6524822695035466e-05, - "loss": 0.5179, + "loss": 0.5195, "step": 103 }, { "epoch": 0.22269807280513917, - "grad_norm": 0.272063115950373, + "grad_norm": 0.28057509983972945, "learning_rate": 3.687943262411347e-05, - "loss": 0.5283, + "loss": 0.5287, "step": 104 }, { "epoch": 0.22483940042826553, - "grad_norm": 0.26613186863197535, + "grad_norm": 0.2752404260641325, "learning_rate": 3.723404255319149e-05, - "loss": 0.5242, + "loss": 0.525, "step": 105 }, { "epoch": 0.22698072805139186, - "grad_norm": 0.33500951074808644, + "grad_norm": 0.28309242710377003, "learning_rate": 3.75886524822695e-05, - "loss": 0.5174, + "loss": 0.5178, "step": 106 }, { "epoch": 0.2291220556745182, - "grad_norm": 0.2643772639229429, + "grad_norm": 0.2986830233079882, "learning_rate": 3.794326241134752e-05, - "loss": 0.5343, + "loss": 0.5338, "step": 107 }, { "epoch": 0.23126338329764454, - "grad_norm": 0.31115272338829114, + "grad_norm": 0.27191322829490466, "learning_rate": 3.829787234042553e-05, - "loss": 0.5052, + "loss": 0.5064, "step": 108 }, { "epoch": 0.2334047109207709, - "grad_norm": 0.2926558130416796, + "grad_norm": 0.3080545126102214, "learning_rate": 3.865248226950355e-05, - "loss": 0.5137, + "loss": 0.515, "step": 109 }, { "epoch": 0.23554603854389722, - "grad_norm": 0.3255536325363357, + "grad_norm": 0.31855156829294484, "learning_rate": 3.900709219858156e-05, - "loss": 0.5279, + "loss": 0.5267, "step": 110 }, { "epoch": 0.23768736616702354, - "grad_norm": 0.33989084909132006, + "grad_norm": 0.3118610221875637, "learning_rate": 3.936170212765958e-05, - "loss": 0.4988, + "loss": 0.4993, "step": 111 }, { "epoch": 0.2398286937901499, - "grad_norm": 0.35381236194602544, + "grad_norm": 0.3563490136500225, "learning_rate": 3.971631205673759e-05, - "loss": 0.4937, + "loss": 0.494, "step": 112 }, { "epoch": 0.24197002141327623, - "grad_norm": 0.3510604214761894, + "grad_norm": 0.33491792784430063, "learning_rate": 4.007092198581561e-05, - "loss": 0.5178, + "loss": 0.5166, "step": 113 }, { "epoch": 0.24411134903640258, - "grad_norm": 0.2974753747795791, + "grad_norm": 0.3107234877761307, "learning_rate": 4.0425531914893614e-05, - "loss": 0.5006, + "loss": 0.5016, "step": 114 }, { "epoch": 0.2462526766595289, - "grad_norm": 0.3391358705953081, + "grad_norm": 0.3574327616039191, "learning_rate": 4.078014184397163e-05, - "loss": 0.4991, + "loss": 0.4997, "step": 115 }, { "epoch": 0.24839400428265523, - "grad_norm": 0.3244454146866931, + "grad_norm": 0.31250608779187283, "learning_rate": 4.1134751773049644e-05, - "loss": 0.4992, + "loss": 0.5012, "step": 116 }, { "epoch": 0.2505353319057816, - "grad_norm": 0.29806045444607565, + "grad_norm": 0.31466096005113264, "learning_rate": 4.148936170212766e-05, - "loss": 0.4879, + "loss": 0.4898, "step": 117 }, { "epoch": 0.25267665952890794, - "grad_norm": 0.34001472132830457, + "grad_norm": 0.35023750633522854, "learning_rate": 4.1843971631205674e-05, - "loss": 0.5142, + "loss": 0.5151, "step": 118 }, { "epoch": 0.25481798715203424, - "grad_norm": 0.3147825744118216, + "grad_norm": 0.3055990330623537, "learning_rate": 4.219858156028369e-05, - "loss": 0.4975, + "loss": 0.4976, "step": 119 }, { "epoch": 0.2569593147751606, - "grad_norm": 0.343655564131983, + "grad_norm": 0.3558935694444545, "learning_rate": 4.2553191489361704e-05, - "loss": 0.5169, + "loss": 0.517, "step": 120 }, { "epoch": 0.25910064239828695, - "grad_norm": 0.3165564039834663, + "grad_norm": 0.34915884790434, "learning_rate": 4.2907801418439716e-05, - "loss": 0.5038, + "loss": 0.5044, "step": 121 }, { "epoch": 0.26124197002141325, - "grad_norm": 0.3421038891467151, + "grad_norm": 0.348632342188165, "learning_rate": 4.3262411347517734e-05, - "loss": 0.5044, + "loss": 0.505, "step": 122 }, { "epoch": 0.2633832976445396, - "grad_norm": 0.3236382619911156, + "grad_norm": 0.3785527848062378, "learning_rate": 4.3617021276595746e-05, - "loss": 0.5077, + "loss": 0.5085, "step": 123 }, { "epoch": 0.26552462526766596, - "grad_norm": 0.3482419500485224, + "grad_norm": 0.37162644733897265, "learning_rate": 4.3971631205673764e-05, - "loss": 0.4822, + "loss": 0.4833, "step": 124 }, { "epoch": 0.2676659528907923, - "grad_norm": 0.3967980846994073, + "grad_norm": 0.3980723109877404, "learning_rate": 4.432624113475177e-05, - "loss": 0.4947, + "loss": 0.4955, "step": 125 }, { "epoch": 0.2698072805139186, - "grad_norm": 0.29467264961379613, + "grad_norm": 0.3725438131171889, "learning_rate": 4.468085106382979e-05, - "loss": 0.4871, + "loss": 0.4879, "step": 126 }, { "epoch": 0.27194860813704497, - "grad_norm": 0.35596207729536755, + "grad_norm": 0.3546366551258052, "learning_rate": 4.50354609929078e-05, - "loss": 0.4831, + "loss": 0.485, "step": 127 }, { "epoch": 0.2740899357601713, - "grad_norm": 0.37395057401049897, + "grad_norm": 0.3352604060765713, "learning_rate": 4.539007092198582e-05, - "loss": 0.5122, + "loss": 0.5125, "step": 128 }, { "epoch": 0.2762312633832976, - "grad_norm": 0.28605544378420467, + "grad_norm": 0.38902168152696476, "learning_rate": 4.574468085106383e-05, - "loss": 0.4898, + "loss": 0.4923, "step": 129 }, { "epoch": 0.278372591006424, - "grad_norm": 0.3331655327731713, + "grad_norm": 0.4722633165183755, "learning_rate": 4.609929078014185e-05, - "loss": 0.4912, + "loss": 0.4933, "step": 130 }, { "epoch": 0.28051391862955033, - "grad_norm": 0.29374454897662605, + "grad_norm": 0.40974642024145574, "learning_rate": 4.645390070921986e-05, - "loss": 0.4814, + "loss": 0.4823, "step": 131 }, { "epoch": 0.2826552462526767, - "grad_norm": 0.30113962440816366, + "grad_norm": 0.34213507876160526, "learning_rate": 4.680851063829788e-05, - "loss": 0.4856, + "loss": 0.4865, "step": 132 }, { "epoch": 0.284796573875803, - "grad_norm": 0.30027399950187866, + "grad_norm": 0.3961185410637184, "learning_rate": 4.716312056737589e-05, - "loss": 0.4852, + "loss": 0.4865, "step": 133 }, { "epoch": 0.28693790149892934, - "grad_norm": 0.3230425204774485, + "grad_norm": 0.37000080466083957, "learning_rate": 4.751773049645391e-05, - "loss": 0.4903, + "loss": 0.4916, "step": 134 }, { "epoch": 0.2890792291220557, - "grad_norm": 0.31234670755297617, + "grad_norm": 0.37414319777051985, "learning_rate": 4.787234042553192e-05, - "loss": 0.5041, + "loss": 0.5044, "step": 135 }, { "epoch": 0.291220556745182, - "grad_norm": 0.30804022118995605, + "grad_norm": 0.3164774177894217, "learning_rate": 4.822695035460993e-05, - "loss": 0.4818, + "loss": 0.4811, "step": 136 }, { "epoch": 0.29336188436830835, - "grad_norm": 0.3524834300849378, + "grad_norm": 0.47067620082781225, "learning_rate": 4.858156028368794e-05, - "loss": 0.4935, + "loss": 0.4945, "step": 137 }, { "epoch": 0.2955032119914347, - "grad_norm": 0.3743768713148858, + "grad_norm": 0.4294475664764798, "learning_rate": 4.893617021276596e-05, - "loss": 0.5098, + "loss": 0.5094, "step": 138 }, { "epoch": 0.29764453961456105, - "grad_norm": 0.34028691668800937, + "grad_norm": 0.3040028099316697, "learning_rate": 4.929078014184397e-05, - "loss": 0.4935, + "loss": 0.4942, "step": 139 }, { "epoch": 0.29978586723768735, - "grad_norm": 0.3432662688866466, + "grad_norm": 0.37748057667062174, "learning_rate": 4.964539007092199e-05, - "loss": 0.4906, + "loss": 0.4905, "step": 140 }, { "epoch": 0.3019271948608137, - "grad_norm": 0.3484159957729913, + "grad_norm": 0.2989419262617704, "learning_rate": 5e-05, - "loss": 0.4916, + "loss": 0.4905, "step": 141 }, { "epoch": 0.30406852248394006, - "grad_norm": 0.2906011601786296, + "grad_norm": 0.3871243075644112, "learning_rate": 4.996031746031746e-05, - "loss": 0.4851, + "loss": 0.4852, "step": 142 }, { "epoch": 0.30620985010706636, - "grad_norm": 0.31602752642817156, + "grad_norm": 0.29602921563211776, "learning_rate": 4.9920634920634924e-05, - "loss": 0.4804, + "loss": 0.4803, "step": 143 }, { "epoch": 0.3083511777301927, - "grad_norm": 0.33931054385910697, + "grad_norm": 0.48126987646952557, "learning_rate": 4.9880952380952385e-05, - "loss": 0.4956, + "loss": 0.4963, "step": 144 }, { "epoch": 0.31049250535331907, - "grad_norm": 0.2673250220712651, + "grad_norm": 0.3271921917836504, "learning_rate": 4.9841269841269845e-05, - "loss": 0.474, + "loss": 0.4739, "step": 145 }, { "epoch": 0.31263383297644537, - "grad_norm": 0.27590244134076686, + "grad_norm": 0.30962361825644874, "learning_rate": 4.9801587301587306e-05, - "loss": 0.4652, + "loss": 0.4659, "step": 146 }, { "epoch": 0.3147751605995717, - "grad_norm": 0.29339636329684166, + "grad_norm": 0.2862717141303497, "learning_rate": 4.976190476190477e-05, - "loss": 0.4804, + "loss": 0.4805, "step": 147 }, { "epoch": 0.3169164882226981, - "grad_norm": 0.2842194678371775, + "grad_norm": 0.29165213355742037, "learning_rate": 4.972222222222223e-05, - "loss": 0.4884, + "loss": 0.4893, "step": 148 }, { "epoch": 0.31905781584582443, - "grad_norm": 0.3177841927362518, + "grad_norm": 0.2837190882523078, "learning_rate": 4.968253968253969e-05, - "loss": 0.4887, + "loss": 0.4877, "step": 149 }, { "epoch": 0.32119914346895073, - "grad_norm": 0.362994905525864, + "grad_norm": 0.3162679943468195, "learning_rate": 4.964285714285715e-05, - "loss": 0.4957, + "loss": 0.4949, "step": 150 }, { "epoch": 0.3233404710920771, - "grad_norm": 0.3134548549120884, + "grad_norm": 0.3166915931837597, "learning_rate": 4.960317460317461e-05, - "loss": 0.4746, + "loss": 0.4745, "step": 151 }, { "epoch": 0.32548179871520344, - "grad_norm": 0.33022141318925424, + "grad_norm": 0.2867036124644211, "learning_rate": 4.956349206349207e-05, - "loss": 0.4836, + "loss": 0.4827, "step": 152 }, { "epoch": 0.32762312633832974, - "grad_norm": 0.2841374060809414, + "grad_norm": 0.3127471586658351, "learning_rate": 4.9523809523809525e-05, - "loss": 0.4824, + "loss": 0.4817, "step": 153 }, { "epoch": 0.3297644539614561, - "grad_norm": 0.3042451933872161, + "grad_norm": 0.28863042358499497, "learning_rate": 4.9484126984126985e-05, - "loss": 0.4947, + "loss": 0.4944, "step": 154 }, { "epoch": 0.33190578158458245, - "grad_norm": 0.3081245822778917, + "grad_norm": 0.30635024454971044, "learning_rate": 4.9444444444444446e-05, "loss": 0.4882, "step": 155 }, { "epoch": 0.3340471092077088, - "grad_norm": 0.2888403289180772, + "grad_norm": 0.25406638929643754, "learning_rate": 4.940476190476191e-05, "loss": 0.4583, "step": 156 }, { "epoch": 0.3361884368308351, - "grad_norm": 0.49151318304484476, + "grad_norm": 0.4236621872965728, "learning_rate": 4.936507936507937e-05, - "loss": 0.5023, + "loss": 0.5011, "step": 157 }, { "epoch": 0.33832976445396146, - "grad_norm": 0.33926947413240066, + "grad_norm": 0.33240414924657075, "learning_rate": 4.932539682539683e-05, - "loss": 0.5, + "loss": 0.4984, "step": 158 }, { "epoch": 0.3404710920770878, - "grad_norm": 0.3432037588535958, + "grad_norm": 0.27992076972800195, "learning_rate": 4.928571428571429e-05, - "loss": 0.4892, + "loss": 0.4882, "step": 159 }, { "epoch": 0.3426124197002141, - "grad_norm": 0.33357704996030413, + "grad_norm": 0.3378573050357272, "learning_rate": 4.924603174603175e-05, - "loss": 0.483, + "loss": 0.4821, "step": 160 }, { "epoch": 0.34475374732334046, - "grad_norm": 0.3455404282494954, + "grad_norm": 0.3116481418117616, "learning_rate": 4.9206349206349204e-05, - "loss": 0.502, + "loss": 0.501, "step": 161 }, { "epoch": 0.3468950749464668, - "grad_norm": 0.3799400968283991, + "grad_norm": 0.30756617491799604, "learning_rate": 4.9166666666666665e-05, - "loss": 0.5005, + "loss": 0.499, "step": 162 }, { "epoch": 0.3490364025695932, - "grad_norm": 0.28633882509130054, + "grad_norm": 0.3501139975577791, "learning_rate": 4.9126984126984125e-05, - "loss": 0.5012, + "loss": 0.4998, "step": 163 }, { "epoch": 0.3511777301927195, - "grad_norm": 0.3692568848737784, + "grad_norm": 0.30095198583101096, "learning_rate": 4.9087301587301586e-05, - "loss": 0.4773, + "loss": 0.4772, "step": 164 }, { "epoch": 0.3533190578158458, - "grad_norm": 0.28424265463523657, + "grad_norm": 0.29146076836363266, "learning_rate": 4.904761904761905e-05, - "loss": 0.4933, + "loss": 0.4932, "step": 165 }, { "epoch": 0.3554603854389722, - "grad_norm": 0.3433785169927177, + "grad_norm": 0.3245937860525494, "learning_rate": 4.900793650793651e-05, - "loss": 0.4708, + "loss": 0.4717, "step": 166 }, { "epoch": 0.3576017130620985, - "grad_norm": 0.322472121983474, + "grad_norm": 0.27276729093930885, "learning_rate": 4.896825396825397e-05, - "loss": 0.484, + "loss": 0.485, "step": 167 }, { "epoch": 0.35974304068522484, - "grad_norm": 0.37196647339980105, + "grad_norm": 0.36020649417508827, "learning_rate": 4.892857142857143e-05, - "loss": 0.4872, + "loss": 0.4873, "step": 168 }, { "epoch": 0.3618843683083512, - "grad_norm": 0.3144695427706762, + "grad_norm": 0.2918070400385983, "learning_rate": 4.888888888888889e-05, - "loss": 0.4909, + "loss": 0.4905, "step": 169 }, { "epoch": 0.3640256959314775, - "grad_norm": 0.3399481737215869, + "grad_norm": 0.33570775396857644, "learning_rate": 4.884920634920635e-05, - "loss": 0.4851, + "loss": 0.4852, "step": 170 }, { "epoch": 0.36616702355460384, - "grad_norm": 0.30151182574451896, + "grad_norm": 0.3134384968531535, "learning_rate": 4.880952380952381e-05, - "loss": 0.4772, + "loss": 0.477, "step": 171 }, { "epoch": 0.3683083511777302, - "grad_norm": 0.3128577892502514, + "grad_norm": 0.31510450944034885, "learning_rate": 4.876984126984127e-05, - "loss": 0.4848, + "loss": 0.4851, "step": 172 }, { "epoch": 0.37044967880085655, - "grad_norm": 0.31922897506045883, + "grad_norm": 0.3154584471333162, "learning_rate": 4.873015873015873e-05, - "loss": 0.4986, + "loss": 0.4974, "step": 173 }, { "epoch": 0.37259100642398285, - "grad_norm": 0.3017606389198961, + "grad_norm": 0.2909626760317359, "learning_rate": 4.8690476190476194e-05, - "loss": 0.4828, + "loss": 0.4837, "step": 174 }, { "epoch": 0.3747323340471092, - "grad_norm": 0.3000491473588088, + "grad_norm": 0.3647746614109793, "learning_rate": 4.8650793650793654e-05, - "loss": 0.4752, + "loss": 0.4766, "step": 175 }, { "epoch": 0.37687366167023556, - "grad_norm": 0.3892869618739201, + "grad_norm": 0.2994221837641762, "learning_rate": 4.8611111111111115e-05, - "loss": 0.4892, + "loss": 0.4889, "step": 176 }, { "epoch": 0.37901498929336186, - "grad_norm": 0.29614553607421257, + "grad_norm": 0.3352657823786708, "learning_rate": 4.8571428571428576e-05, - "loss": 0.5018, + "loss": 0.5019, "step": 177 }, { "epoch": 0.3811563169164882, - "grad_norm": 0.3790956924695203, + "grad_norm": 0.3916874320487166, "learning_rate": 4.853174603174604e-05, "loss": 0.5092, "step": 178 }, { "epoch": 0.38329764453961457, - "grad_norm": 0.3316734586371452, + "grad_norm": 0.29941263170579285, "learning_rate": 4.84920634920635e-05, - "loss": 0.482, + "loss": 0.4803, "step": 179 }, { "epoch": 0.3854389721627409, - "grad_norm": 0.3404880131426743, + "grad_norm": 0.32217426505709545, "learning_rate": 4.845238095238095e-05, - "loss": 0.4981, + "loss": 0.4984, "step": 180 }, { "epoch": 0.3875802997858672, - "grad_norm": 0.2900095290587479, + "grad_norm": 0.3032060935319803, "learning_rate": 4.841269841269841e-05, - "loss": 0.4668, + "loss": 0.4673, "step": 181 }, { "epoch": 0.3897216274089936, - "grad_norm": 0.2922900867382386, + "grad_norm": 0.2726502791688811, "learning_rate": 4.837301587301587e-05, - "loss": 0.4667, + "loss": 0.4664, "step": 182 }, { "epoch": 0.39186295503211993, - "grad_norm": 0.3181046448048784, + "grad_norm": 0.2971786378502053, "learning_rate": 4.8333333333333334e-05, - "loss": 0.4804, + "loss": 0.4798, "step": 183 }, { "epoch": 0.39400428265524623, - "grad_norm": 0.27687702877774883, + "grad_norm": 0.2865455200314379, "learning_rate": 4.8293650793650794e-05, "loss": 0.4775, "step": 184 }, { "epoch": 0.3961456102783726, - "grad_norm": 0.2836321426330953, + "grad_norm": 0.2547381179115949, "learning_rate": 4.8253968253968255e-05, - "loss": 0.4725, + "loss": 0.473, "step": 185 }, { "epoch": 0.39828693790149894, - "grad_norm": 0.2913866873418313, + "grad_norm": 0.2970502284932418, "learning_rate": 4.8214285714285716e-05, - "loss": 0.4775, + "loss": 0.4777, "step": 186 }, { "epoch": 0.4004282655246253, - "grad_norm": 0.30835624032708503, + "grad_norm": 0.3081755326163523, "learning_rate": 4.817460317460318e-05, - "loss": 0.4878, + "loss": 0.4884, "step": 187 }, { "epoch": 0.4025695931477516, - "grad_norm": 0.29588677700901966, + "grad_norm": 0.2988953377132881, "learning_rate": 4.813492063492064e-05, - "loss": 0.4732, + "loss": 0.4721, "step": 188 }, { "epoch": 0.40471092077087795, - "grad_norm": 0.30881963199698537, + "grad_norm": 0.32840372938982254, "learning_rate": 4.80952380952381e-05, - "loss": 0.4598, + "loss": 0.4592, "step": 189 }, { "epoch": 0.4068522483940043, - "grad_norm": 0.30298345716545283, + "grad_norm": 0.27671818316763175, "learning_rate": 4.805555555555556e-05, - "loss": 0.4717, + "loss": 0.4712, "step": 190 }, { "epoch": 0.4089935760171306, - "grad_norm": 0.35885982262554333, + "grad_norm": 0.3246160797192649, "learning_rate": 4.801587301587302e-05, "loss": 0.4752, "step": 191 }, { "epoch": 0.41113490364025695, - "grad_norm": 0.3004713627687736, + "grad_norm": 0.3566765772926943, "learning_rate": 4.797619047619048e-05, - "loss": 0.4674, + "loss": 0.4672, "step": 192 }, { "epoch": 0.4132762312633833, - "grad_norm": 0.29124273016906205, + "grad_norm": 0.26828922214902995, "learning_rate": 4.793650793650794e-05, - "loss": 0.4883, + "loss": 0.4878, "step": 193 }, { "epoch": 0.41541755888650966, - "grad_norm": 0.28170083463492585, + "grad_norm": 0.3550040895109434, "learning_rate": 4.78968253968254e-05, - "loss": 0.4808, + "loss": 0.48, "step": 194 }, { "epoch": 0.41755888650963596, - "grad_norm": 0.2930248614401525, + "grad_norm": 0.2838693253149512, "learning_rate": 4.785714285714286e-05, - "loss": 0.4614, + "loss": 0.4611, "step": 195 }, { "epoch": 0.4197002141327623, - "grad_norm": 0.3725197913239573, + "grad_norm": 0.39397907047263425, "learning_rate": 4.781746031746032e-05, - "loss": 0.466, + "loss": 0.4655, "step": 196 }, { "epoch": 0.42184154175588867, - "grad_norm": 0.3173770734521219, + "grad_norm": 0.26305224728667986, "learning_rate": 4.7777777777777784e-05, - "loss": 0.4839, + "loss": 0.4841, "step": 197 }, { "epoch": 0.42398286937901497, - "grad_norm": 0.31792822631775125, + "grad_norm": 0.325701418774634, "learning_rate": 4.7738095238095245e-05, - "loss": 0.4604, + "loss": 0.4603, "step": 198 }, { "epoch": 0.4261241970021413, - "grad_norm": 0.4547994524865786, + "grad_norm": 0.442210465125467, "learning_rate": 4.7698412698412706e-05, - "loss": 0.458, + "loss": 0.4599, "step": 199 }, { "epoch": 0.4282655246252677, - "grad_norm": 0.3200904129332156, + "grad_norm": 0.30449934938572704, "learning_rate": 4.7658730158730166e-05, - "loss": 0.4722, + "loss": 0.4715, "step": 200 }, { "epoch": 0.430406852248394, - "grad_norm": 0.33843214154215645, + "grad_norm": 0.30205346239976966, "learning_rate": 4.761904761904762e-05, - "loss": 0.4841, + "loss": 0.4832, "step": 201 }, { "epoch": 0.43254817987152033, - "grad_norm": 0.29671855926132695, + "grad_norm": 0.29056082365428426, "learning_rate": 4.757936507936508e-05, - "loss": 0.4896, + "loss": 0.489, "step": 202 }, { "epoch": 0.4346895074946467, - "grad_norm": 0.31070750776887823, + "grad_norm": 0.36617244189716913, "learning_rate": 4.753968253968254e-05, - "loss": 0.4714, + "loss": 0.4716, "step": 203 }, { "epoch": 0.43683083511777304, - "grad_norm": 0.3305729189717249, + "grad_norm": 0.2589569319503199, "learning_rate": 4.75e-05, - "loss": 0.4553, + "loss": 0.4533, "step": 204 }, { "epoch": 0.43897216274089934, - "grad_norm": 0.30161254779141056, + "grad_norm": 0.3693939526209848, "learning_rate": 4.746031746031746e-05, - "loss": 0.4771, + "loss": 0.477, "step": 205 }, { "epoch": 0.4411134903640257, - "grad_norm": 0.4165722704133656, + "grad_norm": 0.32847584920064604, "learning_rate": 4.7420634920634924e-05, - "loss": 0.4941, + "loss": 0.4928, "step": 206 }, { "epoch": 0.44325481798715205, - "grad_norm": 0.3098035646663218, + "grad_norm": 0.44099566221142933, "learning_rate": 4.738095238095238e-05, - "loss": 0.4562, + "loss": 0.4556, "step": 207 }, { "epoch": 0.44539614561027835, - "grad_norm": 0.31217832469780865, + "grad_norm": 0.2472596663917096, "learning_rate": 4.734126984126984e-05, - "loss": 0.4593, + "loss": 0.4589, "step": 208 }, { "epoch": 0.4475374732334047, - "grad_norm": 0.33303037422629134, + "grad_norm": 0.4006991143128524, "learning_rate": 4.73015873015873e-05, - "loss": 0.4695, + "loss": 0.469, "step": 209 }, { "epoch": 0.44967880085653106, - "grad_norm": 0.3164429971126553, + "grad_norm": 0.2851927359912657, "learning_rate": 4.726190476190476e-05, - "loss": 0.4787, + "loss": 0.4766, "step": 210 }, { "epoch": 0.4518201284796574, - "grad_norm": 0.3667199010122267, + "grad_norm": 0.3365161124957976, "learning_rate": 4.722222222222222e-05, - "loss": 0.4741, + "loss": 0.4723, "step": 211 }, { "epoch": 0.4539614561027837, - "grad_norm": 0.32958761965134165, + "grad_norm": 0.34565226910942354, "learning_rate": 4.718253968253968e-05, - "loss": 0.476, + "loss": 0.4758, "step": 212 }, { "epoch": 0.45610278372591007, - "grad_norm": 0.3847263613923873, + "grad_norm": 0.27344513413865157, "learning_rate": 4.714285714285714e-05, - "loss": 0.4699, + "loss": 0.4679, "step": 213 }, { "epoch": 0.4582441113490364, - "grad_norm": 0.2955982753193451, + "grad_norm": 0.38311587773024736, "learning_rate": 4.71031746031746e-05, - "loss": 0.4799, + "loss": 0.48, "step": 214 }, { "epoch": 0.4603854389721627, - "grad_norm": 0.38809540646072666, + "grad_norm": 0.34364682832024224, "learning_rate": 4.7063492063492064e-05, - "loss": 0.4783, + "loss": 0.4782, "step": 215 }, { "epoch": 0.4625267665952891, - "grad_norm": 0.33351402005149655, + "grad_norm": 0.3914058243306043, "learning_rate": 4.7023809523809525e-05, - "loss": 0.4671, + "loss": 0.4661, "step": 216 }, { "epoch": 0.46466809421841543, - "grad_norm": 0.3544022577957815, + "grad_norm": 0.29870287943266227, "learning_rate": 4.6984126984126986e-05, - "loss": 0.4634, + "loss": 0.4638, "step": 217 }, { "epoch": 0.4668094218415418, - "grad_norm": 0.3440682763213123, + "grad_norm": 0.4094058032238448, "learning_rate": 4.6944444444444446e-05, - "loss": 0.4606, + "loss": 0.46, "step": 218 }, { "epoch": 0.4689507494646681, - "grad_norm": 0.35994592001280135, + "grad_norm": 0.3350045536223048, "learning_rate": 4.690476190476191e-05, - "loss": 0.4687, + "loss": 0.4679, "step": 219 }, { "epoch": 0.47109207708779444, - "grad_norm": 0.2792969038742562, + "grad_norm": 0.3512288511765975, "learning_rate": 4.686507936507937e-05, - "loss": 0.4675, + "loss": 0.4685, "step": 220 }, { "epoch": 0.4732334047109208, - "grad_norm": 0.3167594669644963, + "grad_norm": 0.3378187173840975, "learning_rate": 4.682539682539683e-05, - "loss": 0.447, + "loss": 0.4465, "step": 221 }, { "epoch": 0.4753747323340471, - "grad_norm": 0.28698974715014, + "grad_norm": 0.3038250915138398, "learning_rate": 4.678571428571429e-05, - "loss": 0.4824, + "loss": 0.4821, "step": 222 }, { "epoch": 0.47751605995717344, - "grad_norm": 0.2919731601075355, + "grad_norm": 0.29703824475925095, "learning_rate": 4.674603174603175e-05, - "loss": 0.4756, + "loss": 0.4747, "step": 223 }, { "epoch": 0.4796573875802998, - "grad_norm": 0.3961977468030219, + "grad_norm": 0.3806097625906433, "learning_rate": 4.670634920634921e-05, - "loss": 0.4988, + "loss": 0.4979, "step": 224 }, { "epoch": 0.4817987152034261, - "grad_norm": 0.2939889127181417, + "grad_norm": 0.3317321869836366, "learning_rate": 4.666666666666667e-05, - "loss": 0.4782, + "loss": 0.4775, "step": 225 }, { "epoch": 0.48394004282655245, - "grad_norm": 0.38374224244801103, + "grad_norm": 0.35267177499930097, "learning_rate": 4.662698412698413e-05, - "loss": 0.4713, + "loss": 0.4707, "step": 226 }, { "epoch": 0.4860813704496788, - "grad_norm": 0.2783898692581709, + "grad_norm": 0.3134789773559103, "learning_rate": 4.658730158730159e-05, - "loss": 0.4683, + "loss": 0.4676, "step": 227 }, { "epoch": 0.48822269807280516, - "grad_norm": 0.3514854051266336, + "grad_norm": 0.31826459466880935, "learning_rate": 4.6547619047619054e-05, - "loss": 0.4745, + "loss": 0.4741, "step": 228 }, { "epoch": 0.49036402569593146, - "grad_norm": 0.2761349026606947, + "grad_norm": 0.3172595965266178, "learning_rate": 4.6507936507936515e-05, - "loss": 0.4777, + "loss": 0.4769, "step": 229 }, { "epoch": 0.4925053533190578, - "grad_norm": 0.2826076093838578, + "grad_norm": 0.281104120201888, "learning_rate": 4.646825396825397e-05, - "loss": 0.4491, + "loss": 0.4486, "step": 230 }, { "epoch": 0.49464668094218417, - "grad_norm": 0.2945939088071528, + "grad_norm": 0.3390660082734012, "learning_rate": 4.642857142857143e-05, - "loss": 0.4724, + "loss": 0.4721, "step": 231 }, { "epoch": 0.49678800856531047, - "grad_norm": 0.2750719469025186, + "grad_norm": 0.2624873773296077, "learning_rate": 4.638888888888889e-05, - "loss": 0.4592, + "loss": 0.4579, "step": 232 }, { "epoch": 0.4989293361884368, - "grad_norm": 0.2515310029544384, + "grad_norm": 0.30482318678415316, "learning_rate": 4.634920634920635e-05, - "loss": 0.4572, + "loss": 0.4575, "step": 233 }, { "epoch": 0.5010706638115632, - "grad_norm": 0.29715312851565084, + "grad_norm": 0.3163232948976339, "learning_rate": 4.630952380952381e-05, - "loss": 0.4769, + "loss": 0.4771, "step": 234 }, { "epoch": 0.5032119914346895, - "grad_norm": 0.28840327957350287, + "grad_norm": 0.28403210542670865, "learning_rate": 4.626984126984127e-05, - "loss": 0.459, + "loss": 0.4571, "step": 235 }, { "epoch": 0.5053533190578159, - "grad_norm": 0.38388536035323373, + "grad_norm": 0.3169956103989183, "learning_rate": 4.623015873015873e-05, - "loss": 0.4645, + "loss": 0.4642, "step": 236 }, { "epoch": 0.5074946466809421, - "grad_norm": 0.34005158978288785, + "grad_norm": 0.3124625210824864, "learning_rate": 4.6190476190476194e-05, - "loss": 0.4951, + "loss": 0.4934, "step": 237 }, { "epoch": 0.5096359743040685, - "grad_norm": 0.3135930281832256, + "grad_norm": 0.27914854479065176, "learning_rate": 4.6150793650793655e-05, - "loss": 0.4783, + "loss": 0.4777, "step": 238 }, { "epoch": 0.5117773019271948, - "grad_norm": 0.29153181091541064, + "grad_norm": 0.30405223359375916, "learning_rate": 4.6111111111111115e-05, - "loss": 0.4783, + "loss": 0.4774, "step": 239 }, { "epoch": 0.5139186295503212, - "grad_norm": 0.35211081607111105, + "grad_norm": 0.27824494271256855, "learning_rate": 4.607142857142857e-05, - "loss": 0.4784, + "loss": 0.4778, "step": 240 }, { "epoch": 0.5160599571734475, - "grad_norm": 0.2534327826926188, + "grad_norm": 0.28631142513329905, "learning_rate": 4.603174603174603e-05, - "loss": 0.4758, + "loss": 0.4742, "step": 241 }, { "epoch": 0.5182012847965739, - "grad_norm": 0.291715580605378, + "grad_norm": 0.2734339260745776, "learning_rate": 4.599206349206349e-05, - "loss": 0.4613, + "loss": 0.4598, "step": 242 }, { "epoch": 0.5203426124197003, - "grad_norm": 0.49953531179115856, + "grad_norm": 0.4662830157775148, "learning_rate": 4.595238095238095e-05, - "loss": 0.467, + "loss": 0.4622, "step": 243 }, { "epoch": 0.5224839400428265, - "grad_norm": 0.37988476101063506, + "grad_norm": 0.27437983872972277, "learning_rate": 4.591269841269841e-05, - "loss": 0.4668, + "loss": 0.4661, "step": 244 }, { "epoch": 0.5246252676659529, - "grad_norm": 0.3306624298526701, + "grad_norm": 0.3569225244178123, "learning_rate": 4.587301587301587e-05, - "loss": 0.4571, + "loss": 0.4558, "step": 245 }, { "epoch": 0.5267665952890792, - "grad_norm": 0.292259379733011, + "grad_norm": 0.2921312413477028, "learning_rate": 4.5833333333333334e-05, - "loss": 0.4747, + "loss": 0.473, "step": 246 }, { "epoch": 0.5289079229122056, - "grad_norm": 0.2817284251841534, + "grad_norm": 0.3105118836185397, "learning_rate": 4.5793650793650795e-05, - "loss": 0.4601, + "loss": 0.4592, "step": 247 }, { "epoch": 0.5310492505353319, - "grad_norm": 0.28259998989853713, + "grad_norm": 0.2602252376592866, "learning_rate": 4.5753968253968255e-05, - "loss": 0.4489, + "loss": 0.4475, "step": 248 }, { "epoch": 0.5331905781584583, - "grad_norm": 0.3077230211086655, + "grad_norm": 0.31546458571249575, "learning_rate": 4.5714285714285716e-05, - "loss": 0.4678, + "loss": 0.4655, "step": 249 }, { "epoch": 0.5353319057815846, - "grad_norm": 0.2783904187217292, + "grad_norm": 0.29234521833457866, "learning_rate": 4.567460317460318e-05, - "loss": 0.4401, + "loss": 0.4386, "step": 250 }, { "epoch": 0.5374732334047109, - "grad_norm": 0.31896115951211806, + "grad_norm": 0.27089799714474905, "learning_rate": 4.563492063492064e-05, - "loss": 0.4572, + "loss": 0.4552, "step": 251 }, { "epoch": 0.5396145610278372, - "grad_norm": 0.30499818403576123, + "grad_norm": 0.2876790516942422, "learning_rate": 4.55952380952381e-05, - "loss": 0.465, + "loss": 0.4643, "step": 252 }, { "epoch": 0.5417558886509636, - "grad_norm": 0.35575649581615343, + "grad_norm": 0.2796690158138871, "learning_rate": 4.555555555555556e-05, - "loss": 0.47, + "loss": 0.4685, "step": 253 }, { "epoch": 0.5438972162740899, - "grad_norm": 0.2684090185728518, + "grad_norm": 0.2620205830889585, "learning_rate": 4.551587301587302e-05, - "loss": 0.4645, + "loss": 0.4628, "step": 254 }, { "epoch": 0.5460385438972163, - "grad_norm": 0.37921813849517194, + "grad_norm": 0.27675126452812215, "learning_rate": 4.547619047619048e-05, - "loss": 0.4678, + "loss": 0.4659, "step": 255 }, { "epoch": 0.5481798715203426, - "grad_norm": 0.26743329008720806, + "grad_norm": 0.2529727200895915, "learning_rate": 4.543650793650794e-05, - "loss": 0.4732, + "loss": 0.4719, "step": 256 }, { "epoch": 0.550321199143469, - "grad_norm": 0.31026028532831346, + "grad_norm": 0.28938734392771215, "learning_rate": 4.5396825396825395e-05, - "loss": 0.4508, + "loss": 0.4499, "step": 257 }, { "epoch": 0.5524625267665952, - "grad_norm": 0.2581029656532225, + "grad_norm": 0.2732820003277474, "learning_rate": 4.5357142857142856e-05, - "loss": 0.471, + "loss": 0.4677, "step": 258 }, { "epoch": 0.5546038543897216, - "grad_norm": 0.26803290950789216, + "grad_norm": 0.24867294850473456, "learning_rate": 4.531746031746032e-05, - "loss": 0.4743, + "loss": 0.4714, "step": 259 }, { "epoch": 0.556745182012848, - "grad_norm": 0.38556934693664496, + "grad_norm": 0.3040098909557791, "learning_rate": 4.527777777777778e-05, - "loss": 0.4768, + "loss": 0.4757, "step": 260 }, { "epoch": 0.5588865096359743, - "grad_norm": 0.2661542720290909, + "grad_norm": 0.2705701549546398, "learning_rate": 4.523809523809524e-05, - "loss": 0.466, + "loss": 0.4642, "step": 261 }, { "epoch": 0.5610278372591007, - "grad_norm": 0.29067138820514277, + "grad_norm": 0.2949114446979747, "learning_rate": 4.51984126984127e-05, - "loss": 0.4702, + "loss": 0.4671, "step": 262 }, { "epoch": 0.563169164882227, - "grad_norm": 0.2872183782023422, + "grad_norm": 0.28678659699639003, "learning_rate": 4.515873015873016e-05, - "loss": 0.4534, + "loss": 0.4519, "step": 263 }, { "epoch": 0.5653104925053534, - "grad_norm": 0.25367532092796774, + "grad_norm": 0.2582358880028455, "learning_rate": 4.511904761904762e-05, - "loss": 0.4551, + "loss": 0.4545, "step": 264 }, { "epoch": 0.5674518201284796, - "grad_norm": 0.3039430947031411, + "grad_norm": 0.304233138313808, "learning_rate": 4.507936507936508e-05, - "loss": 0.4627, + "loss": 0.4616, "step": 265 }, { "epoch": 0.569593147751606, - "grad_norm": 0.28351706499792234, + "grad_norm": 0.2721420459823916, "learning_rate": 4.503968253968254e-05, - "loss": 0.4574, + "loss": 0.4547, "step": 266 }, { "epoch": 0.5717344753747323, - "grad_norm": 0.27425219768007014, + "grad_norm": 0.2687992677978425, "learning_rate": 4.5e-05, - "loss": 0.4589, + "loss": 0.4576, "step": 267 }, { "epoch": 0.5738758029978587, - "grad_norm": 0.2869408267882252, + "grad_norm": 0.2742900950884715, "learning_rate": 4.4960317460317464e-05, - "loss": 0.4475, + "loss": 0.4459, "step": 268 }, { "epoch": 0.576017130620985, - "grad_norm": 0.2762345424358576, + "grad_norm": 0.28795217687814245, "learning_rate": 4.4920634920634924e-05, - "loss": 0.4708, + "loss": 0.4698, "step": 269 }, { "epoch": 0.5781584582441114, - "grad_norm": 0.3049597843838145, + "grad_norm": 0.25641275034601446, "learning_rate": 4.4880952380952385e-05, - "loss": 0.4689, + "loss": 0.4679, "step": 270 }, { "epoch": 0.5802997858672377, - "grad_norm": 0.2592725090051658, + "grad_norm": 0.26668890447386795, "learning_rate": 4.4841269841269846e-05, - "loss": 0.4656, + "loss": 0.463, "step": 271 }, { "epoch": 0.582441113490364, - "grad_norm": 0.26044781623078483, + "grad_norm": 0.24030260367205755, "learning_rate": 4.4801587301587307e-05, - "loss": 0.4638, + "loss": 0.4628, "step": 272 }, { "epoch": 0.5845824411134903, - "grad_norm": 0.2804692407312598, + "grad_norm": 0.2790517674146759, "learning_rate": 4.476190476190477e-05, - "loss": 0.4675, + "loss": 0.4671, "step": 273 }, { "epoch": 0.5867237687366167, - "grad_norm": 0.26684962117113786, + "grad_norm": 0.22820456884707455, "learning_rate": 4.472222222222223e-05, - "loss": 0.4467, + "loss": 0.4457, "step": 274 }, { "epoch": 0.588865096359743, - "grad_norm": 0.31968480635740154, + "grad_norm": 0.3271552248804396, "learning_rate": 4.468253968253969e-05, - "loss": 0.474, + "loss": 0.473, "step": 275 }, { "epoch": 0.5910064239828694, - "grad_norm": 0.2644188695273227, + "grad_norm": 0.2862040648114735, "learning_rate": 4.464285714285715e-05, - "loss": 0.4686, + "loss": 0.4675, "step": 276 }, { "epoch": 0.5931477516059958, - "grad_norm": 0.2930643108233625, + "grad_norm": 0.3114706333448285, "learning_rate": 4.460317460317461e-05, - "loss": 0.4669, + "loss": 0.4666, "step": 277 }, { "epoch": 0.5952890792291221, - "grad_norm": 0.3015159130498082, + "grad_norm": 0.32309306607336397, "learning_rate": 4.456349206349207e-05, - "loss": 0.4648, + "loss": 0.463, "step": 278 }, { "epoch": 0.5974304068522484, - "grad_norm": 0.2727279142726974, + "grad_norm": 0.24906075204446051, "learning_rate": 4.4523809523809525e-05, - "loss": 0.4612, + "loss": 0.4588, "step": 279 }, { "epoch": 0.5995717344753747, - "grad_norm": 0.2975236904657764, + "grad_norm": 0.3269798068381555, "learning_rate": 4.4484126984126986e-05, - "loss": 0.444, + "loss": 0.4434, "step": 280 }, { "epoch": 0.6017130620985011, - "grad_norm": 0.2580099237283999, + "grad_norm": 0.24388996621753112, "learning_rate": 4.4444444444444447e-05, - "loss": 0.4792, + "loss": 0.4773, "step": 281 }, { "epoch": 0.6038543897216274, - "grad_norm": 0.2961073515555114, + "grad_norm": 0.3069850683178239, "learning_rate": 4.440476190476191e-05, - "loss": 0.4534, + "loss": 0.4517, "step": 282 }, { "epoch": 0.6059957173447538, - "grad_norm": 0.2764889525421342, + "grad_norm": 0.249901094708518, "learning_rate": 4.436507936507937e-05, - "loss": 0.4442, + "loss": 0.4426, "step": 283 }, { "epoch": 0.6081370449678801, - "grad_norm": 0.26435268929370137, + "grad_norm": 0.2678810481940219, "learning_rate": 4.432539682539683e-05, - "loss": 0.4452, + "loss": 0.4446, "step": 284 }, { "epoch": 0.6102783725910065, - "grad_norm": 0.2882202813256758, + "grad_norm": 0.26626008865151485, "learning_rate": 4.428571428571428e-05, - "loss": 0.4448, + "loss": 0.444, "step": 285 }, { "epoch": 0.6124197002141327, - "grad_norm": 0.2561848522615123, + "grad_norm": 0.25802913659501, "learning_rate": 4.4246031746031744e-05, - "loss": 0.453, + "loss": 0.4514, "step": 286 }, { "epoch": 0.6145610278372591, - "grad_norm": 0.32128316367276627, + "grad_norm": 0.3058519427291137, "learning_rate": 4.4206349206349204e-05, - "loss": 0.4676, + "loss": 0.4657, "step": 287 }, { "epoch": 0.6167023554603854, - "grad_norm": 0.305653864447794, + "grad_norm": 0.25739722317495545, "learning_rate": 4.4166666666666665e-05, - "loss": 0.457, + "loss": 0.4565, "step": 288 }, { "epoch": 0.6188436830835118, - "grad_norm": 0.27429918226444405, + "grad_norm": 0.2846833831079424, "learning_rate": 4.4126984126984126e-05, - "loss": 0.4592, + "loss": 0.4584, "step": 289 }, { "epoch": 0.6209850107066381, - "grad_norm": 0.26436848550152375, + "grad_norm": 0.258608736893351, "learning_rate": 4.4087301587301587e-05, - "loss": 0.4343, + "loss": 0.4335, "step": 290 }, { "epoch": 0.6231263383297645, - "grad_norm": 0.2892075831032154, + "grad_norm": 0.2911602326047305, "learning_rate": 4.404761904761905e-05, - "loss": 0.4405, + "loss": 0.4394, "step": 291 }, { "epoch": 0.6252676659528907, - "grad_norm": 0.27340488703312127, + "grad_norm": 0.26424730063896607, "learning_rate": 4.400793650793651e-05, - "loss": 0.4754, + "loss": 0.4739, "step": 292 }, { "epoch": 0.6274089935760171, - "grad_norm": 0.2748574269063886, + "grad_norm": 0.27052936749041473, "learning_rate": 4.396825396825397e-05, - "loss": 0.474, + "loss": 0.4737, "step": 293 }, { "epoch": 0.6295503211991434, - "grad_norm": 0.2941612988883471, + "grad_norm": 0.2826287053464395, "learning_rate": 4.392857142857143e-05, - "loss": 0.4564, + "loss": 0.4557, "step": 294 }, { "epoch": 0.6316916488222698, - "grad_norm": 0.24636592894226964, + "grad_norm": 0.22012742826317977, "learning_rate": 4.388888888888889e-05, - "loss": 0.4422, + "loss": 0.4408, "step": 295 }, { "epoch": 0.6338329764453962, - "grad_norm": 0.28582887621940406, + "grad_norm": 0.28162001988825747, "learning_rate": 4.384920634920635e-05, - "loss": 0.4588, + "loss": 0.4574, "step": 296 }, { "epoch": 0.6359743040685225, - "grad_norm": 0.27743437358263945, + "grad_norm": 0.23283539296950778, "learning_rate": 4.380952380952381e-05, - "loss": 0.4469, + "loss": 0.4462, "step": 297 }, { "epoch": 0.6381156316916489, - "grad_norm": 0.26242382702944955, + "grad_norm": 0.2498862871583954, "learning_rate": 4.376984126984127e-05, - "loss": 0.4398, + "loss": 0.439, "step": 298 }, { "epoch": 0.6402569593147751, - "grad_norm": 0.28165809226942223, + "grad_norm": 0.30936218743365784, "learning_rate": 4.373015873015873e-05, - "loss": 0.4413, + "loss": 0.4405, "step": 299 }, { "epoch": 0.6423982869379015, - "grad_norm": 0.25284838465648923, + "grad_norm": 0.22890543657069876, "learning_rate": 4.3690476190476194e-05, - "loss": 0.4342, + "loss": 0.4337, "step": 300 }, { "epoch": 0.6445396145610278, - "grad_norm": 0.31339700087144234, + "grad_norm": 0.3478614587683788, "learning_rate": 4.3650793650793655e-05, - "loss": 0.4612, + "loss": 0.4596, "step": 301 }, { "epoch": 0.6466809421841542, - "grad_norm": 0.3104952345189579, + "grad_norm": 0.2696278385429742, "learning_rate": 4.3611111111111116e-05, - "loss": 0.4731, + "loss": 0.4713, "step": 302 }, { "epoch": 0.6488222698072805, - "grad_norm": 0.2947572345084962, + "grad_norm": 0.3139962716399602, "learning_rate": 4.3571428571428576e-05, - "loss": 0.453, + "loss": 0.4527, "step": 303 }, { "epoch": 0.6509635974304069, - "grad_norm": 0.3455960603661197, + "grad_norm": 0.2670285875512765, "learning_rate": 4.353174603174604e-05, - "loss": 0.4644, + "loss": 0.4627, "step": 304 }, { "epoch": 0.6531049250535332, - "grad_norm": 0.25711343366321854, + "grad_norm": 0.3044721685500663, "learning_rate": 4.34920634920635e-05, - "loss": 0.4802, + "loss": 0.4785, "step": 305 }, { "epoch": 0.6552462526766595, - "grad_norm": 0.31160993789120733, + "grad_norm": 0.27396062258454984, "learning_rate": 4.345238095238096e-05, - "loss": 0.4534, + "loss": 0.4523, "step": 306 }, { "epoch": 0.6573875802997858, - "grad_norm": 0.27484057830146985, + "grad_norm": 0.2886960728161468, "learning_rate": 4.341269841269842e-05, - "loss": 0.4529, + "loss": 0.4517, "step": 307 }, { "epoch": 0.6595289079229122, - "grad_norm": 0.26337065068689747, + "grad_norm": 0.2533722996661738, "learning_rate": 4.337301587301587e-05, - "loss": 0.4647, + "loss": 0.4646, "step": 308 }, { "epoch": 0.6616702355460385, - "grad_norm": 0.271610173490994, + "grad_norm": 0.2699468559182033, "learning_rate": 4.3333333333333334e-05, - "loss": 0.4445, + "loss": 0.4421, "step": 309 }, { "epoch": 0.6638115631691649, - "grad_norm": 0.2528923842806531, + "grad_norm": 0.283041429141369, "learning_rate": 4.3293650793650795e-05, - "loss": 0.4571, + "loss": 0.4552, "step": 310 }, { "epoch": 0.6659528907922913, - "grad_norm": 0.28885045896535533, + "grad_norm": 0.2591706001178728, "learning_rate": 4.3253968253968256e-05, - "loss": 0.4554, + "loss": 0.4527, "step": 311 }, { "epoch": 0.6680942184154176, - "grad_norm": 0.25814187425851787, + "grad_norm": 0.2864467963151813, "learning_rate": 4.3214285714285716e-05, - "loss": 0.468, + "loss": 0.4666, "step": 312 }, { "epoch": 0.6702355460385439, - "grad_norm": 0.26764371303035933, + "grad_norm": 0.281657408994935, "learning_rate": 4.317460317460318e-05, - "loss": 0.4594, + "loss": 0.4582, "step": 313 }, { "epoch": 0.6723768736616702, - "grad_norm": 0.24837539558393776, + "grad_norm": 0.2474591816739494, "learning_rate": 4.313492063492064e-05, - "loss": 0.4371, + "loss": 0.4361, "step": 314 }, { "epoch": 0.6745182012847966, - "grad_norm": 0.25412398449219464, + "grad_norm": 0.2973486888734723, "learning_rate": 4.30952380952381e-05, - "loss": 0.4594, + "loss": 0.4583, "step": 315 }, { "epoch": 0.6766595289079229, - "grad_norm": 0.23755251323497587, + "grad_norm": 0.2444964189560587, "learning_rate": 4.305555555555556e-05, - "loss": 0.4674, + "loss": 0.4664, "step": 316 }, { "epoch": 0.6788008565310493, - "grad_norm": 0.2924363858039394, + "grad_norm": 0.2480078169039145, "learning_rate": 4.301587301587302e-05, - "loss": 0.4688, + "loss": 0.4675, "step": 317 }, { "epoch": 0.6809421841541756, - "grad_norm": 0.2570229512356918, + "grad_norm": 0.26312534234905105, "learning_rate": 4.297619047619048e-05, - "loss": 0.4494, + "loss": 0.4489, "step": 318 }, { "epoch": 0.683083511777302, - "grad_norm": 0.2764941603084112, + "grad_norm": 0.2462656315297276, "learning_rate": 4.2936507936507935e-05, - "loss": 0.4605, + "loss": 0.4591, "step": 319 }, { "epoch": 0.6852248394004282, - "grad_norm": 0.3153973496882214, + "grad_norm": 0.29816624349859283, "learning_rate": 4.2896825396825396e-05, - "loss": 0.4488, + "loss": 0.4477, "step": 320 }, { "epoch": 0.6873661670235546, - "grad_norm": 0.28153011145522383, + "grad_norm": 0.24698029570776403, "learning_rate": 4.2857142857142856e-05, - "loss": 0.4705, + "loss": 0.4695, "step": 321 }, { "epoch": 0.6895074946466809, - "grad_norm": 0.3506781644039644, + "grad_norm": 0.34703431482300096, "learning_rate": 4.281746031746032e-05, - "loss": 0.4576, + "loss": 0.4566, "step": 322 }, { "epoch": 0.6916488222698073, - "grad_norm": 0.3053110048188775, + "grad_norm": 0.2437786811285962, "learning_rate": 4.277777777777778e-05, - "loss": 0.4732, + "loss": 0.4723, "step": 323 }, { "epoch": 0.6937901498929336, - "grad_norm": 0.3240151118248191, + "grad_norm": 0.3412185273042048, "learning_rate": 4.273809523809524e-05, - "loss": 0.4554, + "loss": 0.4532, "step": 324 }, { "epoch": 0.69593147751606, - "grad_norm": 0.29062216265241864, + "grad_norm": 0.24904956038384604, "learning_rate": 4.26984126984127e-05, - "loss": 0.4342, + "loss": 0.4334, "step": 325 }, { "epoch": 0.6980728051391863, - "grad_norm": 0.25283396063398134, + "grad_norm": 0.2782237575432937, "learning_rate": 4.265873015873016e-05, - "loss": 0.4464, + "loss": 0.4463, "step": 326 }, { "epoch": 0.7002141327623126, - "grad_norm": 0.2680651552624257, + "grad_norm": 0.2657363496164857, "learning_rate": 4.261904761904762e-05, - "loss": 0.4601, + "loss": 0.4591, "step": 327 }, { "epoch": 0.702355460385439, - "grad_norm": 0.33752170462860476, + "grad_norm": 0.2555708376181943, "learning_rate": 4.257936507936508e-05, - "loss": 0.4478, + "loss": 0.4464, "step": 328 }, { "epoch": 0.7044967880085653, - "grad_norm": 0.2190411233461398, + "grad_norm": 0.23571687740169944, "learning_rate": 4.253968253968254e-05, - "loss": 0.4465, + "loss": 0.4455, "step": 329 }, { "epoch": 0.7066381156316917, - "grad_norm": 0.2508058536828542, + "grad_norm": 0.2687735724291379, "learning_rate": 4.25e-05, - "loss": 0.4452, + "loss": 0.4442, "step": 330 }, { "epoch": 0.708779443254818, - "grad_norm": 0.2377335448608951, + "grad_norm": 0.21677485542153913, "learning_rate": 4.2460317460317464e-05, - "loss": 0.4401, + "loss": 0.4389, "step": 331 }, { "epoch": 0.7109207708779444, - "grad_norm": 0.2529235872462942, + "grad_norm": 0.30463445202519995, "learning_rate": 4.2420634920634925e-05, - "loss": 0.4651, + "loss": 0.4641, "step": 332 }, { "epoch": 0.7130620985010707, - "grad_norm": 0.2516915529502037, + "grad_norm": 0.25037659567634474, "learning_rate": 4.2380952380952385e-05, - "loss": 0.4617, + "loss": 0.4601, "step": 333 }, { "epoch": 0.715203426124197, - "grad_norm": 0.2364540674065309, + "grad_norm": 0.2597863908544412, "learning_rate": 4.2341269841269846e-05, - "loss": 0.4711, + "loss": 0.4692, "step": 334 }, { "epoch": 0.7173447537473233, - "grad_norm": 0.25560457992791263, + "grad_norm": 0.26953376114396266, "learning_rate": 4.23015873015873e-05, - "loss": 0.4558, + "loss": 0.454, "step": 335 }, { "epoch": 0.7194860813704497, - "grad_norm": 0.23835034145803924, + "grad_norm": 0.246208461301376, "learning_rate": 4.226190476190476e-05, - "loss": 0.4643, + "loss": 0.463, "step": 336 }, { "epoch": 0.721627408993576, - "grad_norm": 0.22675744611839385, + "grad_norm": 0.2370326879028165, "learning_rate": 4.222222222222222e-05, - "loss": 0.4477, + "loss": 0.4467, "step": 337 }, { "epoch": 0.7237687366167024, - "grad_norm": 0.24050077892906585, + "grad_norm": 0.25036860829476676, "learning_rate": 4.218253968253968e-05, - "loss": 0.4417, + "loss": 0.4411, "step": 338 }, { "epoch": 0.7259100642398287, - "grad_norm": 0.25640906605974817, + "grad_norm": 0.28329537291272733, "learning_rate": 4.214285714285714e-05, - "loss": 0.4549, + "loss": 0.454, "step": 339 }, { "epoch": 0.728051391862955, - "grad_norm": 0.228989440935261, + "grad_norm": 0.24559865290348679, "learning_rate": 4.2103174603174604e-05, - "loss": 0.4504, + "loss": 0.4498, "step": 340 }, { "epoch": 0.7301927194860813, - "grad_norm": 0.2308736253356299, + "grad_norm": 0.23500115967335444, "learning_rate": 4.2063492063492065e-05, - "loss": 0.4577, + "loss": 0.4552, "step": 341 }, { "epoch": 0.7323340471092077, - "grad_norm": 0.25703393548284337, + "grad_norm": 0.2919643243289569, "learning_rate": 4.2023809523809525e-05, - "loss": 0.4586, + "loss": 0.4567, "step": 342 }, { "epoch": 0.734475374732334, - "grad_norm": 0.24906898794284202, + "grad_norm": 0.26791366452779686, "learning_rate": 4.1984126984126986e-05, - "loss": 0.4457, + "loss": 0.4442, "step": 343 }, { "epoch": 0.7366167023554604, - "grad_norm": 0.2574124725732392, + "grad_norm": 0.28291332973939526, "learning_rate": 4.194444444444445e-05, - "loss": 0.4529, + "loss": 0.451, "step": 344 }, { "epoch": 0.7387580299785867, - "grad_norm": 0.243507960931399, + "grad_norm": 0.25359000867556963, "learning_rate": 4.190476190476191e-05, - "loss": 0.4438, + "loss": 0.4425, "step": 345 }, { "epoch": 0.7408993576017131, - "grad_norm": 0.2445092413905703, + "grad_norm": 0.2408660681176407, "learning_rate": 4.186507936507937e-05, - "loss": 0.4498, + "loss": 0.4493, "step": 346 }, { "epoch": 0.7430406852248393, - "grad_norm": 0.30753988028224943, + "grad_norm": 0.3349449875093207, "learning_rate": 4.182539682539683e-05, - "loss": 0.4598, + "loss": 0.4592, "step": 347 }, { "epoch": 0.7451820128479657, - "grad_norm": 0.2505049178661786, + "grad_norm": 0.2559707605515902, "learning_rate": 4.178571428571429e-05, - "loss": 0.4553, + "loss": 0.4538, "step": 348 }, { "epoch": 0.7473233404710921, - "grad_norm": 0.27044349556425973, + "grad_norm": 0.2737394625735486, "learning_rate": 4.174603174603175e-05, - "loss": 0.4573, + "loss": 0.4563, "step": 349 }, { "epoch": 0.7494646680942184, - "grad_norm": 0.25124615427090896, + "grad_norm": 0.28364900087424905, "learning_rate": 4.170634920634921e-05, - "loss": 0.476, + "loss": 0.4741, "step": 350 }, { "epoch": 0.7516059957173448, - "grad_norm": 0.26703556388241734, + "grad_norm": 0.2741462100795298, "learning_rate": 4.166666666666667e-05, - "loss": 0.4366, + "loss": 0.4363, "step": 351 }, { "epoch": 0.7537473233404711, - "grad_norm": 0.24998740358371754, + "grad_norm": 0.25635492049633196, "learning_rate": 4.162698412698413e-05, - "loss": 0.4467, + "loss": 0.4445, "step": 352 }, { "epoch": 0.7558886509635975, - "grad_norm": 0.2554947787191526, + "grad_norm": 0.273452663060448, "learning_rate": 4.1587301587301594e-05, - "loss": 0.4414, + "loss": 0.4408, "step": 353 }, { "epoch": 0.7580299785867237, - "grad_norm": 0.29499953780878213, + "grad_norm": 0.32668360058869866, "learning_rate": 4.1547619047619054e-05, - "loss": 0.4668, + "loss": 0.4653, "step": 354 }, { "epoch": 0.7601713062098501, - "grad_norm": 0.2634686274433604, + "grad_norm": 0.28322113864222886, "learning_rate": 4.1507936507936515e-05, - "loss": 0.4501, + "loss": 0.449, "step": 355 }, { "epoch": 0.7623126338329764, - "grad_norm": 0.27587805015536104, + "grad_norm": 0.3314451874615497, "learning_rate": 4.1468253968253976e-05, "loss": 0.4345, "step": 356 }, { "epoch": 0.7644539614561028, - "grad_norm": 0.27969963429910366, + "grad_norm": 0.2909325990893949, "learning_rate": 4.1428571428571437e-05, - "loss": 0.4517, + "loss": 0.4501, "step": 357 }, { "epoch": 0.7665952890792291, - "grad_norm": 0.3006002807511561, + "grad_norm": 0.3155371913936611, "learning_rate": 4.138888888888889e-05, - "loss": 0.4363, + "loss": 0.4362, "step": 358 }, { "epoch": 0.7687366167023555, - "grad_norm": 0.24094041832952617, + "grad_norm": 0.24516857979495404, "learning_rate": 4.134920634920635e-05, - "loss": 0.4463, + "loss": 0.4457, "step": 359 }, { "epoch": 0.7708779443254818, - "grad_norm": 0.30068476671680583, + "grad_norm": 0.30672444383523295, "learning_rate": 4.130952380952381e-05, - "loss": 0.4394, + "loss": 0.4382, "step": 360 }, { "epoch": 0.7730192719486081, - "grad_norm": 0.24223112221889828, + "grad_norm": 0.2622046826324259, "learning_rate": 4.126984126984127e-05, - "loss": 0.4437, + "loss": 0.4427, "step": 361 }, { "epoch": 0.7751605995717344, - "grad_norm": 0.3306808578708232, + "grad_norm": 0.28744995963189524, "learning_rate": 4.123015873015873e-05, - "loss": 0.463, + "loss": 0.4608, "step": 362 }, { "epoch": 0.7773019271948608, - "grad_norm": 0.22496155789537978, + "grad_norm": 0.2566986215900858, "learning_rate": 4.119047619047619e-05, - "loss": 0.4396, + "loss": 0.4395, "step": 363 }, { "epoch": 0.7794432548179872, - "grad_norm": 0.26980303715734333, + "grad_norm": 0.2754243217704112, "learning_rate": 4.115079365079365e-05, - "loss": 0.4502, + "loss": 0.4472, "step": 364 }, { "epoch": 0.7815845824411135, - "grad_norm": 0.24531612262771849, + "grad_norm": 0.2679510555818145, "learning_rate": 4.111111111111111e-05, - "loss": 0.4371, + "loss": 0.4364, "step": 365 }, { "epoch": 0.7837259100642399, - "grad_norm": 0.23508992950961904, + "grad_norm": 0.22802893505394484, "learning_rate": 4.107142857142857e-05, - "loss": 0.4351, + "loss": 0.4341, "step": 366 }, { "epoch": 0.7858672376873662, - "grad_norm": 0.2343557766020594, + "grad_norm": 0.25400510440151647, "learning_rate": 4.103174603174603e-05, - "loss": 0.4474, + "loss": 0.446, "step": 367 }, { "epoch": 0.7880085653104925, - "grad_norm": 0.24946904187399346, + "grad_norm": 0.24898639805458245, "learning_rate": 4.099206349206349e-05, - "loss": 0.4432, + "loss": 0.4417, "step": 368 }, { "epoch": 0.7901498929336188, - "grad_norm": 0.253014359764136, + "grad_norm": 0.2820704635610971, "learning_rate": 4.095238095238095e-05, - "loss": 0.4528, + "loss": 0.4518, "step": 369 }, { "epoch": 0.7922912205567452, - "grad_norm": 0.22772669050696337, + "grad_norm": 0.23351424195098205, "learning_rate": 4.091269841269841e-05, - "loss": 0.4543, + "loss": 0.4515, "step": 370 }, { "epoch": 0.7944325481798715, - "grad_norm": 0.24881653297292614, + "grad_norm": 0.2815505680256724, "learning_rate": 4.0873015873015874e-05, - "loss": 0.4312, + "loss": 0.4296, "step": 371 }, { "epoch": 0.7965738758029979, - "grad_norm": 0.25995771340020857, + "grad_norm": 0.23872824455145594, "learning_rate": 4.0833333333333334e-05, - "loss": 0.4265, + "loss": 0.4253, "step": 372 }, { "epoch": 0.7987152034261242, - "grad_norm": 0.21693583355693366, + "grad_norm": 0.2509283478607524, "learning_rate": 4.0793650793650795e-05, - "loss": 0.4278, + "loss": 0.4272, "step": 373 }, { "epoch": 0.8008565310492506, - "grad_norm": 0.24159689301797868, + "grad_norm": 0.23260890926948258, "learning_rate": 4.0753968253968256e-05, - "loss": 0.4437, + "loss": 0.4422, "step": 374 }, { "epoch": 0.8029978586723768, - "grad_norm": 0.25009229119018167, + "grad_norm": 0.24640220531762067, "learning_rate": 4.0714285714285717e-05, - "loss": 0.4338, + "loss": 0.4325, "step": 375 }, { "epoch": 0.8051391862955032, - "grad_norm": 0.2486299019264688, + "grad_norm": 0.2557935473418916, "learning_rate": 4.067460317460318e-05, - "loss": 0.4506, + "loss": 0.4495, "step": 376 }, { "epoch": 0.8072805139186295, - "grad_norm": 0.247308996752361, + "grad_norm": 0.24864370999195623, "learning_rate": 4.063492063492064e-05, - "loss": 0.4414, + "loss": 0.44, "step": 377 }, { "epoch": 0.8094218415417559, - "grad_norm": 0.2590933139992137, + "grad_norm": 0.2494337325169666, "learning_rate": 4.05952380952381e-05, - "loss": 0.4297, + "loss": 0.4292, "step": 378 }, { "epoch": 0.8115631691648822, - "grad_norm": 0.2508458429815614, + "grad_norm": 0.26643265488032447, "learning_rate": 4.055555555555556e-05, - "loss": 0.4269, + "loss": 0.4258, "step": 379 }, { "epoch": 0.8137044967880086, - "grad_norm": 0.2879218570314043, + "grad_norm": 0.29981525233093126, "learning_rate": 4.051587301587302e-05, - "loss": 0.4466, + "loss": 0.4462, "step": 380 }, { "epoch": 0.815845824411135, - "grad_norm": 0.23441911617011937, + "grad_norm": 0.23483623618223876, "learning_rate": 4.047619047619048e-05, - "loss": 0.4412, + "loss": 0.4403, "step": 381 }, { "epoch": 0.8179871520342612, - "grad_norm": 0.32198396205698543, + "grad_norm": 0.31621068938716507, "learning_rate": 4.043650793650794e-05, - "loss": 0.4547, + "loss": 0.4542, "step": 382 }, { "epoch": 0.8201284796573876, - "grad_norm": 0.30161712796978346, + "grad_norm": 0.27447908054014813, "learning_rate": 4.03968253968254e-05, - "loss": 0.4515, + "loss": 0.4498, "step": 383 }, { "epoch": 0.8222698072805139, - "grad_norm": 0.23761583289371385, + "grad_norm": 0.2979087905677751, "learning_rate": 4.035714285714286e-05, - "loss": 0.4343, + "loss": 0.4326, "step": 384 }, { "epoch": 0.8244111349036403, - "grad_norm": 0.27338237171088253, + "grad_norm": 0.2738788951804694, "learning_rate": 4.031746031746032e-05, - "loss": 0.445, + "loss": 0.4433, "step": 385 }, { "epoch": 0.8265524625267666, - "grad_norm": 0.2856895105115863, + "grad_norm": 0.27184798198740845, "learning_rate": 4.027777777777778e-05, - "loss": 0.4401, + "loss": 0.4385, "step": 386 }, { "epoch": 0.828693790149893, - "grad_norm": 0.2379960246228477, + "grad_norm": 0.2596242318887042, "learning_rate": 4.023809523809524e-05, - "loss": 0.4411, + "loss": 0.4391, "step": 387 }, { "epoch": 0.8308351177730193, - "grad_norm": 0.3631643489141991, + "grad_norm": 0.3257005719389779, "learning_rate": 4.01984126984127e-05, - "loss": 0.4528, + "loss": 0.4515, "step": 388 }, { "epoch": 0.8329764453961456, - "grad_norm": 0.2664861994481591, + "grad_norm": 0.26020892798323664, "learning_rate": 4.015873015873016e-05, - "loss": 0.454, + "loss": 0.4525, "step": 389 }, { "epoch": 0.8351177730192719, - "grad_norm": 0.3054363176379015, + "grad_norm": 0.3282937862319744, "learning_rate": 4.011904761904762e-05, - "loss": 0.4535, + "loss": 0.4522, "step": 390 }, { "epoch": 0.8372591006423983, - "grad_norm": 0.31912720459183097, + "grad_norm": 0.3036258464302225, "learning_rate": 4.007936507936508e-05, - "loss": 0.4493, + "loss": 0.4477, "step": 391 }, { "epoch": 0.8394004282655246, - "grad_norm": 0.23251206637574764, + "grad_norm": 0.26338851157250837, "learning_rate": 4.003968253968254e-05, - "loss": 0.4431, + "loss": 0.4422, "step": 392 }, { "epoch": 0.841541755888651, - "grad_norm": 0.28631581267375344, + "grad_norm": 0.3106182811071532, "learning_rate": 4e-05, - "loss": 0.4323, + "loss": 0.4309, "step": 393 }, { "epoch": 0.8436830835117773, - "grad_norm": 0.267723471910929, + "grad_norm": 0.24266766603453416, "learning_rate": 3.9960317460317464e-05, - "loss": 0.4473, + "loss": 0.4466, "step": 394 }, { "epoch": 0.8458244111349036, - "grad_norm": 0.27597440700713444, + "grad_norm": 0.34811764903452264, "learning_rate": 3.9920634920634925e-05, - "loss": 0.4542, + "loss": 0.452, "step": 395 }, { "epoch": 0.8479657387580299, - "grad_norm": 0.25433002014223405, + "grad_norm": 0.24788695830598698, "learning_rate": 3.9880952380952386e-05, - "loss": 0.4444, + "loss": 0.4433, "step": 396 }, { "epoch": 0.8501070663811563, - "grad_norm": 0.25437045991554746, + "grad_norm": 0.2783115131710977, "learning_rate": 3.984126984126984e-05, - "loss": 0.4579, + "loss": 0.4561, "step": 397 }, { "epoch": 0.8522483940042827, - "grad_norm": 0.24534548283140603, + "grad_norm": 0.2576730623053907, "learning_rate": 3.98015873015873e-05, - "loss": 0.4725, + "loss": 0.4708, "step": 398 }, { "epoch": 0.854389721627409, - "grad_norm": 0.25435785462287974, + "grad_norm": 0.24712208382727868, "learning_rate": 3.976190476190476e-05, - "loss": 0.4512, + "loss": 0.449, "step": 399 }, { "epoch": 0.8565310492505354, - "grad_norm": 0.24601810950369032, + "grad_norm": 0.2708918582747635, "learning_rate": 3.972222222222222e-05, - "loss": 0.448, + "loss": 0.446, "step": 400 }, { "epoch": 0.8586723768736617, - "grad_norm": 0.2653361455115733, + "grad_norm": 0.2713784850892593, "learning_rate": 3.968253968253968e-05, - "loss": 0.439, + "loss": 0.4386, "step": 401 }, { "epoch": 0.860813704496788, - "grad_norm": 0.23673358174194264, + "grad_norm": 0.25255493559196973, "learning_rate": 3.964285714285714e-05, - "loss": 0.4413, + "loss": 0.44, "step": 402 }, { "epoch": 0.8629550321199143, - "grad_norm": 0.24462225822851738, + "grad_norm": 0.27193870527806324, "learning_rate": 3.9603174603174604e-05, - "loss": 0.45, + "loss": 0.4485, "step": 403 }, { "epoch": 0.8650963597430407, - "grad_norm": 0.2788329723246181, + "grad_norm": 0.27126270170714106, "learning_rate": 3.9563492063492065e-05, - "loss": 0.454, + "loss": 0.4525, "step": 404 }, { "epoch": 0.867237687366167, - "grad_norm": 0.24432656732987582, + "grad_norm": 0.24844125447179155, "learning_rate": 3.9523809523809526e-05, - "loss": 0.4346, + "loss": 0.4336, "step": 405 }, { "epoch": 0.8693790149892934, - "grad_norm": 0.3020331491334916, + "grad_norm": 0.30939011775053954, "learning_rate": 3.9484126984126986e-05, - "loss": 0.4522, + "loss": 0.4494, "step": 406 }, { "epoch": 0.8715203426124197, - "grad_norm": 0.2856505508961332, + "grad_norm": 0.26378548519547557, "learning_rate": 3.944444444444445e-05, - "loss": 0.4485, + "loss": 0.447, "step": 407 }, { "epoch": 0.8736616702355461, - "grad_norm": 0.2656132022406484, + "grad_norm": 0.2849481666705005, "learning_rate": 3.940476190476191e-05, - "loss": 0.4601, + "loss": 0.4583, "step": 408 }, { "epoch": 0.8758029978586723, - "grad_norm": 0.29873704798610634, + "grad_norm": 0.2815143992545685, "learning_rate": 3.936507936507937e-05, - "loss": 0.4354, + "loss": 0.4348, "step": 409 }, { "epoch": 0.8779443254817987, - "grad_norm": 0.28600605020427156, + "grad_norm": 0.27153299884477455, "learning_rate": 3.932539682539683e-05, - "loss": 0.4452, + "loss": 0.4434, "step": 410 }, { "epoch": 0.880085653104925, - "grad_norm": 0.26767141652320475, + "grad_norm": 0.27535841889086504, "learning_rate": 3.928571428571429e-05, - "loss": 0.4414, + "loss": 0.4398, "step": 411 }, { "epoch": 0.8822269807280514, - "grad_norm": 0.35131045572847885, + "grad_norm": 0.30776377057560106, "learning_rate": 3.9246031746031744e-05, - "loss": 0.4475, + "loss": 0.4464, "step": 412 }, { "epoch": 0.8843683083511777, - "grad_norm": 0.2370839823487289, + "grad_norm": 0.24429372628536725, "learning_rate": 3.9206349206349205e-05, - "loss": 0.4358, + "loss": 0.4345, "step": 413 }, { "epoch": 0.8865096359743041, - "grad_norm": 0.3544368048991166, + "grad_norm": 0.34967840303013203, "learning_rate": 3.9166666666666665e-05, - "loss": 0.4494, + "loss": 0.447, "step": 414 }, { "epoch": 0.8886509635974305, - "grad_norm": 0.3172911304193655, + "grad_norm": 0.25224213794765044, "learning_rate": 3.9126984126984126e-05, - "loss": 0.4487, + "loss": 0.4477, "step": 415 }, { "epoch": 0.8907922912205567, - "grad_norm": 0.275970880794438, + "grad_norm": 0.2899965867095618, "learning_rate": 3.908730158730159e-05, - "loss": 0.4192, + "loss": 0.4183, "step": 416 }, { "epoch": 0.892933618843683, - "grad_norm": 0.3547661108556819, + "grad_norm": 0.3091076754528513, "learning_rate": 3.904761904761905e-05, - "loss": 0.4445, + "loss": 0.4438, "step": 417 }, { "epoch": 0.8950749464668094, - "grad_norm": 0.24666251953784982, + "grad_norm": 0.25144689022877925, "learning_rate": 3.900793650793651e-05, - "loss": 0.4486, + "loss": 0.4467, "step": 418 }, { "epoch": 0.8972162740899358, - "grad_norm": 0.33556985351699165, + "grad_norm": 0.3164686564857124, "learning_rate": 3.896825396825397e-05, - "loss": 0.4357, + "loss": 0.4348, "step": 419 }, { "epoch": 0.8993576017130621, - "grad_norm": 0.29373216787613254, + "grad_norm": 0.2879131478620461, "learning_rate": 3.892857142857143e-05, - "loss": 0.4559, + "loss": 0.4546, "step": 420 }, { "epoch": 0.9014989293361885, - "grad_norm": 0.27236847144104587, + "grad_norm": 0.2718535367036997, "learning_rate": 3.888888888888889e-05, - "loss": 0.4509, + "loss": 0.4493, "step": 421 }, { "epoch": 0.9036402569593148, - "grad_norm": 0.3363146705059116, + "grad_norm": 0.3215237762816025, "learning_rate": 3.884920634920635e-05, - "loss": 0.4338, + "loss": 0.4324, "step": 422 }, { "epoch": 0.9057815845824411, - "grad_norm": 0.23799725549650105, + "grad_norm": 0.23703610924543467, "learning_rate": 3.880952380952381e-05, - "loss": 0.4422, + "loss": 0.4423, "step": 423 }, { "epoch": 0.9079229122055674, - "grad_norm": 0.2828958789244041, + "grad_norm": 0.2790375906958242, "learning_rate": 3.876984126984127e-05, - "loss": 0.4597, + "loss": 0.459, "step": 424 }, { "epoch": 0.9100642398286938, - "grad_norm": 0.3136305684919964, + "grad_norm": 0.31216707197799737, "learning_rate": 3.8730158730158734e-05, - "loss": 0.4361, + "loss": 0.4358, "step": 425 }, { "epoch": 0.9122055674518201, - "grad_norm": 0.23243444701464022, + "grad_norm": 0.2238572953572114, "learning_rate": 3.8690476190476195e-05, - "loss": 0.4337, + "loss": 0.4329, "step": 426 }, { "epoch": 0.9143468950749465, - "grad_norm": 0.31121643328315307, + "grad_norm": 0.3300050366021809, "learning_rate": 3.8650793650793655e-05, - "loss": 0.4429, + "loss": 0.4411, "step": 427 }, { "epoch": 0.9164882226980728, - "grad_norm": 0.24007066200020147, + "grad_norm": 0.24904767848985657, "learning_rate": 3.8611111111111116e-05, - "loss": 0.4412, + "loss": 0.4395, "step": 428 }, { "epoch": 0.9186295503211992, - "grad_norm": 0.24526627011399632, + "grad_norm": 0.2558995321875151, "learning_rate": 3.857142857142858e-05, - "loss": 0.4326, + "loss": 0.4325, "step": 429 }, { "epoch": 0.9207708779443254, - "grad_norm": 0.2782809072036015, + "grad_norm": 0.2810339034644166, "learning_rate": 3.853174603174604e-05, - "loss": 0.435, + "loss": 0.4333, "step": 430 }, { "epoch": 0.9229122055674518, - "grad_norm": 0.22743806598005653, + "grad_norm": 0.25199173622671855, "learning_rate": 3.84920634920635e-05, - "loss": 0.4522, + "loss": 0.4516, "step": 431 }, { "epoch": 0.9250535331905781, - "grad_norm": 0.2652959665516378, + "grad_norm": 0.27563768688654877, "learning_rate": 3.845238095238096e-05, - "loss": 0.4313, + "loss": 0.4306, "step": 432 }, { "epoch": 0.9271948608137045, - "grad_norm": 0.2320440083455923, + "grad_norm": 0.24180893154053762, "learning_rate": 3.841269841269842e-05, - "loss": 0.4336, + "loss": 0.4327, "step": 433 }, { "epoch": 0.9293361884368309, - "grad_norm": 0.22904699603839362, + "grad_norm": 0.232187215372479, "learning_rate": 3.837301587301588e-05, - "loss": 0.4398, + "loss": 0.4383, "step": 434 }, { "epoch": 0.9314775160599572, - "grad_norm": 0.28061696790352825, + "grad_norm": 0.2524306893162537, "learning_rate": 3.8333333333333334e-05, - "loss": 0.4398, + "loss": 0.4383, "step": 435 }, { "epoch": 0.9336188436830836, - "grad_norm": 0.22771552947703397, + "grad_norm": 0.21917495109126314, "learning_rate": 3.8293650793650795e-05, - "loss": 0.4427, + "loss": 0.442, "step": 436 }, { "epoch": 0.9357601713062098, - "grad_norm": 0.22379899530445632, + "grad_norm": 0.2491947970666118, "learning_rate": 3.8253968253968256e-05, - "loss": 0.4292, + "loss": 0.4284, "step": 437 }, { "epoch": 0.9379014989293362, - "grad_norm": 0.2705759563196105, + "grad_norm": 0.24928999737243862, "learning_rate": 3.821428571428572e-05, - "loss": 0.4381, + "loss": 0.4366, "step": 438 }, { "epoch": 0.9400428265524625, - "grad_norm": 0.25257412614253144, + "grad_norm": 0.2696841956340179, "learning_rate": 3.817460317460317e-05, - "loss": 0.4441, + "loss": 0.4426, "step": 439 }, { "epoch": 0.9421841541755889, - "grad_norm": 0.2230238602545094, + "grad_norm": 0.22630244921640252, "learning_rate": 3.813492063492063e-05, - "loss": 0.4382, + "loss": 0.4363, "step": 440 }, { "epoch": 0.9443254817987152, - "grad_norm": 0.2643425174467501, + "grad_norm": 0.2600201845565814, "learning_rate": 3.809523809523809e-05, - "loss": 0.4494, + "loss": 0.4484, "step": 441 }, { "epoch": 0.9464668094218416, - "grad_norm": 0.22879958232087208, + "grad_norm": 0.24580135509156412, "learning_rate": 3.805555555555555e-05, - "loss": 0.4686, + "loss": 0.4671, "step": 442 }, { "epoch": 0.9486081370449678, - "grad_norm": 0.3234801610814164, + "grad_norm": 0.29113072327829725, "learning_rate": 3.8015873015873014e-05, - "loss": 0.4533, + "loss": 0.451, "step": 443 }, { "epoch": 0.9507494646680942, - "grad_norm": 0.2238623404459414, + "grad_norm": 0.24308118122648267, "learning_rate": 3.7976190476190474e-05, - "loss": 0.4593, + "loss": 0.4564, "step": 444 }, { "epoch": 0.9528907922912205, - "grad_norm": 0.2854000014313601, + "grad_norm": 0.3071448001103864, "learning_rate": 3.7936507936507935e-05, - "loss": 0.4404, + "loss": 0.4397, "step": 445 }, { "epoch": 0.9550321199143469, - "grad_norm": 0.22982231285103497, + "grad_norm": 0.2420827295559477, "learning_rate": 3.7896825396825396e-05, - "loss": 0.447, + "loss": 0.4454, "step": 446 }, { "epoch": 0.9571734475374732, - "grad_norm": 0.22798223769442008, + "grad_norm": 0.23925961181164737, "learning_rate": 3.785714285714286e-05, - "loss": 0.4435, + "loss": 0.4422, "step": 447 }, { "epoch": 0.9593147751605996, - "grad_norm": 0.2698671605451589, + "grad_norm": 0.27857277637461475, "learning_rate": 3.781746031746032e-05, - "loss": 0.4468, + "loss": 0.4451, "step": 448 }, { "epoch": 0.961456102783726, - "grad_norm": 0.25052757718595015, + "grad_norm": 0.25387578048055814, "learning_rate": 3.777777777777778e-05, - "loss": 0.4559, + "loss": 0.454, "step": 449 }, { "epoch": 0.9635974304068522, - "grad_norm": 0.2509803927116757, + "grad_norm": 0.2365009172752972, "learning_rate": 3.773809523809524e-05, - "loss": 0.4485, + "loss": 0.4477, "step": 450 }, { "epoch": 0.9657387580299786, - "grad_norm": 0.24678486996154717, + "grad_norm": 0.24886378867096692, "learning_rate": 3.76984126984127e-05, - "loss": 0.4469, + "loss": 0.4455, "step": 451 }, { "epoch": 0.9678800856531049, - "grad_norm": 0.2297303050823692, + "grad_norm": 0.22709536457385576, "learning_rate": 3.765873015873016e-05, - "loss": 0.4427, + "loss": 0.4421, "step": 452 }, { "epoch": 0.9700214132762313, - "grad_norm": 0.24752427508051356, + "grad_norm": 0.24437838490375288, "learning_rate": 3.761904761904762e-05, - "loss": 0.4414, + "loss": 0.4407, "step": 453 }, { "epoch": 0.9721627408993576, - "grad_norm": 0.285731490530565, + "grad_norm": 0.25704107343861016, "learning_rate": 3.757936507936508e-05, - "loss": 0.4711, + "loss": 0.4688, "step": 454 }, { "epoch": 0.974304068522484, - "grad_norm": 0.2623040174515322, + "grad_norm": 0.22904949606864244, "learning_rate": 3.753968253968254e-05, - "loss": 0.4297, + "loss": 0.4293, "step": 455 }, { "epoch": 0.9764453961456103, - "grad_norm": 0.27326887949485235, + "grad_norm": 0.2849795459258746, "learning_rate": 3.7500000000000003e-05, - "loss": 0.4498, + "loss": 0.4462, "step": 456 }, { "epoch": 0.9785867237687366, - "grad_norm": 0.2603762615843794, + "grad_norm": 0.25874699145604835, "learning_rate": 3.7460317460317464e-05, - "loss": 0.4542, + "loss": 0.4533, "step": 457 }, { "epoch": 0.9807280513918629, - "grad_norm": 0.32975141148086856, + "grad_norm": 0.3152490582808027, "learning_rate": 3.7420634920634925e-05, - "loss": 0.4424, + "loss": 0.4407, "step": 458 }, { "epoch": 0.9828693790149893, - "grad_norm": 0.29153212143683266, + "grad_norm": 0.26375789665291616, "learning_rate": 3.7380952380952386e-05, - "loss": 0.4483, + "loss": 0.4465, "step": 459 }, { "epoch": 0.9850107066381156, - "grad_norm": 0.29992516288872756, + "grad_norm": 0.29544790745591465, "learning_rate": 3.7341269841269846e-05, - "loss": 0.4511, + "loss": 0.4488, "step": 460 }, { "epoch": 0.987152034261242, - "grad_norm": 0.29958162250392306, + "grad_norm": 0.2825194356752145, "learning_rate": 3.730158730158731e-05, - "loss": 0.4263, + "loss": 0.4254, "step": 461 }, { "epoch": 0.9892933618843683, - "grad_norm": 0.24964751706597657, + "grad_norm": 0.24723341454362188, "learning_rate": 3.726190476190476e-05, - "loss": 0.4324, + "loss": 0.4313, "step": 462 }, { "epoch": 0.9914346895074947, - "grad_norm": 0.2614912237652234, + "grad_norm": 0.23944247120858028, "learning_rate": 3.722222222222222e-05, - "loss": 0.4312, + "loss": 0.4297, "step": 463 }, { "epoch": 0.9935760171306209, - "grad_norm": 0.266348778726537, + "grad_norm": 0.25318330122355875, "learning_rate": 3.718253968253968e-05, - "loss": 0.4263, + "loss": 0.4248, "step": 464 }, { "epoch": 0.9957173447537473, - "grad_norm": 0.23725530062162245, + "grad_norm": 0.2385083732481541, "learning_rate": 3.7142857142857143e-05, - "loss": 0.4364, + "loss": 0.4342, "step": 465 }, { "epoch": 0.9978586723768736, - "grad_norm": 0.30914413313910843, + "grad_norm": 0.25267257737774884, "learning_rate": 3.7103174603174604e-05, - "loss": 0.4261, + "loss": 0.4244, "step": 466 }, { "epoch": 1.0, - "grad_norm": 0.23586300310874772, + "grad_norm": 0.26302764538521656, "learning_rate": 3.7063492063492065e-05, - "loss": 0.4211, + "loss": 0.4197, "step": 467 }, { "epoch": 1.0021413276231264, - "grad_norm": 0.3369816971461586, + "grad_norm": 0.3321143759595324, "learning_rate": 3.7023809523809526e-05, - "loss": 0.3762, + "loss": 0.3746, "step": 468 }, { "epoch": 1.0042826552462527, - "grad_norm": 0.22809858744604314, + "grad_norm": 0.2523859067388859, "learning_rate": 3.6984126984126986e-05, - "loss": 0.3646, + "loss": 0.3629, "step": 469 }, { "epoch": 1.006423982869379, - "grad_norm": 0.266143350315859, + "grad_norm": 0.2740491528188909, "learning_rate": 3.694444444444445e-05, - "loss": 0.3746, + "loss": 0.3739, "step": 470 }, { "epoch": 1.0085653104925054, - "grad_norm": 0.2884518377103039, + "grad_norm": 0.34376513245262397, "learning_rate": 3.690476190476191e-05, - "loss": 0.3666, + "loss": 0.366, "step": 471 }, { "epoch": 1.0107066381156318, - "grad_norm": 0.28068641149516393, + "grad_norm": 0.28059137656399624, "learning_rate": 3.686507936507937e-05, - "loss": 0.3743, + "loss": 0.3727, "step": 472 }, { "epoch": 1.0128479657387581, - "grad_norm": 0.29435491983752216, + "grad_norm": 0.33729133415206547, "learning_rate": 3.682539682539683e-05, - "loss": 0.3678, + "loss": 0.3657, "step": 473 }, { "epoch": 1.0149892933618843, - "grad_norm": 0.29581454893570663, + "grad_norm": 0.3201683000077876, "learning_rate": 3.678571428571429e-05, - "loss": 0.382, + "loss": 0.3802, "step": 474 }, { "epoch": 1.0171306209850106, - "grad_norm": 0.2567395711302861, + "grad_norm": 0.274999594299655, "learning_rate": 3.674603174603175e-05, - "loss": 0.3781, + "loss": 0.3772, "step": 475 }, { "epoch": 1.019271948608137, - "grad_norm": 0.30179003911878, + "grad_norm": 0.31109609046109504, "learning_rate": 3.6706349206349205e-05, - "loss": 0.3815, + "loss": 0.3795, "step": 476 }, { "epoch": 1.0214132762312633, - "grad_norm": 0.279742858383842, + "grad_norm": 0.2575665550459576, "learning_rate": 3.6666666666666666e-05, - "loss": 0.3777, + "loss": 0.376, "step": 477 }, { "epoch": 1.0235546038543897, - "grad_norm": 0.28030505005478956, + "grad_norm": 0.2937108429839069, "learning_rate": 3.6626984126984126e-05, - "loss": 0.3656, + "loss": 0.3638, "step": 478 }, { "epoch": 1.025695931477516, - "grad_norm": 0.2713443901873829, + "grad_norm": 0.2511369961460089, "learning_rate": 3.658730158730159e-05, - "loss": 0.374, + "loss": 0.3725, "step": 479 }, { "epoch": 1.0278372591006424, - "grad_norm": 0.2643392830096214, + "grad_norm": 0.2839057922395332, "learning_rate": 3.654761904761905e-05, - "loss": 0.3649, + "loss": 0.3627, "step": 480 }, { "epoch": 1.0299785867237687, - "grad_norm": 0.3008739451104665, + "grad_norm": 0.2920184281736407, "learning_rate": 3.650793650793651e-05, - "loss": 0.3822, + "loss": 0.3808, "step": 481 }, { "epoch": 1.032119914346895, - "grad_norm": 0.23862820873191215, + "grad_norm": 0.233470618770099, "learning_rate": 3.646825396825397e-05, - "loss": 0.3632, + "loss": 0.3617, "step": 482 }, { "epoch": 1.0342612419700214, - "grad_norm": 0.27869337578181386, + "grad_norm": 0.280577964771194, "learning_rate": 3.642857142857143e-05, - "loss": 0.3636, + "loss": 0.3625, "step": 483 }, { "epoch": 1.0364025695931478, - "grad_norm": 0.2533397054393399, + "grad_norm": 0.24608809205741292, "learning_rate": 3.638888888888889e-05, - "loss": 0.3666, + "loss": 0.3646, "step": 484 }, { "epoch": 1.0385438972162742, - "grad_norm": 0.21014394289523325, + "grad_norm": 0.2200793066660099, "learning_rate": 3.634920634920635e-05, - "loss": 0.3609, + "loss": 0.3593, "step": 485 }, { "epoch": 1.0406852248394005, - "grad_norm": 0.3283463606383022, + "grad_norm": 0.3327179418533381, "learning_rate": 3.630952380952381e-05, - "loss": 0.3939, + "loss": 0.3933, "step": 486 }, { "epoch": 1.0428265524625269, - "grad_norm": 0.22387699439354838, + "grad_norm": 0.22445951219430535, "learning_rate": 3.626984126984127e-05, - "loss": 0.3611, + "loss": 0.3591, "step": 487 }, { "epoch": 1.044967880085653, - "grad_norm": 0.2690411087151262, + "grad_norm": 0.2496038218726081, "learning_rate": 3.6230158730158734e-05, - "loss": 0.3869, + "loss": 0.3858, "step": 488 }, { "epoch": 1.0471092077087794, - "grad_norm": 0.3053411429357243, + "grad_norm": 0.31223282629773935, "learning_rate": 3.619047619047619e-05, - "loss": 0.3845, + "loss": 0.3833, "step": 489 }, { "epoch": 1.0492505353319057, - "grad_norm": 0.2631289951026371, + "grad_norm": 0.2500697247329698, "learning_rate": 3.615079365079365e-05, - "loss": 0.3698, + "loss": 0.3675, "step": 490 }, { "epoch": 1.051391862955032, - "grad_norm": 0.2670111664926503, + "grad_norm": 0.2569196171542971, "learning_rate": 3.611111111111111e-05, - "loss": 0.3692, + "loss": 0.3682, "step": 491 }, { "epoch": 1.0535331905781584, - "grad_norm": 0.2485103124351668, + "grad_norm": 0.2862772036703233, "learning_rate": 3.607142857142857e-05, - "loss": 0.3538, + "loss": 0.3521, "step": 492 }, { "epoch": 1.0556745182012848, - "grad_norm": 0.23420232693366563, + "grad_norm": 0.22001094989053036, "learning_rate": 3.603174603174603e-05, - "loss": 0.3756, + "loss": 0.3737, "step": 493 }, { "epoch": 1.0578158458244111, - "grad_norm": 0.23665666643547903, + "grad_norm": 0.25763962840222093, "learning_rate": 3.599206349206349e-05, - "loss": 0.3577, + "loss": 0.3559, "step": 494 }, { "epoch": 1.0599571734475375, - "grad_norm": 0.2681669094655506, + "grad_norm": 0.26453793828520467, "learning_rate": 3.595238095238095e-05, - "loss": 0.3785, + "loss": 0.3764, "step": 495 }, { "epoch": 1.0620985010706638, - "grad_norm": 0.22646726688527194, + "grad_norm": 0.24859096396107916, "learning_rate": 3.591269841269841e-05, - "loss": 0.3657, + "loss": 0.3646, "step": 496 }, { "epoch": 1.0642398286937902, - "grad_norm": 0.26622378410440317, + "grad_norm": 0.24879931824582785, "learning_rate": 3.5873015873015874e-05, - "loss": 0.3879, + "loss": 0.3856, "step": 497 }, { "epoch": 1.0663811563169165, - "grad_norm": 0.24041744536037002, + "grad_norm": 0.26070745567359305, "learning_rate": 3.5833333333333335e-05, - "loss": 0.3799, + "loss": 0.3776, "step": 498 }, { "epoch": 1.068522483940043, - "grad_norm": 0.27516627734803006, + "grad_norm": 0.25811232439255394, "learning_rate": 3.5793650793650795e-05, - "loss": 0.3689, + "loss": 0.3668, "step": 499 }, { "epoch": 1.0706638115631693, - "grad_norm": 0.23109318989193225, + "grad_norm": 0.23394126650129565, "learning_rate": 3.5753968253968256e-05, - "loss": 0.3662, + "loss": 0.3644, "step": 500 }, { "epoch": 1.0728051391862956, - "grad_norm": 0.2499488422329624, + "grad_norm": 0.28885516256092003, "learning_rate": 3.571428571428572e-05, - "loss": 0.3703, + "loss": 0.3697, "step": 501 }, { "epoch": 1.0749464668094217, - "grad_norm": 0.2706738165446803, + "grad_norm": 0.22243797367983625, "learning_rate": 3.567460317460318e-05, - "loss": 0.3641, + "loss": 0.3624, "step": 502 }, { "epoch": 1.077087794432548, - "grad_norm": 0.22012676483900637, + "grad_norm": 0.25222271031887367, "learning_rate": 3.563492063492064e-05, - "loss": 0.3736, + "loss": 0.3719, "step": 503 }, { "epoch": 1.0792291220556745, - "grad_norm": 0.2650545240192106, + "grad_norm": 0.26965537434104925, "learning_rate": 3.55952380952381e-05, - "loss": 0.398, + "loss": 0.3959, "step": 504 }, { "epoch": 1.0813704496788008, - "grad_norm": 0.2520843476038801, + "grad_norm": 0.24050297443124782, "learning_rate": 3.555555555555556e-05, - "loss": 0.3597, + "loss": 0.3581, "step": 505 }, { "epoch": 1.0835117773019272, - "grad_norm": 0.24197390108439093, + "grad_norm": 0.2687861602822808, "learning_rate": 3.551587301587302e-05, - "loss": 0.3764, + "loss": 0.3758, "step": 506 }, { "epoch": 1.0856531049250535, - "grad_norm": 0.24538696032664498, + "grad_norm": 0.2140853649825794, "learning_rate": 3.547619047619048e-05, - "loss": 0.3825, + "loss": 0.3806, "step": 507 }, { "epoch": 1.0877944325481799, - "grad_norm": 0.22888284257244346, + "grad_norm": 0.24365107193268598, "learning_rate": 3.543650793650794e-05, - "loss": 0.3745, + "loss": 0.3731, "step": 508 }, { "epoch": 1.0899357601713062, - "grad_norm": 0.24869463423729066, + "grad_norm": 0.24553711140882334, "learning_rate": 3.53968253968254e-05, - "loss": 0.3493, + "loss": 0.3478, "step": 509 }, { "epoch": 1.0920770877944326, - "grad_norm": 0.23993057946273721, + "grad_norm": 0.2489843525995639, "learning_rate": 3.5357142857142864e-05, - "loss": 0.3663, + "loss": 0.3639, "step": 510 }, { "epoch": 1.094218415417559, - "grad_norm": 0.2657625184616568, + "grad_norm": 0.2847640243532346, "learning_rate": 3.5317460317460324e-05, - "loss": 0.3668, + "loss": 0.3652, "step": 511 }, { "epoch": 1.0963597430406853, - "grad_norm": 0.2309814403862246, + "grad_norm": 0.24864833640916995, "learning_rate": 3.527777777777778e-05, - "loss": 0.3787, + "loss": 0.3771, "step": 512 }, { "epoch": 1.0985010706638116, - "grad_norm": 0.24234237491113314, + "grad_norm": 0.24597951146232283, "learning_rate": 3.523809523809524e-05, - "loss": 0.3493, + "loss": 0.3474, "step": 513 }, { "epoch": 1.100642398286938, - "grad_norm": 0.23974579149766023, + "grad_norm": 0.2710332839771354, "learning_rate": 3.51984126984127e-05, - "loss": 0.3641, + "loss": 0.3611, "step": 514 }, { "epoch": 1.1027837259100641, - "grad_norm": 0.24724340356035635, + "grad_norm": 0.24106875283916565, "learning_rate": 3.515873015873016e-05, - "loss": 0.3702, + "loss": 0.3694, "step": 515 }, { "epoch": 1.1049250535331905, - "grad_norm": 0.28135285517273, + "grad_norm": 0.25198007958486407, "learning_rate": 3.511904761904762e-05, - "loss": 0.3695, + "loss": 0.3676, "step": 516 }, { "epoch": 1.1070663811563168, - "grad_norm": 0.243348659130255, + "grad_norm": 0.285475860805922, "learning_rate": 3.5079365079365075e-05, - "loss": 0.3714, + "loss": 0.3694, "step": 517 }, { "epoch": 1.1092077087794432, - "grad_norm": 0.24745510283381558, + "grad_norm": 0.232641447262773, "learning_rate": 3.5039682539682536e-05, - "loss": 0.3565, + "loss": 0.3543, "step": 518 }, { "epoch": 1.1113490364025695, - "grad_norm": 0.2563919456656394, + "grad_norm": 0.26148277895318806, "learning_rate": 3.5e-05, - "loss": 0.3766, + "loss": 0.3737, "step": 519 }, { "epoch": 1.113490364025696, - "grad_norm": 0.22577716624985744, + "grad_norm": 0.2522296596506257, "learning_rate": 3.496031746031746e-05, - "loss": 0.3697, + "loss": 0.3685, "step": 520 }, { "epoch": 1.1156316916488223, - "grad_norm": 0.263849669081119, + "grad_norm": 0.24283436286680268, "learning_rate": 3.492063492063492e-05, - "loss": 0.3621, + "loss": 0.3607, "step": 521 }, { "epoch": 1.1177730192719486, - "grad_norm": 0.2578266915441593, + "grad_norm": 0.2546190850581306, "learning_rate": 3.488095238095238e-05, - "loss": 0.3781, + "loss": 0.379, "step": 522 }, { "epoch": 1.119914346895075, - "grad_norm": 0.267410790093275, + "grad_norm": 0.2695857257313346, "learning_rate": 3.484126984126984e-05, - "loss": 0.3802, + "loss": 0.3791, "step": 523 }, { "epoch": 1.1220556745182013, - "grad_norm": 0.26720246872107734, + "grad_norm": 0.26122674703315873, "learning_rate": 3.48015873015873e-05, - "loss": 0.3686, + "loss": 0.3654, "step": 524 }, { "epoch": 1.1241970021413277, - "grad_norm": 0.22844067281982589, + "grad_norm": 0.23999078841945964, "learning_rate": 3.476190476190476e-05, - "loss": 0.3892, + "loss": 0.3876, "step": 525 }, { "epoch": 1.126338329764454, - "grad_norm": 0.2522676842739371, + "grad_norm": 0.27711890693427227, "learning_rate": 3.472222222222222e-05, - "loss": 0.3695, + "loss": 0.3678, "step": 526 }, { "epoch": 1.1284796573875804, - "grad_norm": 0.20988616282501774, + "grad_norm": 0.22153222863862254, "learning_rate": 3.468253968253968e-05, - "loss": 0.3705, + "loss": 0.3695, "step": 527 }, { "epoch": 1.1306209850107067, - "grad_norm": 0.2452290096455659, + "grad_norm": 0.2367741399209089, "learning_rate": 3.4642857142857144e-05, - "loss": 0.3643, + "loss": 0.3619, "step": 528 }, { "epoch": 1.132762312633833, - "grad_norm": 0.20376413452570455, + "grad_norm": 0.2279683733856979, "learning_rate": 3.4603174603174604e-05, - "loss": 0.3621, + "loss": 0.3602, "step": 529 }, { "epoch": 1.1349036402569592, - "grad_norm": 0.22961373150459624, + "grad_norm": 0.21797256781445612, "learning_rate": 3.4563492063492065e-05, - "loss": 0.355, + "loss": 0.3538, "step": 530 }, { "epoch": 1.1370449678800856, - "grad_norm": 0.2430831935717076, + "grad_norm": 0.23243584252507798, "learning_rate": 3.4523809523809526e-05, - "loss": 0.3689, + "loss": 0.3676, "step": 531 }, { "epoch": 1.139186295503212, - "grad_norm": 0.2017982939822363, + "grad_norm": 0.22337304786840087, "learning_rate": 3.448412698412699e-05, - "loss": 0.3508, + "loss": 0.35, "step": 532 }, { "epoch": 1.1413276231263383, - "grad_norm": 0.26746451866780874, + "grad_norm": 0.2723870117088924, "learning_rate": 3.444444444444445e-05, - "loss": 0.3695, + "loss": 0.3681, "step": 533 }, { "epoch": 1.1434689507494646, - "grad_norm": 0.24736040419305078, + "grad_norm": 0.23683938267697505, "learning_rate": 3.440476190476191e-05, - "loss": 0.3604, + "loss": 0.3584, "step": 534 }, { "epoch": 1.145610278372591, - "grad_norm": 0.23139512163454942, + "grad_norm": 0.27263835901881306, "learning_rate": 3.436507936507937e-05, - "loss": 0.3597, + "loss": 0.3585, "step": 535 }, { "epoch": 1.1477516059957173, - "grad_norm": 0.24151228114318973, + "grad_norm": 0.2413247019766125, "learning_rate": 3.432539682539683e-05, - "loss": 0.3722, + "loss": 0.3715, "step": 536 }, { "epoch": 1.1498929336188437, - "grad_norm": 0.225767578077691, + "grad_norm": 0.24835573947131598, "learning_rate": 3.428571428571429e-05, - "loss": 0.3852, + "loss": 0.3834, "step": 537 }, { "epoch": 1.15203426124197, - "grad_norm": 0.2379372088035776, + "grad_norm": 0.24833529249782688, "learning_rate": 3.424603174603175e-05, - "loss": 0.377, + "loss": 0.3753, "step": 538 }, { "epoch": 1.1541755888650964, - "grad_norm": 0.22031199802513296, + "grad_norm": 0.2271111061767862, "learning_rate": 3.420634920634921e-05, - "loss": 0.3767, + "loss": 0.3751, "step": 539 }, { "epoch": 1.1563169164882228, - "grad_norm": 0.23132051552867036, + "grad_norm": 0.2432938466019091, "learning_rate": 3.4166666666666666e-05, - "loss": 0.3528, + "loss": 0.3521, "step": 540 }, { "epoch": 1.1584582441113491, - "grad_norm": 0.24257151344704514, + "grad_norm": 0.2458707748193166, "learning_rate": 3.412698412698413e-05, - "loss": 0.3595, + "loss": 0.3583, "step": 541 }, { "epoch": 1.1605995717344753, - "grad_norm": 0.2261792555412017, + "grad_norm": 0.23943667615924907, "learning_rate": 3.408730158730159e-05, - "loss": 0.3633, + "loss": 0.3623, "step": 542 }, { "epoch": 1.1627408993576016, - "grad_norm": 0.2829561917904997, + "grad_norm": 0.29767722341183855, "learning_rate": 3.404761904761905e-05, - "loss": 0.3648, + "loss": 0.3624, "step": 543 }, { "epoch": 1.164882226980728, - "grad_norm": 0.25098056566365634, + "grad_norm": 0.2643286960690453, "learning_rate": 3.400793650793651e-05, - "loss": 0.3702, + "loss": 0.3688, "step": 544 }, { "epoch": 1.1670235546038543, - "grad_norm": 0.24644300251595483, + "grad_norm": 0.261706842552651, "learning_rate": 3.396825396825397e-05, - "loss": 0.3641, + "loss": 0.363, "step": 545 }, { "epoch": 1.1691648822269807, - "grad_norm": 0.23310081121807147, + "grad_norm": 0.22632083311117027, "learning_rate": 3.392857142857143e-05, - "loss": 0.3724, + "loss": 0.37, "step": 546 }, { "epoch": 1.171306209850107, - "grad_norm": 0.2396918335180637, + "grad_norm": 0.24969845223117051, "learning_rate": 3.388888888888889e-05, - "loss": 0.3724, + "loss": 0.371, "step": 547 }, { "epoch": 1.1734475374732334, - "grad_norm": 0.26305119086996215, + "grad_norm": 0.2653686082584629, "learning_rate": 3.384920634920635e-05, - "loss": 0.3663, + "loss": 0.3641, "step": 548 }, { "epoch": 1.1755888650963597, - "grad_norm": 0.23766315816254158, + "grad_norm": 0.26484419864560405, "learning_rate": 3.380952380952381e-05, - "loss": 0.3543, + "loss": 0.3532, "step": 549 }, { "epoch": 1.177730192719486, - "grad_norm": 0.24791689978321688, + "grad_norm": 0.23731845700258258, "learning_rate": 3.3769841269841273e-05, - "loss": 0.3858, + "loss": 0.3844, "step": 550 }, { "epoch": 1.1798715203426124, - "grad_norm": 0.2427021752634518, + "grad_norm": 0.26369894447851056, "learning_rate": 3.3730158730158734e-05, - "loss": 0.3544, + "loss": 0.3535, "step": 551 }, { "epoch": 1.1820128479657388, - "grad_norm": 0.2524229855590501, + "grad_norm": 0.25523841664126234, "learning_rate": 3.3690476190476195e-05, - "loss": 0.3932, + "loss": 0.3915, "step": 552 }, { "epoch": 1.1841541755888652, - "grad_norm": 0.2208689079418904, + "grad_norm": 0.21550220946043203, "learning_rate": 3.3650793650793656e-05, - "loss": 0.3638, + "loss": 0.3623, "step": 553 }, { "epoch": 1.1862955032119915, - "grad_norm": 0.24091697133765969, + "grad_norm": 0.24110638888192076, "learning_rate": 3.3611111111111116e-05, - "loss": 0.351, + "loss": 0.3499, "step": 554 }, { "epoch": 1.1884368308351179, - "grad_norm": 0.24197830327776518, + "grad_norm": 0.22464037830444605, "learning_rate": 3.357142857142857e-05, - "loss": 0.3596, + "loss": 0.3578, "step": 555 }, { "epoch": 1.1905781584582442, - "grad_norm": 0.19888129082318382, + "grad_norm": 0.21226847946734964, "learning_rate": 3.353174603174603e-05, - "loss": 0.3643, + "loss": 0.3634, "step": 556 }, { "epoch": 1.1927194860813706, - "grad_norm": 0.26369169905142914, + "grad_norm": 0.2626157214230889, "learning_rate": 3.349206349206349e-05, - "loss": 0.3605, + "loss": 0.3589, "step": 557 }, { "epoch": 1.1948608137044967, - "grad_norm": 0.2532299216339328, + "grad_norm": 0.24071856058212684, "learning_rate": 3.345238095238095e-05, - "loss": 0.3878, + "loss": 0.3863, "step": 558 }, { "epoch": 1.197002141327623, - "grad_norm": 0.23397198291097293, + "grad_norm": 0.2508127197520457, "learning_rate": 3.3412698412698413e-05, - "loss": 0.3892, + "loss": 0.386, "step": 559 }, { "epoch": 1.1991434689507494, - "grad_norm": 0.3051071879009391, + "grad_norm": 0.30389459018728837, "learning_rate": 3.3373015873015874e-05, - "loss": 0.3857, + "loss": 0.3833, "step": 560 }, { "epoch": 1.2012847965738758, - "grad_norm": 0.2415676269852608, + "grad_norm": 0.25210041774328223, "learning_rate": 3.3333333333333335e-05, - "loss": 0.3529, + "loss": 0.3517, "step": 561 }, { "epoch": 1.2034261241970021, - "grad_norm": 0.2156997194979025, + "grad_norm": 0.23500359192291567, "learning_rate": 3.3293650793650796e-05, - "loss": 0.3618, + "loss": 0.3605, "step": 562 }, { "epoch": 1.2055674518201285, - "grad_norm": 0.24244616134298602, + "grad_norm": 0.2546074299402884, "learning_rate": 3.3253968253968256e-05, - "loss": 0.3528, + "loss": 0.3516, "step": 563 }, { "epoch": 1.2077087794432548, - "grad_norm": 0.2675996346388178, + "grad_norm": 0.2709353855529332, "learning_rate": 3.321428571428572e-05, - "loss": 0.3716, + "loss": 0.3701, "step": 564 }, { "epoch": 1.2098501070663812, - "grad_norm": 0.22977064909622621, + "grad_norm": 0.2518216621854237, "learning_rate": 3.317460317460318e-05, - "loss": 0.3651, + "loss": 0.3652, "step": 565 }, { "epoch": 1.2119914346895075, - "grad_norm": 0.22289828933891506, + "grad_norm": 0.23203813858204622, "learning_rate": 3.313492063492064e-05, - "loss": 0.3578, + "loss": 0.357, "step": 566 }, { "epoch": 1.214132762312634, - "grad_norm": 0.24907945525188446, + "grad_norm": 0.27322569967364363, "learning_rate": 3.309523809523809e-05, - "loss": 0.3803, + "loss": 0.3795, "step": 567 }, { "epoch": 1.2162740899357602, - "grad_norm": 0.21260828983218635, + "grad_norm": 0.21862325914700695, "learning_rate": 3.3055555555555553e-05, - "loss": 0.3823, + "loss": 0.3803, "step": 568 }, { "epoch": 1.2184154175588866, - "grad_norm": 0.2444036303892165, + "grad_norm": 0.23817063938434127, "learning_rate": 3.3015873015873014e-05, - "loss": 0.3873, + "loss": 0.3864, "step": 569 }, { "epoch": 1.2205567451820127, - "grad_norm": 0.27044980470468927, + "grad_norm": 0.30402766988609314, "learning_rate": 3.2976190476190475e-05, - "loss": 0.3796, + "loss": 0.3784, "step": 570 }, { "epoch": 1.222698072805139, - "grad_norm": 0.23806387274834903, + "grad_norm": 0.23766830428808083, "learning_rate": 3.2936507936507936e-05, - "loss": 0.3606, + "loss": 0.3588, "step": 571 }, { "epoch": 1.2248394004282654, - "grad_norm": 0.2541644994496726, + "grad_norm": 0.25852053977967815, "learning_rate": 3.2896825396825396e-05, - "loss": 0.3795, + "loss": 0.3781, "step": 572 }, { "epoch": 1.2269807280513918, - "grad_norm": 0.2726575491057458, + "grad_norm": 0.292675984396594, "learning_rate": 3.285714285714286e-05, - "loss": 0.3806, + "loss": 0.3791, "step": 573 }, { "epoch": 1.2291220556745182, - "grad_norm": 0.23105651670608096, + "grad_norm": 0.22578595387659117, "learning_rate": 3.281746031746032e-05, - "loss": 0.3692, + "loss": 0.368, "step": 574 }, { "epoch": 1.2312633832976445, - "grad_norm": 0.23853407830325385, + "grad_norm": 0.24728132039412187, "learning_rate": 3.277777777777778e-05, - "loss": 0.3701, + "loss": 0.3696, "step": 575 }, { "epoch": 1.2334047109207709, - "grad_norm": 0.2550243654541794, + "grad_norm": 0.27218412013650006, "learning_rate": 3.273809523809524e-05, - "loss": 0.3696, + "loss": 0.3688, "step": 576 }, { "epoch": 1.2355460385438972, - "grad_norm": 0.23511204104744063, + "grad_norm": 0.23158465192272315, "learning_rate": 3.26984126984127e-05, - "loss": 0.3853, + "loss": 0.3842, "step": 577 }, { "epoch": 1.2376873661670236, - "grad_norm": 0.26214275864155384, + "grad_norm": 0.27482920742913025, "learning_rate": 3.265873015873016e-05, - "loss": 0.3678, + "loss": 0.3663, "step": 578 }, { "epoch": 1.23982869379015, - "grad_norm": 0.23149724483459302, + "grad_norm": 0.25920426583780776, "learning_rate": 3.261904761904762e-05, - "loss": 0.3584, + "loss": 0.3571, "step": 579 }, { "epoch": 1.2419700214132763, - "grad_norm": 0.2085157601025498, + "grad_norm": 0.2032778358032685, "learning_rate": 3.257936507936508e-05, - "loss": 0.3497, + "loss": 0.3488, "step": 580 }, { "epoch": 1.2441113490364026, - "grad_norm": 0.24198910705782214, + "grad_norm": 0.2549778314308203, "learning_rate": 3.253968253968254e-05, - "loss": 0.3712, + "loss": 0.37, "step": 581 }, { "epoch": 1.246252676659529, - "grad_norm": 0.23194060619444556, + "grad_norm": 0.25860176335450585, "learning_rate": 3.2500000000000004e-05, - "loss": 0.3653, + "loss": 0.3633, "step": 582 }, { "epoch": 1.2483940042826553, - "grad_norm": 0.21932550329816566, + "grad_norm": 0.21568263900821674, "learning_rate": 3.2460317460317465e-05, - "loss": 0.36, + "loss": 0.3594, "step": 583 }, { "epoch": 1.2505353319057817, - "grad_norm": 0.2485536260156264, + "grad_norm": 0.25190898114681015, "learning_rate": 3.2420634920634925e-05, - "loss": 0.3557, + "loss": 0.3543, "step": 584 }, { "epoch": 1.252676659528908, - "grad_norm": 0.21298281815390674, + "grad_norm": 0.21237788073269723, "learning_rate": 3.2380952380952386e-05, - "loss": 0.3768, + "loss": 0.3755, "step": 585 }, { "epoch": 1.2548179871520342, - "grad_norm": 0.2308383054301672, + "grad_norm": 0.22878119129492097, "learning_rate": 3.234126984126985e-05, - "loss": 0.3613, + "loss": 0.3598, "step": 586 }, { "epoch": 1.2569593147751605, - "grad_norm": 0.2500705573168694, + "grad_norm": 0.24998274367059609, "learning_rate": 3.230158730158731e-05, - "loss": 0.3672, + "loss": 0.3671, "step": 587 }, { "epoch": 1.259100642398287, - "grad_norm": 0.24932919856158853, + "grad_norm": 0.24248609005355534, "learning_rate": 3.226190476190477e-05, - "loss": 0.3664, + "loss": 0.365, "step": 588 }, { "epoch": 1.2612419700214133, - "grad_norm": 0.21114923566401123, + "grad_norm": 0.2502819042148945, "learning_rate": 3.222222222222223e-05, "loss": 0.3654, "step": 589 }, { "epoch": 1.2633832976445396, - "grad_norm": 0.26655876159854425, + "grad_norm": 0.24428764588220775, "learning_rate": 3.218253968253968e-05, - "loss": 0.376, + "loss": 0.375, "step": 590 }, { "epoch": 1.265524625267666, - "grad_norm": 0.22494122561905386, + "grad_norm": 0.24064115102187758, "learning_rate": 3.2142857142857144e-05, - "loss": 0.366, + "loss": 0.3646, "step": 591 }, { "epoch": 1.2676659528907923, - "grad_norm": 0.24850515269385678, + "grad_norm": 0.2580980916190992, "learning_rate": 3.2103174603174605e-05, - "loss": 0.3814, + "loss": 0.3805, "step": 592 }, { "epoch": 1.2698072805139187, - "grad_norm": 0.2359476164199983, + "grad_norm": 0.21389149883559067, "learning_rate": 3.2063492063492065e-05, - "loss": 0.3876, + "loss": 0.3854, "step": 593 }, { "epoch": 1.271948608137045, - "grad_norm": 0.2170060923253427, + "grad_norm": 0.23647472042052045, "learning_rate": 3.202380952380952e-05, - "loss": 0.3662, + "loss": 0.3645, "step": 594 }, { "epoch": 1.2740899357601714, - "grad_norm": 0.2504062463730666, + "grad_norm": 0.23724295037763593, "learning_rate": 3.198412698412698e-05, - "loss": 0.3753, + "loss": 0.3735, "step": 595 }, { "epoch": 1.2762312633832975, - "grad_norm": 0.20106817241986427, + "grad_norm": 0.22413428930081114, "learning_rate": 3.194444444444444e-05, - "loss": 0.3592, + "loss": 0.359, "step": 596 }, { "epoch": 1.2783725910064239, - "grad_norm": 0.21200714791632813, + "grad_norm": 0.2223223372100241, "learning_rate": 3.19047619047619e-05, - "loss": 0.3509, + "loss": 0.3491, "step": 597 }, { "epoch": 1.2805139186295502, - "grad_norm": 0.21778732718260096, + "grad_norm": 0.21897487856488682, "learning_rate": 3.186507936507936e-05, - "loss": 0.3584, + "loss": 0.357, "step": 598 }, { "epoch": 1.2826552462526766, - "grad_norm": 0.2189934520948945, + "grad_norm": 0.2502304419567601, "learning_rate": 3.182539682539682e-05, - "loss": 0.3549, + "loss": 0.3536, "step": 599 }, { "epoch": 1.284796573875803, - "grad_norm": 0.2208172986950504, + "grad_norm": 0.2253482204551222, "learning_rate": 3.1785714285714284e-05, - "loss": 0.3571, + "loss": 0.3558, "step": 600 }, { "epoch": 1.2869379014989293, - "grad_norm": 0.22576417970064774, + "grad_norm": 0.24613177754827212, "learning_rate": 3.1746031746031745e-05, - "loss": 0.3525, + "loss": 0.3515, "step": 601 }, { "epoch": 1.2890792291220556, - "grad_norm": 0.27692988742074415, + "grad_norm": 0.2523575221500364, "learning_rate": 3.1706349206349205e-05, - "loss": 0.3764, + "loss": 0.3741, "step": 602 }, { "epoch": 1.291220556745182, - "grad_norm": 0.21396523695563535, + "grad_norm": 0.2165503319453962, "learning_rate": 3.1666666666666666e-05, - "loss": 0.3733, + "loss": 0.3727, "step": 603 }, { "epoch": 1.2933618843683083, - "grad_norm": 0.23416094155920952, + "grad_norm": 0.21823243034291184, "learning_rate": 3.162698412698413e-05, - "loss": 0.3559, + "loss": 0.3533, "step": 604 }, { "epoch": 1.2955032119914347, - "grad_norm": 0.2486860214774819, + "grad_norm": 0.24997589624196248, "learning_rate": 3.158730158730159e-05, - "loss": 0.3562, + "loss": 0.3542, "step": 605 }, { "epoch": 1.297644539614561, - "grad_norm": 0.2201774327190655, + "grad_norm": 0.2398097391159386, "learning_rate": 3.154761904761905e-05, - "loss": 0.3716, + "loss": 0.3697, "step": 606 }, { "epoch": 1.2997858672376874, - "grad_norm": 0.1964840066863156, + "grad_norm": 0.20653894840882975, "learning_rate": 3.150793650793651e-05, - "loss": 0.3542, + "loss": 0.3541, "step": 607 }, { "epoch": 1.3019271948608138, - "grad_norm": 0.22767951673276937, + "grad_norm": 0.24435984092571286, "learning_rate": 3.146825396825397e-05, - "loss": 0.3612, + "loss": 0.3596, "step": 608 }, { "epoch": 1.3040685224839401, - "grad_norm": 0.22557177823924723, + "grad_norm": 0.2542985397543424, "learning_rate": 3.142857142857143e-05, - "loss": 0.3603, + "loss": 0.3583, "step": 609 }, { "epoch": 1.3062098501070665, - "grad_norm": 0.27527013455606647, + "grad_norm": 0.20683543774079505, "learning_rate": 3.138888888888889e-05, - "loss": 0.3502, + "loss": 0.3483, "step": 610 }, { "epoch": 1.3083511777301928, - "grad_norm": 0.23325579262107884, + "grad_norm": 0.2711230309464256, "learning_rate": 3.134920634920635e-05, - "loss": 0.3631, + "loss": 0.3616, "step": 611 }, { "epoch": 1.3104925053533192, - "grad_norm": 0.22097147014315746, + "grad_norm": 0.24151563936505477, "learning_rate": 3.130952380952381e-05, - "loss": 0.3511, + "loss": 0.3498, "step": 612 }, { "epoch": 1.3126338329764453, - "grad_norm": 0.2174864072196349, + "grad_norm": 0.2369073543305672, "learning_rate": 3.1269841269841274e-05, - "loss": 0.3672, + "loss": 0.366, "step": 613 }, { "epoch": 1.3147751605995717, - "grad_norm": 0.2106746804771801, + "grad_norm": 0.24227771996000944, "learning_rate": 3.1230158730158734e-05, - "loss": 0.3612, + "loss": 0.3599, "step": 614 }, { "epoch": 1.316916488222698, - "grad_norm": 0.21088050552854007, + "grad_norm": 0.2221305104163018, "learning_rate": 3.1190476190476195e-05, - "loss": 0.3471, + "loss": 0.3456, "step": 615 }, { "epoch": 1.3190578158458244, - "grad_norm": 0.2580125309103289, + "grad_norm": 0.2745562820217464, "learning_rate": 3.1150793650793656e-05, - "loss": 0.3741, + "loss": 0.373, "step": 616 }, { "epoch": 1.3211991434689507, - "grad_norm": 0.21505898974697335, + "grad_norm": 0.223755905789405, "learning_rate": 3.111111111111111e-05, - "loss": 0.3763, + "loss": 0.3743, "step": 617 }, { "epoch": 1.323340471092077, - "grad_norm": 0.2465748563264946, + "grad_norm": 0.2540980055756857, "learning_rate": 3.107142857142857e-05, - "loss": 0.3766, + "loss": 0.3752, "step": 618 }, { "epoch": 1.3254817987152034, - "grad_norm": 0.3321013586284405, + "grad_norm": 0.31394476060654153, "learning_rate": 3.103174603174603e-05, - "loss": 0.3955, + "loss": 0.3918, "step": 619 }, { "epoch": 1.3276231263383298, - "grad_norm": 0.22495906786878164, + "grad_norm": 0.2326066740490904, "learning_rate": 3.099206349206349e-05, - "loss": 0.3718, + "loss": 0.3697, "step": 620 }, { "epoch": 1.3297644539614561, - "grad_norm": 0.23719074429579814, + "grad_norm": 0.24392074483576617, "learning_rate": 3.095238095238095e-05, - "loss": 0.3544, + "loss": 0.3522, "step": 621 }, { "epoch": 1.3319057815845825, - "grad_norm": 0.23831973310054141, + "grad_norm": 0.25390065497286196, "learning_rate": 3.0912698412698414e-05, - "loss": 0.3795, + "loss": 0.3757, "step": 622 }, { "epoch": 1.3340471092077089, - "grad_norm": 0.2128688128559845, + "grad_norm": 0.20396843300419068, "learning_rate": 3.0873015873015874e-05, - "loss": 0.3668, + "loss": 0.3651, "step": 623 }, { "epoch": 1.336188436830835, - "grad_norm": 0.22961487193589208, + "grad_norm": 0.22881137081512465, "learning_rate": 3.0833333333333335e-05, - "loss": 0.366, + "loss": 0.3638, "step": 624 }, { "epoch": 1.3383297644539613, - "grad_norm": 0.2214674128035989, + "grad_norm": 0.23141190116614863, "learning_rate": 3.0793650793650796e-05, - "loss": 0.365, + "loss": 0.3617, "step": 625 }, { "epoch": 1.3404710920770877, - "grad_norm": 0.20498199401783132, + "grad_norm": 0.21197621308607237, "learning_rate": 3.075396825396826e-05, - "loss": 0.3556, + "loss": 0.3533, "step": 626 }, { "epoch": 1.342612419700214, - "grad_norm": 0.22359296981746604, + "grad_norm": 0.22886167158750198, "learning_rate": 3.071428571428572e-05, - "loss": 0.3812, + "loss": 0.3798, "step": 627 }, { "epoch": 1.3447537473233404, - "grad_norm": 0.21653621158585087, + "grad_norm": 0.21750304837930745, "learning_rate": 3.067460317460318e-05, - "loss": 0.3797, + "loss": 0.3781, "step": 628 }, { "epoch": 1.3468950749464668, - "grad_norm": 0.22549871106519903, + "grad_norm": 0.21938329417465666, "learning_rate": 3.063492063492064e-05, - "loss": 0.3596, + "loss": 0.3586, "step": 629 }, { "epoch": 1.3490364025695931, - "grad_norm": 0.20475922109140837, + "grad_norm": 0.21402476475138535, "learning_rate": 3.05952380952381e-05, - "loss": 0.3735, + "loss": 0.3716, "step": 630 }, { "epoch": 1.3511777301927195, - "grad_norm": 0.209763601459537, + "grad_norm": 0.20938404630600568, "learning_rate": 3.055555555555556e-05, - "loss": 0.3709, + "loss": 0.37, "step": 631 }, { "epoch": 1.3533190578158458, - "grad_norm": 0.2176748492097036, + "grad_norm": 0.21383889850254142, "learning_rate": 3.051587301587302e-05, - "loss": 0.3808, + "loss": 0.3792, "step": 632 }, { "epoch": 1.3554603854389722, - "grad_norm": 0.21622906752623547, + "grad_norm": 0.22055010594265703, "learning_rate": 3.0476190476190482e-05, - "loss": 0.3604, + "loss": 0.3589, "step": 633 }, { "epoch": 1.3576017130620985, - "grad_norm": 0.20063962917481207, + "grad_norm": 0.19487907168624696, "learning_rate": 3.0436507936507936e-05, - "loss": 0.3546, + "loss": 0.3535, "step": 634 }, { "epoch": 1.359743040685225, - "grad_norm": 0.20999519152589322, + "grad_norm": 0.22306820784789425, "learning_rate": 3.0396825396825397e-05, - "loss": 0.3597, + "loss": 0.3572, "step": 635 }, { "epoch": 1.3618843683083512, - "grad_norm": 0.20927897537878964, + "grad_norm": 0.2229559913435905, "learning_rate": 3.0357142857142857e-05, - "loss": 0.3729, + "loss": 0.3707, "step": 636 }, { "epoch": 1.3640256959314776, - "grad_norm": 0.21686713042567732, + "grad_norm": 0.2024293507877544, "learning_rate": 3.0317460317460318e-05, - "loss": 0.3653, + "loss": 0.3638, "step": 637 }, { "epoch": 1.366167023554604, - "grad_norm": 0.21754952121551016, + "grad_norm": 0.23534992817477, "learning_rate": 3.0277777777777776e-05, - "loss": 0.3677, + "loss": 0.365, "step": 638 }, { "epoch": 1.3683083511777303, - "grad_norm": 0.20450902968020848, + "grad_norm": 0.22873997219956177, "learning_rate": 3.0238095238095236e-05, - "loss": 0.3355, + "loss": 0.3338, "step": 639 }, { "epoch": 1.3704496788008567, - "grad_norm": 0.21623695248881408, + "grad_norm": 0.2211599162133884, "learning_rate": 3.0198412698412697e-05, - "loss": 0.3806, + "loss": 0.3784, "step": 640 }, { "epoch": 1.3725910064239828, - "grad_norm": 0.21537383605412733, + "grad_norm": 0.22100903081850962, "learning_rate": 3.0158730158730158e-05, - "loss": 0.3681, + "loss": 0.3677, "step": 641 }, { "epoch": 1.3747323340471092, - "grad_norm": 0.19837937617514323, + "grad_norm": 0.20600842295123745, "learning_rate": 3.011904761904762e-05, - "loss": 0.3622, + "loss": 0.361, "step": 642 }, { "epoch": 1.3768736616702355, - "grad_norm": 0.209108602319792, + "grad_norm": 0.22980010113385077, "learning_rate": 3.007936507936508e-05, - "loss": 0.3484, + "loss": 0.3465, "step": 643 }, { "epoch": 1.3790149892933619, - "grad_norm": 0.2062194560604121, + "grad_norm": 0.21464631919266589, "learning_rate": 3.003968253968254e-05, - "loss": 0.3702, + "loss": 0.3681, "step": 644 }, { "epoch": 1.3811563169164882, - "grad_norm": 0.2080771057237384, + "grad_norm": 0.2103030135241598, "learning_rate": 3e-05, - "loss": 0.362, + "loss": 0.3603, "step": 645 }, { "epoch": 1.3832976445396146, - "grad_norm": 0.2227326233506996, + "grad_norm": 0.23432239498652505, "learning_rate": 2.996031746031746e-05, - "loss": 0.3701, + "loss": 0.3676, "step": 646 }, { "epoch": 1.385438972162741, - "grad_norm": 0.2089744724523155, + "grad_norm": 0.21783149705335722, "learning_rate": 2.9920634920634922e-05, - "loss": 0.361, + "loss": 0.3588, "step": 647 }, { "epoch": 1.3875802997858673, - "grad_norm": 0.25404638727035117, + "grad_norm": 0.274165710942849, "learning_rate": 2.9880952380952383e-05, - "loss": 0.3629, + "loss": 0.361, "step": 648 }, { "epoch": 1.3897216274089936, - "grad_norm": 0.21442147488950894, + "grad_norm": 0.21073344703205954, "learning_rate": 2.9841269841269844e-05, - "loss": 0.3513, + "loss": 0.3497, "step": 649 }, { "epoch": 1.39186295503212, - "grad_norm": 0.2481654405500989, + "grad_norm": 0.2501430083399487, "learning_rate": 2.98015873015873e-05, - "loss": 0.3662, + "loss": 0.3634, "step": 650 }, { "epoch": 1.3940042826552461, - "grad_norm": 0.22382398124617614, + "grad_norm": 0.21899163049205936, "learning_rate": 2.9761904761904762e-05, - "loss": 0.3687, + "loss": 0.3679, "step": 651 }, { "epoch": 1.3961456102783725, - "grad_norm": 0.31356212838373204, + "grad_norm": 0.294173480692159, "learning_rate": 2.9722222222222223e-05, - "loss": 0.3795, + "loss": 0.3826, "step": 652 }, { "epoch": 1.3982869379014988, - "grad_norm": 0.2429332621043318, + "grad_norm": 0.2482472851407277, "learning_rate": 2.9682539682539683e-05, - "loss": 0.373, + "loss": 0.3713, "step": 653 }, { "epoch": 1.4004282655246252, - "grad_norm": 0.2249979699253431, + "grad_norm": 0.24006701265887465, "learning_rate": 2.9642857142857144e-05, - "loss": 0.3676, + "loss": 0.366, "step": 654 }, { "epoch": 1.4025695931477515, - "grad_norm": 0.2560531887036681, + "grad_norm": 0.2591909670041966, "learning_rate": 2.9603174603174605e-05, - "loss": 0.3637, + "loss": 0.3623, "step": 655 }, { "epoch": 1.404710920770878, - "grad_norm": 0.24116508821199556, + "grad_norm": 0.22731690936226825, "learning_rate": 2.9563492063492066e-05, - "loss": 0.3707, + "loss": 0.3691, "step": 656 }, { "epoch": 1.4068522483940042, - "grad_norm": 0.2598274205324474, + "grad_norm": 0.2536340951226291, "learning_rate": 2.9523809523809526e-05, - "loss": 0.379, + "loss": 0.3774, "step": 657 }, { "epoch": 1.4089935760171306, - "grad_norm": 0.21575521660148642, + "grad_norm": 0.22286664711972803, "learning_rate": 2.9484126984126987e-05, - "loss": 0.376, + "loss": 0.3743, "step": 658 }, { "epoch": 1.411134903640257, - "grad_norm": 0.22041851374967822, + "grad_norm": 0.23183154595936484, "learning_rate": 2.9444444444444448e-05, - "loss": 0.3607, + "loss": 0.3598, "step": 659 }, { "epoch": 1.4132762312633833, - "grad_norm": 0.2564088225056767, + "grad_norm": 0.25169102444114877, "learning_rate": 2.940476190476191e-05, - "loss": 0.3599, + "loss": 0.3589, "step": 660 }, { "epoch": 1.4154175588865097, - "grad_norm": 0.20549406527051656, + "grad_norm": 0.21476384262074055, "learning_rate": 2.9365079365079366e-05, - "loss": 0.3657, + "loss": 0.3652, "step": 661 }, { "epoch": 1.417558886509636, - "grad_norm": 0.2540118149516082, + "grad_norm": 0.25197060268479204, "learning_rate": 2.9325396825396827e-05, - "loss": 0.368, + "loss": 0.3666, "step": 662 }, { "epoch": 1.4197002141327624, - "grad_norm": 0.2648103833326966, + "grad_norm": 0.2673997115188798, "learning_rate": 2.9285714285714288e-05, - "loss": 0.3689, + "loss": 0.3671, "step": 663 }, { "epoch": 1.4218415417558887, - "grad_norm": 0.24077539771530324, + "grad_norm": 0.23612940999561857, "learning_rate": 2.9246031746031748e-05, - "loss": 0.351, + "loss": 0.3492, "step": 664 }, { "epoch": 1.423982869379015, - "grad_norm": 0.24569673366814643, + "grad_norm": 0.2601602289574086, "learning_rate": 2.920634920634921e-05, - "loss": 0.3595, + "loss": 0.3581, "step": 665 }, { "epoch": 1.4261241970021414, - "grad_norm": 0.22436619156977872, + "grad_norm": 0.2276622282515761, "learning_rate": 2.916666666666667e-05, - "loss": 0.3636, + "loss": 0.3625, "step": 666 }, { "epoch": 1.4282655246252678, - "grad_norm": 0.22702096393924773, + "grad_norm": 0.2231777877927664, "learning_rate": 2.912698412698413e-05, - "loss": 0.3577, + "loss": 0.357, "step": 667 }, { "epoch": 1.430406852248394, - "grad_norm": 0.21223624922722298, + "grad_norm": 0.21491218629534398, "learning_rate": 2.908730158730159e-05, - "loss": 0.3677, + "loss": 0.3659, "step": 668 }, { "epoch": 1.4325481798715203, - "grad_norm": 0.2474621671117468, + "grad_norm": 0.2286321200334171, "learning_rate": 2.9047619047619052e-05, - "loss": 0.3632, + "loss": 0.3616, "step": 669 }, { "epoch": 1.4346895074946466, - "grad_norm": 0.25684655588636496, + "grad_norm": 0.2431901099756692, "learning_rate": 2.9007936507936513e-05, - "loss": 0.3592, + "loss": 0.3574, "step": 670 }, { "epoch": 1.436830835117773, - "grad_norm": 0.25072866337855965, + "grad_norm": 0.23060779470960058, "learning_rate": 2.8968253968253974e-05, - "loss": 0.3598, + "loss": 0.3584, "step": 671 }, { "epoch": 1.4389721627408993, - "grad_norm": 0.22582476526482667, + "grad_norm": 0.21613457164084013, "learning_rate": 2.8928571428571434e-05, - "loss": 0.3577, + "loss": 0.3566, "step": 672 }, { "epoch": 1.4411134903640257, - "grad_norm": 0.28658549817719403, + "grad_norm": 0.264503775266096, "learning_rate": 2.8888888888888888e-05, - "loss": 0.3839, + "loss": 0.3828, "step": 673 }, { "epoch": 1.443254817987152, - "grad_norm": 0.25430283807134485, + "grad_norm": 0.23265546959620192, "learning_rate": 2.884920634920635e-05, - "loss": 0.3643, + "loss": 0.3633, "step": 674 }, { "epoch": 1.4453961456102784, - "grad_norm": 0.22951249670225976, + "grad_norm": 0.23025174524649092, "learning_rate": 2.880952380952381e-05, - "loss": 0.3692, + "loss": 0.3676, "step": 675 }, { "epoch": 1.4475374732334048, - "grad_norm": 0.23388364802048373, + "grad_norm": 0.21855417995132626, "learning_rate": 2.876984126984127e-05, - "loss": 0.3708, + "loss": 0.3683, "step": 676 }, { "epoch": 1.4496788008565311, - "grad_norm": 0.21976733689801523, + "grad_norm": 0.22496528832364351, "learning_rate": 2.8730158730158728e-05, - "loss": 0.3802, + "loss": 0.3779, "step": 677 }, { "epoch": 1.4518201284796575, - "grad_norm": 0.21027240355399024, + "grad_norm": 0.2244181509364692, "learning_rate": 2.869047619047619e-05, - "loss": 0.3773, + "loss": 0.3766, "step": 678 }, { "epoch": 1.4539614561027836, - "grad_norm": 0.20199987883594428, + "grad_norm": 0.2079076247700585, "learning_rate": 2.865079365079365e-05, - "loss": 0.3637, + "loss": 0.3617, "step": 679 }, { "epoch": 1.45610278372591, - "grad_norm": 0.21923422597696604, + "grad_norm": 0.23278584379048903, "learning_rate": 2.861111111111111e-05, - "loss": 0.3619, + "loss": 0.3613, "step": 680 }, { "epoch": 1.4582441113490363, - "grad_norm": 0.21000907335559735, + "grad_norm": 0.22944194687094452, "learning_rate": 2.857142857142857e-05, - "loss": 0.3536, + "loss": 0.3514, "step": 681 }, { "epoch": 1.4603854389721627, - "grad_norm": 0.21711620369351062, + "grad_norm": 0.24034672179425406, "learning_rate": 2.853174603174603e-05, - "loss": 0.3738, + "loss": 0.3723, "step": 682 }, { "epoch": 1.462526766595289, - "grad_norm": 0.22565854928185133, + "grad_norm": 0.235937135297554, "learning_rate": 2.8492063492063492e-05, - "loss": 0.3608, + "loss": 0.3597, "step": 683 }, { "epoch": 1.4646680942184154, - "grad_norm": 0.24796414653671942, + "grad_norm": 0.26577068348082256, "learning_rate": 2.8452380952380953e-05, - "loss": 0.3859, + "loss": 0.3839, "step": 684 }, { "epoch": 1.4668094218415417, - "grad_norm": 0.19936447407500923, + "grad_norm": 0.22074582010953367, "learning_rate": 2.8412698412698414e-05, - "loss": 0.3772, + "loss": 0.3762, "step": 685 }, { "epoch": 1.468950749464668, - "grad_norm": 0.2659759151283577, + "grad_norm": 0.28955823214995086, "learning_rate": 2.8373015873015875e-05, - "loss": 0.3754, + "loss": 0.3742, "step": 686 }, { "epoch": 1.4710920770877944, - "grad_norm": 0.2142679975770448, + "grad_norm": 0.23728903924448824, "learning_rate": 2.8333333333333335e-05, - "loss": 0.3705, + "loss": 0.3682, "step": 687 }, { "epoch": 1.4732334047109208, - "grad_norm": 0.21126669286906707, + "grad_norm": 0.21949914443063165, "learning_rate": 2.8293650793650793e-05, - "loss": 0.3831, + "loss": 0.3827, "step": 688 }, { "epoch": 1.4753747323340471, - "grad_norm": 0.24217810422670202, + "grad_norm": 0.27980639261823936, "learning_rate": 2.8253968253968253e-05, - "loss": 0.3549, + "loss": 0.3525, "step": 689 }, { "epoch": 1.4775160599571735, - "grad_norm": 0.21568420422411547, + "grad_norm": 0.2314438069257415, "learning_rate": 2.8214285714285714e-05, - "loss": 0.3693, + "loss": 0.3679, "step": 690 }, { "epoch": 1.4796573875802999, - "grad_norm": 0.2312299790821551, + "grad_norm": 0.2722100613441337, "learning_rate": 2.8174603174603175e-05, - "loss": 0.3764, + "loss": 0.3751, "step": 691 }, { "epoch": 1.4817987152034262, - "grad_norm": 0.233754289817078, + "grad_norm": 0.2445277948976506, "learning_rate": 2.8134920634920636e-05, - "loss": 0.3805, + "loss": 0.378, "step": 692 }, { "epoch": 1.4839400428265526, - "grad_norm": 0.2423944727011651, + "grad_norm": 0.2658539356677542, "learning_rate": 2.8095238095238096e-05, - "loss": 0.3636, + "loss": 0.3619, "step": 693 }, { "epoch": 1.486081370449679, - "grad_norm": 0.22905460391929347, + "grad_norm": 0.26351984231630443, "learning_rate": 2.8055555555555557e-05, - "loss": 0.3895, + "loss": 0.3884, "step": 694 }, { "epoch": 1.4882226980728053, - "grad_norm": 0.23214001332154588, + "grad_norm": 0.23831638328525492, "learning_rate": 2.8015873015873018e-05, - "loss": 0.3605, + "loss": 0.359, "step": 695 }, { "epoch": 1.4903640256959314, - "grad_norm": 0.19690116100293087, + "grad_norm": 0.21588034310392898, "learning_rate": 2.797619047619048e-05, - "loss": 0.3582, + "loss": 0.3576, "step": 696 }, { "epoch": 1.4925053533190578, - "grad_norm": 0.24573029483590722, + "grad_norm": 0.2814689991132766, "learning_rate": 2.793650793650794e-05, - "loss": 0.3712, + "loss": 0.3702, "step": 697 }, { "epoch": 1.4946466809421841, - "grad_norm": 0.19231425648369746, + "grad_norm": 0.19677015269212336, "learning_rate": 2.78968253968254e-05, - "loss": 0.3497, + "loss": 0.3481, "step": 698 }, { "epoch": 1.4967880085653105, - "grad_norm": 0.22612270059401912, + "grad_norm": 0.24694240495989817, "learning_rate": 2.785714285714286e-05, - "loss": 0.3417, + "loss": 0.3404, "step": 699 }, { "epoch": 1.4989293361884368, - "grad_norm": 0.22960092397135454, + "grad_norm": 0.24560310196329904, "learning_rate": 2.781746031746032e-05, - "loss": 0.3575, + "loss": 0.3559, "step": 700 }, { "epoch": 1.5010706638115632, - "grad_norm": 0.23684421828804095, + "grad_norm": 0.22770557903012514, "learning_rate": 2.777777777777778e-05, - "loss": 0.3786, + "loss": 0.3774, "step": 701 }, { "epoch": 1.5032119914346895, - "grad_norm": 0.2191044054358468, + "grad_norm": 0.22247306696683353, "learning_rate": 2.773809523809524e-05, - "loss": 0.3549, + "loss": 0.3531, "step": 702 }, { "epoch": 1.5053533190578159, - "grad_norm": 0.24248023653906947, + "grad_norm": 0.23940565806300032, "learning_rate": 2.76984126984127e-05, - "loss": 0.3558, + "loss": 0.3548, "step": 703 }, { "epoch": 1.507494646680942, - "grad_norm": 0.24648400748932908, + "grad_norm": 0.2535843655330012, "learning_rate": 2.765873015873016e-05, "loss": 0.3657, "step": 704 }, { "epoch": 1.5096359743040684, - "grad_norm": 0.23645047152874282, + "grad_norm": 0.23618580242395265, "learning_rate": 2.7619047619047622e-05, - "loss": 0.3704, + "loss": 0.3684, "step": 705 }, { "epoch": 1.5117773019271947, - "grad_norm": 0.25487174415303726, + "grad_norm": 0.2548030311217275, "learning_rate": 2.7579365079365083e-05, - "loss": 0.3582, + "loss": 0.3575, "step": 706 }, { "epoch": 1.513918629550321, - "grad_norm": 0.2331120266364236, + "grad_norm": 0.23669897400390896, "learning_rate": 2.7539682539682544e-05, - "loss": 0.3699, + "loss": 0.3678, "step": 707 }, { "epoch": 1.5160599571734474, - "grad_norm": 0.2181485815951784, + "grad_norm": 0.22764730629262053, "learning_rate": 2.7500000000000004e-05, - "loss": 0.3751, + "loss": 0.3738, "step": 708 }, { "epoch": 1.5182012847965738, - "grad_norm": 0.24072328359741854, + "grad_norm": 0.2522265156348486, "learning_rate": 2.7460317460317465e-05, - "loss": 0.3775, + "loss": 0.3752, "step": 709 }, { "epoch": 1.5203426124197001, - "grad_norm": 0.22869430663467058, + "grad_norm": 0.22848735051444552, "learning_rate": 2.7420634920634926e-05, - "loss": 0.3724, + "loss": 0.3699, "step": 710 }, { "epoch": 1.5224839400428265, - "grad_norm": 0.21918246335255903, + "grad_norm": 0.23087075847319702, "learning_rate": 2.7380952380952383e-05, - "loss": 0.3547, + "loss": 0.3539, "step": 711 }, { "epoch": 1.5246252676659529, - "grad_norm": 0.2189822768505496, + "grad_norm": 0.22350228895743782, "learning_rate": 2.734126984126984e-05, - "loss": 0.3535, + "loss": 0.3513, "step": 712 }, { "epoch": 1.5267665952890792, - "grad_norm": 0.2526005237695246, + "grad_norm": 0.23714590267506044, "learning_rate": 2.73015873015873e-05, - "loss": 0.3713, + "loss": 0.37, "step": 713 }, { "epoch": 1.5289079229122056, - "grad_norm": 0.22601185946675514, + "grad_norm": 0.23601477099160736, "learning_rate": 2.7261904761904762e-05, - "loss": 0.3675, + "loss": 0.3654, "step": 714 }, { "epoch": 1.531049250535332, - "grad_norm": 0.20669359644643262, + "grad_norm": 0.22820632242815328, "learning_rate": 2.7222222222222223e-05, - "loss": 0.3659, + "loss": 0.3643, "step": 715 }, { "epoch": 1.5331905781584583, - "grad_norm": 0.24031352892120264, + "grad_norm": 0.2367592790064326, "learning_rate": 2.718253968253968e-05, - "loss": 0.3602, + "loss": 0.3588, "step": 716 }, { "epoch": 1.5353319057815846, - "grad_norm": 0.22795507587171882, + "grad_norm": 0.2283679400061131, "learning_rate": 2.714285714285714e-05, - "loss": 0.3709, + "loss": 0.3698, "step": 717 }, { "epoch": 1.537473233404711, - "grad_norm": 0.20536455549527732, + "grad_norm": 0.23687279355617852, "learning_rate": 2.7103174603174602e-05, - "loss": 0.3799, + "loss": 0.3796, "step": 718 }, { "epoch": 1.5396145610278373, - "grad_norm": 0.2943404251055241, + "grad_norm": 0.23367650341137466, "learning_rate": 2.7063492063492062e-05, - "loss": 0.3836, + "loss": 0.3817, "step": 719 }, { "epoch": 1.5417558886509637, - "grad_norm": 0.22205668356720148, + "grad_norm": 0.2314615253573604, "learning_rate": 2.7023809523809523e-05, - "loss": 0.3664, + "loss": 0.3656, "step": 720 }, { "epoch": 1.54389721627409, - "grad_norm": 0.21954728948498192, + "grad_norm": 0.24177942634670613, "learning_rate": 2.6984126984126984e-05, - "loss": 0.3647, + "loss": 0.3639, "step": 721 }, { "epoch": 1.5460385438972164, - "grad_norm": 0.2280345060624478, + "grad_norm": 0.21932445481704804, "learning_rate": 2.6944444444444445e-05, - "loss": 0.3713, + "loss": 0.3702, "step": 722 }, { "epoch": 1.5481798715203428, - "grad_norm": 0.22861539448823806, + "grad_norm": 0.24065414879821986, "learning_rate": 2.6904761904761905e-05, - "loss": 0.3888, + "loss": 0.3885, "step": 723 }, { "epoch": 1.550321199143469, - "grad_norm": 0.19425559114407048, + "grad_norm": 0.20826666740635788, "learning_rate": 2.6865079365079366e-05, - "loss": 0.3739, + "loss": 0.372, "step": 724 }, { "epoch": 1.5524625267665952, - "grad_norm": 0.24288323051735908, + "grad_norm": 0.22699790366079173, "learning_rate": 2.6825396825396827e-05, - "loss": 0.355, + "loss": 0.3529, "step": 725 }, { "epoch": 1.5546038543897216, - "grad_norm": 0.22443141683817264, + "grad_norm": 0.23143139316201608, "learning_rate": 2.6785714285714288e-05, - "loss": 0.3888, + "loss": 0.3873, "step": 726 }, { "epoch": 1.556745182012848, - "grad_norm": 0.2182399943573036, + "grad_norm": 0.23566599378296404, "learning_rate": 2.6746031746031745e-05, - "loss": 0.3624, + "loss": 0.3613, "step": 727 }, { "epoch": 1.5588865096359743, - "grad_norm": 0.23168062600285244, + "grad_norm": 0.2431611538451522, "learning_rate": 2.6706349206349206e-05, - "loss": 0.3585, + "loss": 0.3564, "step": 728 }, { "epoch": 1.5610278372591007, - "grad_norm": 0.21591657004920606, + "grad_norm": 0.21557222183068245, "learning_rate": 2.6666666666666667e-05, - "loss": 0.3543, + "loss": 0.3519, "step": 729 }, { "epoch": 1.563169164882227, - "grad_norm": 0.21509414063611806, + "grad_norm": 0.21761967858926073, "learning_rate": 2.6626984126984127e-05, - "loss": 0.369, + "loss": 0.3676, "step": 730 }, { "epoch": 1.5653104925053534, - "grad_norm": 0.21543923122674843, + "grad_norm": 0.2067924331453489, "learning_rate": 2.6587301587301588e-05, - "loss": 0.3582, + "loss": 0.357, "step": 731 }, { "epoch": 1.5674518201284795, - "grad_norm": 0.20613757156954682, + "grad_norm": 0.21006075912880307, "learning_rate": 2.654761904761905e-05, - "loss": 0.3564, + "loss": 0.3545, "step": 732 }, { "epoch": 1.5695931477516059, - "grad_norm": 0.2068022698111163, + "grad_norm": 0.23470822509906164, "learning_rate": 2.650793650793651e-05, - "loss": 0.3714, + "loss": 0.3698, "step": 733 }, { "epoch": 1.5717344753747322, - "grad_norm": 0.19705404333823645, + "grad_norm": 0.19926495187855447, "learning_rate": 2.646825396825397e-05, - "loss": 0.3648, + "loss": 0.3622, "step": 734 }, { "epoch": 1.5738758029978586, - "grad_norm": 0.20492416067065145, + "grad_norm": 0.24016060928887698, "learning_rate": 2.642857142857143e-05, - "loss": 0.36, + "loss": 0.3582, "step": 735 }, { "epoch": 1.576017130620985, - "grad_norm": 0.20508644827871644, + "grad_norm": 0.22487176055383068, "learning_rate": 2.6388888888888892e-05, - "loss": 0.3874, + "loss": 0.386, "step": 736 }, { "epoch": 1.5781584582441113, - "grad_norm": 0.20727086230283992, + "grad_norm": 0.21079271486663667, "learning_rate": 2.6349206349206353e-05, - "loss": 0.3641, + "loss": 0.3623, "step": 737 }, { "epoch": 1.5802997858672376, - "grad_norm": 0.21066080929705655, + "grad_norm": 0.2334507646133209, "learning_rate": 2.6309523809523813e-05, - "loss": 0.3775, + "loss": 0.3762, "step": 738 }, { "epoch": 1.582441113490364, - "grad_norm": 0.1858511474413401, + "grad_norm": 0.19667313653775484, "learning_rate": 2.626984126984127e-05, - "loss": 0.3493, + "loss": 0.3474, "step": 739 }, { "epoch": 1.5845824411134903, - "grad_norm": 0.1948727545134536, + "grad_norm": 0.21267970407005282, "learning_rate": 2.623015873015873e-05, - "loss": 0.3604, + "loss": 0.3582, "step": 740 }, { "epoch": 1.5867237687366167, - "grad_norm": 0.18377772006229867, + "grad_norm": 0.19547706247312066, "learning_rate": 2.6190476190476192e-05, - "loss": 0.3656, + "loss": 0.364, "step": 741 }, { "epoch": 1.588865096359743, - "grad_norm": 0.20759171733907542, + "grad_norm": 0.22715519405850873, "learning_rate": 2.6150793650793653e-05, - "loss": 0.36, + "loss": 0.3589, "step": 742 }, { "epoch": 1.5910064239828694, - "grad_norm": 0.20021552648301658, + "grad_norm": 0.2248474130605718, "learning_rate": 2.6111111111111114e-05, - "loss": 0.3672, + "loss": 0.3658, "step": 743 }, { "epoch": 1.5931477516059958, - "grad_norm": 0.1987003787234111, + "grad_norm": 0.207264130106897, "learning_rate": 2.6071428571428574e-05, - "loss": 0.3697, + "loss": 0.3677, "step": 744 }, { "epoch": 1.595289079229122, - "grad_norm": 0.1992569458099437, + "grad_norm": 0.20684564047898432, "learning_rate": 2.6031746031746035e-05, - "loss": 0.3722, + "loss": 0.3696, "step": 745 }, { "epoch": 1.5974304068522485, - "grad_norm": 0.19490857264680375, + "grad_norm": 0.20367525386940707, "learning_rate": 2.5992063492063496e-05, - "loss": 0.3487, + "loss": 0.3481, "step": 746 }, { "epoch": 1.5995717344753748, - "grad_norm": 0.20992568333120643, + "grad_norm": 0.20679426091923925, "learning_rate": 2.5952380952380957e-05, - "loss": 0.3721, + "loss": 0.3701, "step": 747 }, { "epoch": 1.6017130620985012, - "grad_norm": 0.20903627954075446, + "grad_norm": 0.20915864152813907, "learning_rate": 2.5912698412698417e-05, - "loss": 0.3645, + "loss": 0.3633, "step": 748 }, { "epoch": 1.6038543897216275, - "grad_norm": 0.2206708938047425, + "grad_norm": 0.2234637695145891, "learning_rate": 2.5873015873015878e-05, - "loss": 0.3596, + "loss": 0.3579, "step": 749 }, { "epoch": 1.6059957173447539, - "grad_norm": 0.2315828550558367, + "grad_norm": 0.23358262127655052, "learning_rate": 2.5833333333333336e-05, - "loss": 0.3833, + "loss": 0.3823, "step": 750 }, { "epoch": 1.6081370449678802, - "grad_norm": 0.20729043706506806, + "grad_norm": 0.22541931799902007, "learning_rate": 2.5793650793650796e-05, - "loss": 0.3676, + "loss": 0.3651, "step": 751 }, { "epoch": 1.6102783725910066, - "grad_norm": 0.21304496616203045, + "grad_norm": 0.22458604036006147, "learning_rate": 2.5753968253968254e-05, - "loss": 0.3692, + "loss": 0.3672, "step": 752 }, { "epoch": 1.6124197002141327, - "grad_norm": 0.2066888063019116, + "grad_norm": 0.19850956726240754, "learning_rate": 2.5714285714285714e-05, - "loss": 0.3513, + "loss": 0.3495, "step": 753 }, { "epoch": 1.614561027837259, - "grad_norm": 0.20035669627587474, + "grad_norm": 0.22459012390393387, "learning_rate": 2.5674603174603172e-05, - "loss": 0.3625, + "loss": 0.3615, "step": 754 }, { "epoch": 1.6167023554603854, - "grad_norm": 0.21327760276750352, + "grad_norm": 0.22306811697664986, "learning_rate": 2.5634920634920633e-05, - "loss": 0.3642, + "loss": 0.3629, "step": 755 }, { "epoch": 1.6188436830835118, - "grad_norm": 0.2144247994926133, + "grad_norm": 0.22471296174548994, "learning_rate": 2.5595238095238093e-05, - "loss": 0.378, + "loss": 0.3764, "step": 756 }, { "epoch": 1.6209850107066381, - "grad_norm": 0.21826292313986448, + "grad_norm": 0.21158605873651065, "learning_rate": 2.5555555555555554e-05, - "loss": 0.3682, + "loss": 0.3653, "step": 757 }, { "epoch": 1.6231263383297645, - "grad_norm": 0.2112350938263051, + "grad_norm": 0.21671080824519806, "learning_rate": 2.5515873015873015e-05, - "loss": 0.3737, + "loss": 0.371, "step": 758 }, { "epoch": 1.6252676659528906, - "grad_norm": 0.22455328732151775, + "grad_norm": 0.2356438023908563, "learning_rate": 2.5476190476190476e-05, - "loss": 0.3654, + "loss": 0.364, "step": 759 }, { "epoch": 1.627408993576017, - "grad_norm": 0.18501213734273322, + "grad_norm": 0.19447430345844002, "learning_rate": 2.5436507936507936e-05, - "loss": 0.3383, + "loss": 0.3368, "step": 760 }, { "epoch": 1.6295503211991433, - "grad_norm": 0.20942821521960064, + "grad_norm": 0.22994645655583826, "learning_rate": 2.5396825396825397e-05, - "loss": 0.3583, + "loss": 0.3567, "step": 761 }, { "epoch": 1.6316916488222697, - "grad_norm": 0.22203677151968088, + "grad_norm": 0.23001951683149954, "learning_rate": 2.5357142857142858e-05, - "loss": 0.37, + "loss": 0.3684, "step": 762 }, { "epoch": 1.633832976445396, - "grad_norm": 0.22074729844028526, + "grad_norm": 0.2119578315059528, "learning_rate": 2.531746031746032e-05, - "loss": 0.3555, + "loss": 0.3536, "step": 763 }, { "epoch": 1.6359743040685224, - "grad_norm": 0.24745475269473763, + "grad_norm": 0.2615589609533618, "learning_rate": 2.527777777777778e-05, - "loss": 0.3913, + "loss": 0.389, "step": 764 }, { "epoch": 1.6381156316916488, - "grad_norm": 0.21980493117452848, + "grad_norm": 0.22688862347727404, "learning_rate": 2.523809523809524e-05, - "loss": 0.3684, + "loss": 0.3659, "step": 765 }, { "epoch": 1.640256959314775, - "grad_norm": 0.21850689842540688, + "grad_norm": 0.22643676902104382, "learning_rate": 2.5198412698412697e-05, - "loss": 0.3727, + "loss": 0.3698, "step": 766 }, { "epoch": 1.6423982869379015, - "grad_norm": 0.21446663216573078, + "grad_norm": 0.21169667439175707, "learning_rate": 2.5158730158730158e-05, - "loss": 0.3782, + "loss": 0.3762, "step": 767 }, { "epoch": 1.6445396145610278, - "grad_norm": 0.21778149384624332, + "grad_norm": 0.22932111037643213, "learning_rate": 2.511904761904762e-05, - "loss": 0.3683, + "loss": 0.367, "step": 768 }, { "epoch": 1.6466809421841542, - "grad_norm": 0.20468677302613195, + "grad_norm": 0.21149524956469937, "learning_rate": 2.507936507936508e-05, - "loss": 0.3771, + "loss": 0.3764, "step": 769 }, { "epoch": 1.6488222698072805, - "grad_norm": 0.23332787991529938, + "grad_norm": 0.22593475888288625, "learning_rate": 2.503968253968254e-05, - "loss": 0.367, + "loss": 0.3651, "step": 770 }, { "epoch": 1.6509635974304069, - "grad_norm": 0.21897816586893398, + "grad_norm": 0.22008417552751885, "learning_rate": 2.5e-05, - "loss": 0.3783, + "loss": 0.377, "step": 771 }, { "epoch": 1.6531049250535332, - "grad_norm": 0.2030029868201131, + "grad_norm": 0.20441837816311934, "learning_rate": 2.4960317460317462e-05, - "loss": 0.3745, + "loss": 0.3716, "step": 772 }, { "epoch": 1.6552462526766596, - "grad_norm": 0.21310414937938124, + "grad_norm": 0.21552942260647906, "learning_rate": 2.4920634920634923e-05, - "loss": 0.3405, + "loss": 0.3385, "step": 773 }, { "epoch": 1.657387580299786, - "grad_norm": 0.22195298457726914, + "grad_norm": 0.2353507438436094, "learning_rate": 2.4880952380952383e-05, - "loss": 0.3874, + "loss": 0.3864, "step": 774 }, { "epoch": 1.6595289079229123, - "grad_norm": 0.20300473974521194, + "grad_norm": 0.21413713760888206, "learning_rate": 2.4841269841269844e-05, - "loss": 0.3663, + "loss": 0.3647, "step": 775 }, { "epoch": 1.6616702355460387, - "grad_norm": 0.22067636878573646, + "grad_norm": 0.2160670298547777, "learning_rate": 2.4801587301587305e-05, - "loss": 0.3887, + "loss": 0.3868, "step": 776 }, { "epoch": 1.663811563169165, - "grad_norm": 0.21073789338832113, + "grad_norm": 0.21929322311304728, "learning_rate": 2.4761904761904762e-05, - "loss": 0.3743, + "loss": 0.3739, "step": 777 }, { "epoch": 1.6659528907922914, - "grad_norm": 0.22786185959131758, + "grad_norm": 0.22788233645299552, "learning_rate": 2.4722222222222223e-05, - "loss": 0.3743, + "loss": 0.373, "step": 778 }, { "epoch": 1.6680942184154177, - "grad_norm": 0.20200585214379246, + "grad_norm": 0.210338880481676, "learning_rate": 2.4682539682539684e-05, - "loss": 0.3725, + "loss": 0.3713, "step": 779 }, { "epoch": 1.6702355460385439, - "grad_norm": 0.21952221945022432, + "grad_norm": 0.22350067161462142, "learning_rate": 2.4642857142857145e-05, - "loss": 0.3678, + "loss": 0.3664, "step": 780 }, { "epoch": 1.6723768736616702, - "grad_norm": 0.49934368068454904, + "grad_norm": 0.22340962121009117, "learning_rate": 2.4603174603174602e-05, - "loss": 0.3964, + "loss": 0.3881, "step": 781 }, { "epoch": 1.6745182012847966, - "grad_norm": 0.19757194754481436, + "grad_norm": 0.1990507179838242, "learning_rate": 2.4563492063492063e-05, - "loss": 0.3763, + "loss": 0.3746, "step": 782 }, { "epoch": 1.676659528907923, - "grad_norm": 0.19767924902990838, + "grad_norm": 0.22318485815266106, "learning_rate": 2.4523809523809523e-05, - "loss": 0.3543, + "loss": 0.353, "step": 783 }, { "epoch": 1.6788008565310493, - "grad_norm": 0.20919876927515002, + "grad_norm": 0.22041375294909685, "learning_rate": 2.4484126984126984e-05, - "loss": 0.3735, + "loss": 0.3724, "step": 784 }, { "epoch": 1.6809421841541756, - "grad_norm": 0.22040578372419578, + "grad_norm": 0.21498770171477544, "learning_rate": 2.4444444444444445e-05, - "loss": 0.3829, + "loss": 0.3819, "step": 785 }, { "epoch": 1.683083511777302, - "grad_norm": 0.22430773595492015, + "grad_norm": 0.23245272113804277, "learning_rate": 2.4404761904761906e-05, - "loss": 0.3656, + "loss": 0.3632, "step": 786 }, { "epoch": 1.685224839400428, - "grad_norm": 0.20259143808355987, + "grad_norm": 0.2249931267054549, "learning_rate": 2.4365079365079366e-05, - "loss": 0.3647, + "loss": 0.3639, "step": 787 }, { "epoch": 1.6873661670235545, - "grad_norm": 0.21894252518737073, + "grad_norm": 0.21799548712885633, "learning_rate": 2.4325396825396827e-05, - "loss": 0.3601, + "loss": 0.3581, "step": 788 }, { "epoch": 1.6895074946466808, - "grad_norm": 0.21462981545836501, + "grad_norm": 0.22018263174045988, "learning_rate": 2.4285714285714288e-05, - "loss": 0.3629, + "loss": 0.3612, "step": 789 }, { "epoch": 1.6916488222698072, - "grad_norm": 0.18869575445100586, + "grad_norm": 0.20033740087548269, "learning_rate": 2.424603174603175e-05, - "loss": 0.3323, + "loss": 0.3313, "step": 790 }, { "epoch": 1.6937901498929335, - "grad_norm": 0.20241689753494335, + "grad_norm": 0.19788624340907016, "learning_rate": 2.4206349206349206e-05, - "loss": 0.3464, + "loss": 0.345, "step": 791 }, { "epoch": 1.6959314775160599, - "grad_norm": 0.19953981094595588, + "grad_norm": 0.21154589560113807, "learning_rate": 2.4166666666666667e-05, - "loss": 0.3458, + "loss": 0.3448, "step": 792 }, { "epoch": 1.6980728051391862, - "grad_norm": 0.19961349385460256, + "grad_norm": 0.21812600023361592, "learning_rate": 2.4126984126984128e-05, - "loss": 0.3527, + "loss": 0.3506, "step": 793 }, { "epoch": 1.7002141327623126, - "grad_norm": 0.21973818696015726, + "grad_norm": 0.21274687742218662, "learning_rate": 2.408730158730159e-05, - "loss": 0.3611, + "loss": 0.3602, "step": 794 }, { "epoch": 1.702355460385439, - "grad_norm": 0.1887578663893525, + "grad_norm": 0.20945076032569784, "learning_rate": 2.404761904761905e-05, - "loss": 0.3624, + "loss": 0.3609, "step": 795 }, { "epoch": 1.7044967880085653, - "grad_norm": 0.19628057987549807, + "grad_norm": 0.22319883020181144, "learning_rate": 2.400793650793651e-05, - "loss": 0.3717, + "loss": 0.3707, "step": 796 }, { "epoch": 1.7066381156316917, - "grad_norm": 0.20905187089247354, + "grad_norm": 0.23376387552204406, "learning_rate": 2.396825396825397e-05, - "loss": 0.3797, + "loss": 0.3777, "step": 797 }, { "epoch": 1.708779443254818, - "grad_norm": 0.210161778786647, + "grad_norm": 0.21375479088785895, "learning_rate": 2.392857142857143e-05, - "loss": 0.3648, + "loss": 0.3637, "step": 798 }, { "epoch": 1.7109207708779444, - "grad_norm": 0.18782590789097356, + "grad_norm": 0.20013544930852412, "learning_rate": 2.3888888888888892e-05, - "loss": 0.3672, + "loss": 0.3662, "step": 799 }, { "epoch": 1.7130620985010707, - "grad_norm": 0.21536059285769518, + "grad_norm": 0.23771022394213084, "learning_rate": 2.3849206349206353e-05, - "loss": 0.3556, + "loss": 0.3541, "step": 800 }, { "epoch": 1.715203426124197, - "grad_norm": 0.19872849614371818, + "grad_norm": 0.2125013216241467, "learning_rate": 2.380952380952381e-05, - "loss": 0.3606, + "loss": 0.3586, "step": 801 }, { "epoch": 1.7173447537473234, - "grad_norm": 0.2190527880750777, + "grad_norm": 0.24611181377977226, "learning_rate": 2.376984126984127e-05, - "loss": 0.3778, + "loss": 0.3765, "step": 802 }, { "epoch": 1.7194860813704498, - "grad_norm": 0.21010529253337107, + "grad_norm": 0.2327060873577735, "learning_rate": 2.373015873015873e-05, - "loss": 0.355, + "loss": 0.3533, "step": 803 }, { "epoch": 1.7216274089935761, - "grad_norm": 0.20516661849890908, + "grad_norm": 0.22138909685445068, "learning_rate": 2.369047619047619e-05, - "loss": 0.3595, + "loss": 0.3583, "step": 804 }, { "epoch": 1.7237687366167025, - "grad_norm": 0.21809159516111487, + "grad_norm": 0.208629956972268, "learning_rate": 2.365079365079365e-05, - "loss": 0.3774, + "loss": 0.3755, "step": 805 }, { "epoch": 1.7259100642398288, - "grad_norm": 0.21787730326337182, + "grad_norm": 0.23858026530704865, "learning_rate": 2.361111111111111e-05, - "loss": 0.3716, + "loss": 0.3694, "step": 806 }, { "epoch": 1.728051391862955, - "grad_norm": 0.20722387140693752, + "grad_norm": 0.21718501714953736, "learning_rate": 2.357142857142857e-05, - "loss": 0.3538, + "loss": 0.353, "step": 807 }, { "epoch": 1.7301927194860813, - "grad_norm": 0.19038300413501802, + "grad_norm": 0.1938804380949089, "learning_rate": 2.3531746031746032e-05, - "loss": 0.3487, + "loss": 0.3474, "step": 808 }, { "epoch": 1.7323340471092077, - "grad_norm": 0.21385692991090183, + "grad_norm": 0.21740566223114047, "learning_rate": 2.3492063492063493e-05, - "loss": 0.3765, + "loss": 0.3754, "step": 809 }, { "epoch": 1.734475374732334, - "grad_norm": 0.20161843453069309, + "grad_norm": 0.2142359822570361, "learning_rate": 2.3452380952380954e-05, - "loss": 0.3593, + "loss": 0.358, "step": 810 }, { "epoch": 1.7366167023554604, - "grad_norm": 0.1991611902428487, + "grad_norm": 0.2070519093899908, "learning_rate": 2.3412698412698414e-05, - "loss": 0.3667, + "loss": 0.3648, "step": 811 }, { "epoch": 1.7387580299785867, - "grad_norm": 0.19751266841983958, + "grad_norm": 0.1994890470093942, "learning_rate": 2.3373015873015875e-05, - "loss": 0.3725, + "loss": 0.3721, "step": 812 }, { "epoch": 1.740899357601713, - "grad_norm": 0.18754293892801968, + "grad_norm": 0.19930905538311752, "learning_rate": 2.3333333333333336e-05, - "loss": 0.3649, + "loss": 0.3634, "step": 813 }, { "epoch": 1.7430406852248392, - "grad_norm": 0.22386270344186163, + "grad_norm": 0.22943112463202014, "learning_rate": 2.3293650793650797e-05, - "loss": 0.3685, + "loss": 0.3674, "step": 814 }, { "epoch": 1.7451820128479656, - "grad_norm": 0.1966840040343867, + "grad_norm": 0.19595373289639564, "learning_rate": 2.3253968253968257e-05, - "loss": 0.3842, + "loss": 0.3826, "step": 815 }, { "epoch": 1.747323340471092, - "grad_norm": 0.21657779353698456, + "grad_norm": 0.23295233026294598, "learning_rate": 2.3214285714285715e-05, - "loss": 0.3667, + "loss": 0.3656, "step": 816 }, { "epoch": 1.7494646680942183, - "grad_norm": 0.2288614122016469, + "grad_norm": 0.21540081438261183, "learning_rate": 2.3174603174603175e-05, - "loss": 0.3843, + "loss": 0.3829, "step": 817 }, { "epoch": 1.7516059957173447, - "grad_norm": 0.1836444137030942, + "grad_norm": 0.1864879412189447, "learning_rate": 2.3134920634920636e-05, - "loss": 0.3583, + "loss": 0.3554, "step": 818 }, { "epoch": 1.753747323340471, - "grad_norm": 0.23767773436120954, + "grad_norm": 0.2387089230839924, "learning_rate": 2.3095238095238097e-05, - "loss": 0.3863, + "loss": 0.385, "step": 819 }, { "epoch": 1.7558886509635974, - "grad_norm": 0.2041079485593283, + "grad_norm": 0.2232438395168808, "learning_rate": 2.3055555555555558e-05, - "loss": 0.3522, + "loss": 0.3504, "step": 820 }, { "epoch": 1.7580299785867237, - "grad_norm": 0.20093188325290554, + "grad_norm": 0.20415844428245095, "learning_rate": 2.3015873015873015e-05, - "loss": 0.3795, + "loss": 0.3789, "step": 821 }, { "epoch": 1.76017130620985, - "grad_norm": 0.19995939365331822, + "grad_norm": 0.19921172666296888, "learning_rate": 2.2976190476190476e-05, - "loss": 0.3721, + "loss": 0.3701, "step": 822 }, { "epoch": 1.7623126338329764, - "grad_norm": 0.2166910316666174, + "grad_norm": 0.22470908559459463, "learning_rate": 2.2936507936507937e-05, - "loss": 0.3634, + "loss": 0.3619, "step": 823 }, { "epoch": 1.7644539614561028, - "grad_norm": 0.18670680995845637, + "grad_norm": 0.1959331611395145, "learning_rate": 2.2896825396825397e-05, - "loss": 0.344, + "loss": 0.343, "step": 824 }, { "epoch": 1.7665952890792291, - "grad_norm": 0.21173006300417144, + "grad_norm": 0.21307394355566658, "learning_rate": 2.2857142857142858e-05, - "loss": 0.356, + "loss": 0.3546, "step": 825 }, { "epoch": 1.7687366167023555, - "grad_norm": 0.21040949951223306, + "grad_norm": 0.20578224521899074, "learning_rate": 2.281746031746032e-05, - "loss": 0.3816, + "loss": 0.3795, "step": 826 }, { "epoch": 1.7708779443254818, - "grad_norm": 0.20778505679750192, + "grad_norm": 0.2189247535460949, "learning_rate": 2.277777777777778e-05, - "loss": 0.3618, + "loss": 0.3607, "step": 827 }, { "epoch": 1.7730192719486082, - "grad_norm": 0.213799456066487, + "grad_norm": 0.21596310355670684, "learning_rate": 2.273809523809524e-05, - "loss": 0.3589, + "loss": 0.3584, "step": 828 }, { "epoch": 1.7751605995717346, - "grad_norm": 0.2043033051385102, + "grad_norm": 0.2008393875425573, "learning_rate": 2.2698412698412698e-05, - "loss": 0.366, + "loss": 0.3647, "step": 829 }, { "epoch": 1.777301927194861, - "grad_norm": 0.19754414108874052, + "grad_norm": 0.21184243202841968, "learning_rate": 2.265873015873016e-05, - "loss": 0.347, + "loss": 0.3455, "step": 830 }, { "epoch": 1.7794432548179873, - "grad_norm": 0.2158296684757847, + "grad_norm": 0.22865192619400915, "learning_rate": 2.261904761904762e-05, - "loss": 0.3915, + "loss": 0.3901, "step": 831 }, { "epoch": 1.7815845824411136, - "grad_norm": 0.22243362581646817, + "grad_norm": 0.22618545858724132, "learning_rate": 2.257936507936508e-05, - "loss": 0.3854, + "loss": 0.3832, "step": 832 }, { "epoch": 1.78372591006424, - "grad_norm": 0.21571356659346066, + "grad_norm": 0.22738066487492137, "learning_rate": 2.253968253968254e-05, - "loss": 0.3409, + "loss": 0.3399, "step": 833 }, { "epoch": 1.7858672376873663, - "grad_norm": 0.22447191028202343, + "grad_norm": 0.23621094995588868, "learning_rate": 2.25e-05, - "loss": 0.3749, + "loss": 0.3737, "step": 834 }, { "epoch": 1.7880085653104925, - "grad_norm": 0.215504263290714, + "grad_norm": 0.2157005750309211, "learning_rate": 2.2460317460317462e-05, - "loss": 0.3586, + "loss": 0.3556, "step": 835 }, { "epoch": 1.7901498929336188, - "grad_norm": 0.20688541961124565, + "grad_norm": 0.20967061124515987, "learning_rate": 2.2420634920634923e-05, - "loss": 0.3723, + "loss": 0.3711, "step": 836 }, { "epoch": 1.7922912205567452, - "grad_norm": 0.20199140517623562, + "grad_norm": 0.21011640159026213, "learning_rate": 2.2380952380952384e-05, - "loss": 0.371, + "loss": 0.3696, "step": 837 }, { "epoch": 1.7944325481798715, - "grad_norm": 0.22478296242958862, + "grad_norm": 0.2301543332161994, "learning_rate": 2.2341269841269844e-05, - "loss": 0.3634, + "loss": 0.3619, "step": 838 }, { "epoch": 1.7965738758029979, - "grad_norm": 0.19888513887055395, + "grad_norm": 0.19783929504773992, "learning_rate": 2.2301587301587305e-05, - "loss": 0.3733, + "loss": 0.3711, "step": 839 }, { "epoch": 1.7987152034261242, - "grad_norm": 0.19421047575082181, + "grad_norm": 0.2027645971091363, "learning_rate": 2.2261904761904763e-05, - "loss": 0.3597, + "loss": 0.3575, "step": 840 }, { "epoch": 1.8008565310492506, - "grad_norm": 0.20475219316843102, + "grad_norm": 0.21010286682467308, "learning_rate": 2.2222222222222223e-05, - "loss": 0.3708, + "loss": 0.3684, "step": 841 }, { "epoch": 1.8029978586723767, - "grad_norm": 0.2114777102870167, + "grad_norm": 0.1962020508553967, "learning_rate": 2.2182539682539684e-05, - "loss": 0.3677, + "loss": 0.3651, "step": 842 }, { "epoch": 1.805139186295503, - "grad_norm": 0.19598571633723222, + "grad_norm": 0.20781819889894468, "learning_rate": 2.214285714285714e-05, - "loss": 0.3661, + "loss": 0.365, "step": 843 }, { "epoch": 1.8072805139186294, - "grad_norm": 0.20252036775324378, + "grad_norm": 0.19696845772215546, "learning_rate": 2.2103174603174602e-05, - "loss": 0.3645, + "loss": 0.3629, "step": 844 }, { "epoch": 1.8094218415417558, - "grad_norm": 0.20511403057320743, + "grad_norm": 0.21583117590285078, "learning_rate": 2.2063492063492063e-05, - "loss": 0.3633, + "loss": 0.3617, "step": 845 }, { "epoch": 1.8115631691648821, - "grad_norm": 0.20558945002253082, + "grad_norm": 0.2173603569007904, "learning_rate": 2.2023809523809524e-05, - "loss": 0.3752, + "loss": 0.3734, "step": 846 }, { "epoch": 1.8137044967880085, - "grad_norm": 0.21428205634679462, + "grad_norm": 0.2022970217705777, "learning_rate": 2.1984126984126984e-05, - "loss": 0.3683, + "loss": 0.3659, "step": 847 }, { "epoch": 1.8158458244111348, - "grad_norm": 0.23331079602832885, + "grad_norm": 0.24139485756064838, "learning_rate": 2.1944444444444445e-05, - "loss": 0.3671, + "loss": 0.3644, "step": 848 }, { "epoch": 1.8179871520342612, - "grad_norm": 0.21486622134643, + "grad_norm": 0.216967803177166, "learning_rate": 2.1904761904761906e-05, - "loss": 0.3652, + "loss": 0.3625, "step": 849 }, { "epoch": 1.8201284796573876, - "grad_norm": 0.19969930097292724, + "grad_norm": 0.21018593632480406, "learning_rate": 2.1865079365079367e-05, - "loss": 0.3689, + "loss": 0.3674, "step": 850 }, { "epoch": 1.822269807280514, - "grad_norm": 0.21534324934924104, + "grad_norm": 0.21491326859755341, "learning_rate": 2.1825396825396827e-05, - "loss": 0.3774, + "loss": 0.376, "step": 851 }, { "epoch": 1.8244111349036403, - "grad_norm": 0.21397843196592883, + "grad_norm": 0.2016970958055073, "learning_rate": 2.1785714285714288e-05, - "loss": 0.3694, + "loss": 0.3682, "step": 852 }, { "epoch": 1.8265524625267666, - "grad_norm": 0.23443278557392533, + "grad_norm": 0.23649936879215913, "learning_rate": 2.174603174603175e-05, - "loss": 0.3699, + "loss": 0.3671, "step": 853 }, { "epoch": 1.828693790149893, - "grad_norm": 0.19730634328807958, + "grad_norm": 0.20337632262549168, "learning_rate": 2.170634920634921e-05, - "loss": 0.3478, + "loss": 0.3461, "step": 854 }, { "epoch": 1.8308351177730193, - "grad_norm": 0.21913953376852294, + "grad_norm": 0.21300952050085178, "learning_rate": 2.1666666666666667e-05, - "loss": 0.3567, + "loss": 0.3542, "step": 855 }, { "epoch": 1.8329764453961457, - "grad_norm": 0.20501578279293847, + "grad_norm": 0.21370860755844873, "learning_rate": 2.1626984126984128e-05, - "loss": 0.3497, + "loss": 0.3485, "step": 856 }, { "epoch": 1.835117773019272, - "grad_norm": 0.20935735217003343, + "grad_norm": 0.2238345205767126, "learning_rate": 2.158730158730159e-05, - "loss": 0.3551, + "loss": 0.3538, "step": 857 }, { "epoch": 1.8372591006423984, - "grad_norm": 0.19034194770904342, + "grad_norm": 0.20314755234997184, "learning_rate": 2.154761904761905e-05, - "loss": 0.3628, + "loss": 0.3618, "step": 858 }, { "epoch": 1.8394004282655247, - "grad_norm": 0.18870437952631242, + "grad_norm": 0.19823141224145824, "learning_rate": 2.150793650793651e-05, - "loss": 0.3513, + "loss": 0.3496, "step": 859 }, { "epoch": 1.841541755888651, - "grad_norm": 0.1919743307796265, + "grad_norm": 0.19176641722853272, "learning_rate": 2.1468253968253967e-05, - "loss": 0.3732, + "loss": 0.3712, "step": 860 }, { "epoch": 1.8436830835117775, - "grad_norm": 0.20417690639096803, + "grad_norm": 0.20478508554466868, "learning_rate": 2.1428571428571428e-05, - "loss": 0.3554, + "loss": 0.3548, "step": 861 }, { "epoch": 1.8458244111349036, - "grad_norm": 0.21048046032387568, + "grad_norm": 0.19864762266874297, "learning_rate": 2.138888888888889e-05, - "loss": 0.3632, + "loss": 0.3616, "step": 862 }, { "epoch": 1.84796573875803, - "grad_norm": 0.19340209108489914, + "grad_norm": 0.20776884101667364, "learning_rate": 2.134920634920635e-05, - "loss": 0.3691, + "loss": 0.3674, "step": 863 }, { "epoch": 1.8501070663811563, - "grad_norm": 0.2147750143951101, + "grad_norm": 0.20958140144946166, "learning_rate": 2.130952380952381e-05, - "loss": 0.3691, + "loss": 0.3681, "step": 864 }, { "epoch": 1.8522483940042827, - "grad_norm": 0.19393590878356723, + "grad_norm": 0.1866017449921812, "learning_rate": 2.126984126984127e-05, - "loss": 0.3531, + "loss": 0.3509, "step": 865 }, { "epoch": 1.854389721627409, - "grad_norm": 0.18438206629844184, + "grad_norm": 0.1893458266122647, "learning_rate": 2.1230158730158732e-05, - "loss": 0.3529, + "loss": 0.3508, "step": 866 }, { "epoch": 1.8565310492505354, - "grad_norm": 0.23087737411403764, + "grad_norm": 0.22008473964117356, "learning_rate": 2.1190476190476193e-05, - "loss": 0.369, + "loss": 0.3678, "step": 867 }, { "epoch": 1.8586723768736617, - "grad_norm": 0.20076795586544216, + "grad_norm": 0.1936262836818084, "learning_rate": 2.115079365079365e-05, - "loss": 0.3523, + "loss": 0.3507, "step": 868 }, { "epoch": 1.8608137044967878, - "grad_norm": 0.20026389882619192, + "grad_norm": 0.19939279008943295, "learning_rate": 2.111111111111111e-05, - "loss": 0.3623, + "loss": 0.3606, "step": 869 }, { "epoch": 1.8629550321199142, - "grad_norm": 0.2187940555605095, + "grad_norm": 0.20666085743177595, "learning_rate": 2.107142857142857e-05, - "loss": 0.3579, + "loss": 0.3564, "step": 870 }, { "epoch": 1.8650963597430406, - "grad_norm": 0.22989363140509406, + "grad_norm": 0.22692963345797962, "learning_rate": 2.1031746031746032e-05, - "loss": 0.3537, + "loss": 0.3512, "step": 871 }, { "epoch": 1.867237687366167, - "grad_norm": 0.187165767075903, + "grad_norm": 0.21239492391629777, "learning_rate": 2.0992063492063493e-05, - "loss": 0.3734, + "loss": 0.3728, "step": 872 }, { "epoch": 1.8693790149892933, - "grad_norm": 0.2107266950762665, + "grad_norm": 0.1984911807328775, "learning_rate": 2.0952380952380954e-05, - "loss": 0.3672, + "loss": 0.3651, "step": 873 }, { "epoch": 1.8715203426124196, - "grad_norm": 0.20955760948663935, + "grad_norm": 0.2035710275475753, "learning_rate": 2.0912698412698415e-05, - "loss": 0.3584, + "loss": 0.3566, "step": 874 }, { "epoch": 1.873661670235546, - "grad_norm": 0.20510030528233222, + "grad_norm": 0.21183590394095844, "learning_rate": 2.0873015873015875e-05, - "loss": 0.3564, + "loss": 0.3544, "step": 875 }, { "epoch": 1.8758029978586723, - "grad_norm": 0.22155578745771093, + "grad_norm": 0.20710933660481542, "learning_rate": 2.0833333333333336e-05, - "loss": 0.3543, + "loss": 0.3526, "step": 876 }, { "epoch": 1.8779443254817987, - "grad_norm": 0.21075973711020585, + "grad_norm": 0.20824572300787447, "learning_rate": 2.0793650793650797e-05, - "loss": 0.344, + "loss": 0.3429, "step": 877 }, { "epoch": 1.880085653104925, - "grad_norm": 0.21005937923832468, + "grad_norm": 0.21277139237572662, "learning_rate": 2.0753968253968258e-05, - "loss": 0.3672, + "loss": 0.3647, "step": 878 }, { "epoch": 1.8822269807280514, - "grad_norm": 0.20105451656897086, + "grad_norm": 0.20675962730393843, "learning_rate": 2.0714285714285718e-05, - "loss": 0.3672, + "loss": 0.3656, "step": 879 }, { "epoch": 1.8843683083511777, - "grad_norm": 0.245253064751467, + "grad_norm": 0.2578061912750937, "learning_rate": 2.0674603174603176e-05, - "loss": 0.3551, + "loss": 0.3533, "step": 880 }, { "epoch": 1.886509635974304, - "grad_norm": 0.21136163316257608, + "grad_norm": 0.20710356677023856, "learning_rate": 2.0634920634920636e-05, - "loss": 0.363, + "loss": 0.3609, "step": 881 }, { "epoch": 1.8886509635974305, - "grad_norm": 0.20602573266640592, + "grad_norm": 0.20736571003231136, "learning_rate": 2.0595238095238094e-05, - "loss": 0.3537, + "loss": 0.3518, "step": 882 }, { "epoch": 1.8907922912205568, - "grad_norm": 0.20197617146682256, + "grad_norm": 0.219731020163135, "learning_rate": 2.0555555555555555e-05, - "loss": 0.3525, + "loss": 0.3507, "step": 883 }, { "epoch": 1.8929336188436832, - "grad_norm": 0.20854372113696887, + "grad_norm": 0.21068978476088052, "learning_rate": 2.0515873015873015e-05, - "loss": 0.3514, + "loss": 0.3499, "step": 884 }, { "epoch": 1.8950749464668095, - "grad_norm": 0.4220973103981105, + "grad_norm": 0.2585013587777131, "learning_rate": 2.0476190476190476e-05, - "loss": 0.3666, + "loss": 0.3627, "step": 885 }, { "epoch": 1.8972162740899359, - "grad_norm": 0.19662405811862105, + "grad_norm": 0.19255613092161056, "learning_rate": 2.0436507936507937e-05, - "loss": 0.3557, + "loss": 0.3547, "step": 886 }, { "epoch": 1.8993576017130622, - "grad_norm": 0.21424028404578552, + "grad_norm": 0.21046046883766856, "learning_rate": 2.0396825396825398e-05, - "loss": 0.3705, + "loss": 0.3688, "step": 887 }, { "epoch": 1.9014989293361886, - "grad_norm": 0.20444704514356857, + "grad_norm": 0.2031259267946657, "learning_rate": 2.0357142857142858e-05, - "loss": 0.3671, + "loss": 0.366, "step": 888 }, { "epoch": 1.903640256959315, - "grad_norm": 0.209872146829905, + "grad_norm": 0.22385718639419275, "learning_rate": 2.031746031746032e-05, - "loss": 0.3675, + "loss": 0.366, "step": 889 }, { "epoch": 1.905781584582441, - "grad_norm": 0.2128263431128925, + "grad_norm": 0.20168399193948486, "learning_rate": 2.027777777777778e-05, - "loss": 0.3578, + "loss": 0.3562, "step": 890 }, { "epoch": 1.9079229122055674, - "grad_norm": 0.21350097793924863, + "grad_norm": 0.19396525152354396, "learning_rate": 2.023809523809524e-05, - "loss": 0.3579, + "loss": 0.356, "step": 891 }, { "epoch": 1.9100642398286938, - "grad_norm": 0.19676311955725848, + "grad_norm": 0.20798336659145256, "learning_rate": 2.01984126984127e-05, - "loss": 0.3453, + "loss": 0.3438, "step": 892 }, { "epoch": 1.9122055674518201, - "grad_norm": 0.18886161439157978, + "grad_norm": 0.2135461327301388, "learning_rate": 2.015873015873016e-05, - "loss": 0.3576, + "loss": 0.3556, "step": 893 }, { "epoch": 1.9143468950749465, - "grad_norm": 0.19774803879409486, + "grad_norm": 0.20025569788736822, "learning_rate": 2.011904761904762e-05, - "loss": 0.3534, + "loss": 0.3522, "step": 894 }, { "epoch": 1.9164882226980728, - "grad_norm": 0.23099750551614034, + "grad_norm": 0.21073141685921978, "learning_rate": 2.007936507936508e-05, - "loss": 0.3699, + "loss": 0.3669, "step": 895 }, { "epoch": 1.9186295503211992, - "grad_norm": 0.2045355403156652, + "grad_norm": 0.20550729295532494, "learning_rate": 2.003968253968254e-05, - "loss": 0.359, + "loss": 0.3563, "step": 896 }, { "epoch": 1.9207708779443253, - "grad_norm": 0.1971397275342457, + "grad_norm": 0.2219289886669546, "learning_rate": 2e-05, - "loss": 0.3635, + "loss": 0.3632, "step": 897 }, { "epoch": 1.9229122055674517, - "grad_norm": 0.21049030917982176, + "grad_norm": 0.20426747560999273, "learning_rate": 1.9960317460317462e-05, - "loss": 0.3745, + "loss": 0.3731, "step": 898 }, { "epoch": 1.925053533190578, - "grad_norm": 0.222721408427582, + "grad_norm": 0.21309608447398956, "learning_rate": 1.992063492063492e-05, - "loss": 0.3848, + "loss": 0.3822, "step": 899 }, { "epoch": 1.9271948608137044, - "grad_norm": 0.21296826592149146, + "grad_norm": 0.2047630612588562, "learning_rate": 1.988095238095238e-05, - "loss": 0.3845, + "loss": 0.3826, "step": 900 }, { "epoch": 1.9293361884368307, - "grad_norm": 0.19916011701018338, + "grad_norm": 0.20339744126965478, "learning_rate": 1.984126984126984e-05, - "loss": 0.3653, + "loss": 0.3635, "step": 901 }, { "epoch": 1.931477516059957, - "grad_norm": 0.2010065237557244, + "grad_norm": 0.19470480788266056, "learning_rate": 1.9801587301587302e-05, - "loss": 0.346, + "loss": 0.3444, "step": 902 }, { "epoch": 1.9336188436830835, - "grad_norm": 0.22109341655275905, + "grad_norm": 0.22460440732783768, "learning_rate": 1.9761904761904763e-05, - "loss": 0.3811, + "loss": 0.379, "step": 903 }, { "epoch": 1.9357601713062098, - "grad_norm": 0.21704603464265765, + "grad_norm": 0.22213495508308273, "learning_rate": 1.9722222222222224e-05, - "loss": 0.3697, + "loss": 0.3676, "step": 904 }, { "epoch": 1.9379014989293362, - "grad_norm": 0.19589016044717736, + "grad_norm": 0.20343330118385294, "learning_rate": 1.9682539682539684e-05, - "loss": 0.3561, + "loss": 0.3546, "step": 905 }, { "epoch": 1.9400428265524625, - "grad_norm": 0.1844953806039601, + "grad_norm": 0.19101512005815446, "learning_rate": 1.9642857142857145e-05, - "loss": 0.355, + "loss": 0.3545, "step": 906 }, { "epoch": 1.9421841541755889, - "grad_norm": 0.2450452225863573, + "grad_norm": 0.27419563179542383, "learning_rate": 1.9603174603174602e-05, - "loss": 0.34, + "loss": 0.339, "step": 907 }, { "epoch": 1.9443254817987152, - "grad_norm": 0.1985635379433612, + "grad_norm": 0.20460100369509523, "learning_rate": 1.9563492063492063e-05, - "loss": 0.3724, + "loss": 0.3713, "step": 908 }, { "epoch": 1.9464668094218416, - "grad_norm": 0.20873253034957434, + "grad_norm": 0.21726477076370293, "learning_rate": 1.9523809523809524e-05, - "loss": 0.3693, + "loss": 0.3674, "step": 909 }, { "epoch": 1.948608137044968, - "grad_norm": 0.22144116581790702, + "grad_norm": 0.2366081598522377, "learning_rate": 1.9484126984126985e-05, - "loss": 0.3729, + "loss": 0.3716, "step": 910 }, { "epoch": 1.9507494646680943, - "grad_norm": 0.23269442568625717, + "grad_norm": 0.2309915328014547, "learning_rate": 1.9444444444444445e-05, - "loss": 0.3807, + "loss": 0.3777, "step": 911 }, { "epoch": 1.9528907922912206, - "grad_norm": 0.21675592494303747, + "grad_norm": 0.21457199704732352, "learning_rate": 1.9404761904761906e-05, - "loss": 0.3533, + "loss": 0.3519, "step": 912 }, { "epoch": 1.955032119914347, - "grad_norm": 0.2197742949597075, + "grad_norm": 0.21421731139535555, "learning_rate": 1.9365079365079367e-05, - "loss": 0.3607, + "loss": 0.3595, "step": 913 }, { "epoch": 1.9571734475374734, - "grad_norm": 0.21883406348238188, + "grad_norm": 0.2227014663977621, "learning_rate": 1.9325396825396828e-05, - "loss": 0.3635, + "loss": 0.3618, "step": 914 }, { "epoch": 1.9593147751605997, - "grad_norm": 0.2026715665088903, + "grad_norm": 0.19983383858352952, "learning_rate": 1.928571428571429e-05, - "loss": 0.358, + "loss": 0.3566, "step": 915 }, { "epoch": 1.961456102783726, - "grad_norm": 0.1974253749656061, + "grad_norm": 0.19358450153214074, "learning_rate": 1.924603174603175e-05, - "loss": 0.3474, + "loss": 0.3456, "step": 916 }, { "epoch": 1.9635974304068522, - "grad_norm": 0.19294611529574526, + "grad_norm": 0.19694776679615417, "learning_rate": 1.920634920634921e-05, - "loss": 0.3625, + "loss": 0.3601, "step": 917 }, { "epoch": 1.9657387580299786, - "grad_norm": 0.22729056002121276, + "grad_norm": 0.20280005475553886, "learning_rate": 1.9166666666666667e-05, - "loss": 0.3548, + "loss": 0.3537, "step": 918 }, { "epoch": 1.967880085653105, - "grad_norm": 0.20766953098408916, + "grad_norm": 0.19909912132770102, "learning_rate": 1.9126984126984128e-05, - "loss": 0.3599, + "loss": 0.3579, "step": 919 }, { "epoch": 1.9700214132762313, - "grad_norm": 0.23817301013936817, + "grad_norm": 0.22713082416817215, "learning_rate": 1.9087301587301585e-05, - "loss": 0.3787, + "loss": 0.3768, "step": 920 }, { "epoch": 1.9721627408993576, - "grad_norm": 0.21184335304060803, + "grad_norm": 0.211019913621537, "learning_rate": 1.9047619047619046e-05, - "loss": 0.356, + "loss": 0.3541, "step": 921 }, { "epoch": 1.974304068522484, - "grad_norm": 0.19810583708611373, + "grad_norm": 0.20335978017528178, "learning_rate": 1.9007936507936507e-05, - "loss": 0.373, + "loss": 0.3715, "step": 922 }, { "epoch": 1.9764453961456103, - "grad_norm": 0.217549480144923, + "grad_norm": 0.2138102480162949, "learning_rate": 1.8968253968253968e-05, - "loss": 0.3611, + "loss": 0.3591, "step": 923 }, { "epoch": 1.9785867237687365, - "grad_norm": 0.2376544391246724, + "grad_norm": 0.24453013877035817, "learning_rate": 1.892857142857143e-05, - "loss": 0.3867, + "loss": 0.3853, "step": 924 }, { "epoch": 1.9807280513918628, - "grad_norm": 0.19811936382123432, + "grad_norm": 0.20138123670876026, "learning_rate": 1.888888888888889e-05, - "loss": 0.3706, + "loss": 0.3693, "step": 925 }, { "epoch": 1.9828693790149892, - "grad_norm": 0.196442826279356, + "grad_norm": 0.2130228707007481, "learning_rate": 1.884920634920635e-05, - "loss": 0.3766, + "loss": 0.3756, "step": 926 }, { "epoch": 1.9850107066381155, - "grad_norm": 0.21591468021679908, + "grad_norm": 0.21159991261313507, "learning_rate": 1.880952380952381e-05, - "loss": 0.3573, + "loss": 0.3559, "step": 927 }, { "epoch": 1.9871520342612419, - "grad_norm": 0.20004992781857317, + "grad_norm": 0.1926680844736931, "learning_rate": 1.876984126984127e-05, - "loss": 0.3573, + "loss": 0.3564, "step": 928 }, { "epoch": 1.9892933618843682, - "grad_norm": 0.18155989142218865, + "grad_norm": 0.19459069355708655, "learning_rate": 1.8730158730158732e-05, - "loss": 0.3508, + "loss": 0.3496, "step": 929 }, { "epoch": 1.9914346895074946, - "grad_norm": 0.19220975795928547, + "grad_norm": 0.2133220689283602, "learning_rate": 1.8690476190476193e-05, - "loss": 0.3665, + "loss": 0.3651, "step": 930 }, { "epoch": 1.993576017130621, - "grad_norm": 0.20680633105790955, + "grad_norm": 0.2027554437180913, "learning_rate": 1.8650793650793654e-05, - "loss": 0.3709, + "loss": 0.3695, "step": 931 }, { "epoch": 1.9957173447537473, - "grad_norm": 0.19718351617490748, + "grad_norm": 0.19702179225284736, "learning_rate": 1.861111111111111e-05, - "loss": 0.3674, + "loss": 0.3659, "step": 932 }, { "epoch": 1.9978586723768736, - "grad_norm": 0.20385992307338152, + "grad_norm": 0.21240087364498741, "learning_rate": 1.8571428571428572e-05, - "loss": 0.3647, + "loss": 0.362, "step": 933 }, { "epoch": 2.0, - "grad_norm": 0.19952173716760616, + "grad_norm": 0.21641052576639658, "learning_rate": 1.8531746031746032e-05, - "loss": 0.3498, + "loss": 0.3479, "step": 934 }, { "epoch": 2.0021413276231264, - "grad_norm": 0.2887390533219721, + "grad_norm": 0.3127795260229852, "learning_rate": 1.8492063492063493e-05, - "loss": 0.2978, + "loss": 0.2938, "step": 935 }, { "epoch": 2.0042826552462527, - "grad_norm": 0.2261310029497388, + "grad_norm": 0.20537155001189475, "learning_rate": 1.8452380952380954e-05, - "loss": 0.2753, + "loss": 0.2725, "step": 936 }, { "epoch": 2.006423982869379, - "grad_norm": 0.19608695195304876, + "grad_norm": 0.4035860023057039, "learning_rate": 1.8412698412698415e-05, - "loss": 0.2894, + "loss": 0.2877, "step": 937 }, { "epoch": 2.0085653104925054, - "grad_norm": 0.2670758274924303, + "grad_norm": 0.2729391560493891, "learning_rate": 1.8373015873015875e-05, - "loss": 0.2855, + "loss": 0.2821, "step": 938 }, { "epoch": 2.0107066381156318, - "grad_norm": 0.2729391112548352, + "grad_norm": 0.23930664003910482, "learning_rate": 1.8333333333333333e-05, - "loss": 0.2766, + "loss": 0.2732, "step": 939 }, { "epoch": 2.012847965738758, - "grad_norm": 0.2268053076448433, + "grad_norm": 0.2534172257988802, "learning_rate": 1.8293650793650794e-05, - "loss": 0.2825, + "loss": 0.2823, "step": 940 }, { "epoch": 2.0149892933618845, - "grad_norm": 0.22891152322973052, + "grad_norm": 0.2842003636688084, "learning_rate": 1.8253968253968254e-05, - "loss": 0.2704, + "loss": 0.2699, "step": 941 }, { "epoch": 2.017130620985011, - "grad_norm": 0.24951718116860663, + "grad_norm": 0.2546284637134941, "learning_rate": 1.8214285714285715e-05, - "loss": 0.2925, + "loss": 0.2898, "step": 942 }, { "epoch": 2.019271948608137, - "grad_norm": 0.24317478809263185, + "grad_norm": 0.25923976328845644, "learning_rate": 1.8174603174603176e-05, - "loss": 0.2852, + "loss": 0.2835, "step": 943 }, { "epoch": 2.0214132762312635, - "grad_norm": 0.22035333150669048, + "grad_norm": 0.2694090119639284, "learning_rate": 1.8134920634920637e-05, - "loss": 0.28, + "loss": 0.2789, "step": 944 }, { "epoch": 2.02355460385439, - "grad_norm": 0.22762363253514034, + "grad_norm": 0.2560183056621401, "learning_rate": 1.8095238095238094e-05, - "loss": 0.2894, + "loss": 0.288, "step": 945 }, { "epoch": 2.0256959314775163, - "grad_norm": 0.22788105608795867, + "grad_norm": 0.20946395889175382, "learning_rate": 1.8055555555555555e-05, - "loss": 0.2743, + "loss": 0.2726, "step": 946 }, { "epoch": 2.0278372591006426, - "grad_norm": 0.20269423040955745, + "grad_norm": 0.23286538060698495, "learning_rate": 1.8015873015873015e-05, - "loss": 0.2663, + "loss": 0.2649, "step": 947 }, { "epoch": 2.0299785867237685, - "grad_norm": 0.22523427656669212, + "grad_norm": 0.2812062902226991, "learning_rate": 1.7976190476190476e-05, - "loss": 0.2781, + "loss": 0.2761, "step": 948 }, { "epoch": 2.032119914346895, - "grad_norm": 0.21588221204986352, + "grad_norm": 0.21066691235196103, "learning_rate": 1.7936507936507937e-05, - "loss": 0.2788, + "loss": 0.2767, "step": 949 }, { "epoch": 2.0342612419700212, - "grad_norm": 0.195728356650988, + "grad_norm": 0.22546963160204261, "learning_rate": 1.7896825396825398e-05, - "loss": 0.2769, + "loss": 0.2748, "step": 950 }, { "epoch": 2.0364025695931476, - "grad_norm": 0.2161839738558612, + "grad_norm": 0.2695316483089908, "learning_rate": 1.785714285714286e-05, - "loss": 0.2941, + "loss": 0.2931, "step": 951 }, { "epoch": 2.038543897216274, - "grad_norm": 0.23767256005231727, + "grad_norm": 0.224650667985638, "learning_rate": 1.781746031746032e-05, - "loss": 0.2895, + "loss": 0.286, "step": 952 }, { "epoch": 2.0406852248394003, - "grad_norm": 0.20171938363284947, + "grad_norm": 0.23425810863567934, "learning_rate": 1.777777777777778e-05, - "loss": 0.2781, + "loss": 0.2768, "step": 953 }, { "epoch": 2.0428265524625266, - "grad_norm": 0.1856341413030586, + "grad_norm": 0.21885265388287464, "learning_rate": 1.773809523809524e-05, - "loss": 0.2685, + "loss": 0.2664, "step": 954 }, { "epoch": 2.044967880085653, - "grad_norm": 0.2145432452323137, + "grad_norm": 0.2220128915627926, "learning_rate": 1.76984126984127e-05, - "loss": 0.281, + "loss": 0.2792, "step": 955 }, { "epoch": 2.0471092077087794, - "grad_norm": 0.2112321163664146, + "grad_norm": 0.21795261361627535, "learning_rate": 1.7658730158730162e-05, - "loss": 0.2977, + "loss": 0.2963, "step": 956 }, { "epoch": 2.0492505353319057, - "grad_norm": 0.2067934311888714, + "grad_norm": 0.22853139984187426, "learning_rate": 1.761904761904762e-05, - "loss": 0.2793, + "loss": 0.2774, "step": 957 }, { "epoch": 2.051391862955032, - "grad_norm": 0.21063958513546374, + "grad_norm": 0.21005634342559973, "learning_rate": 1.757936507936508e-05, - "loss": 0.2807, + "loss": 0.278, "step": 958 }, { "epoch": 2.0535331905781584, - "grad_norm": 0.19678228876621853, + "grad_norm": 0.1972736100196403, "learning_rate": 1.7539682539682538e-05, - "loss": 0.2773, + "loss": 0.276, "step": 959 }, { "epoch": 2.0556745182012848, - "grad_norm": 0.1926373898265207, + "grad_norm": 0.2013981720969393, "learning_rate": 1.75e-05, - "loss": 0.2747, + "loss": 0.2725, "step": 960 }, { "epoch": 2.057815845824411, - "grad_norm": 0.20704097224369902, + "grad_norm": 0.21114011142047054, "learning_rate": 1.746031746031746e-05, - "loss": 0.2722, + "loss": 0.2692, "step": 961 }, { "epoch": 2.0599571734475375, - "grad_norm": 0.19808242466728723, + "grad_norm": 0.19810710536290826, "learning_rate": 1.742063492063492e-05, - "loss": 0.271, + "loss": 0.2689, "step": 962 }, { "epoch": 2.062098501070664, - "grad_norm": 0.19665606509062533, + "grad_norm": 0.20962838769163566, "learning_rate": 1.738095238095238e-05, - "loss": 0.2769, + "loss": 0.2755, "step": 963 }, { "epoch": 2.06423982869379, - "grad_norm": 0.20159555272227964, + "grad_norm": 0.2007780468677198, "learning_rate": 1.734126984126984e-05, - "loss": 0.2754, + "loss": 0.2742, "step": 964 }, { "epoch": 2.0663811563169165, - "grad_norm": 0.20715083834741455, + "grad_norm": 0.20730329087379573, "learning_rate": 1.7301587301587302e-05, - "loss": 0.2819, + "loss": 0.2796, "step": 965 }, { "epoch": 2.068522483940043, - "grad_norm": 0.1994028177934581, + "grad_norm": 0.20422916553473747, "learning_rate": 1.7261904761904763e-05, - "loss": 0.2737, + "loss": 0.2717, "step": 966 }, { "epoch": 2.0706638115631693, - "grad_norm": 0.21226238723876503, + "grad_norm": 0.21624827975621064, "learning_rate": 1.7222222222222224e-05, - "loss": 0.2755, + "loss": 0.2736, "step": 967 }, { "epoch": 2.0728051391862956, - "grad_norm": 0.21621507021709063, + "grad_norm": 0.211530596933148, "learning_rate": 1.7182539682539684e-05, - "loss": 0.2871, + "loss": 0.2856, "step": 968 }, { "epoch": 2.074946466809422, - "grad_norm": 0.2054891028222431, + "grad_norm": 0.21392283135095658, "learning_rate": 1.7142857142857145e-05, - "loss": 0.281, + "loss": 0.2784, "step": 969 }, { "epoch": 2.0770877944325483, - "grad_norm": 0.20802135509652517, + "grad_norm": 0.22575272429775897, "learning_rate": 1.7103174603174606e-05, - "loss": 0.2706, + "loss": 0.2683, "step": 970 }, { "epoch": 2.0792291220556747, - "grad_norm": 0.19737703811731966, + "grad_norm": 0.20807075241886105, "learning_rate": 1.7063492063492063e-05, - "loss": 0.2825, + "loss": 0.2811, "step": 971 }, { "epoch": 2.081370449678801, - "grad_norm": 0.18858696479602974, + "grad_norm": 0.19696007356880943, "learning_rate": 1.7023809523809524e-05, - "loss": 0.2631, + "loss": 0.2616, "step": 972 }, { "epoch": 2.0835117773019274, - "grad_norm": 0.19571849751196396, + "grad_norm": 0.2067907488812531, "learning_rate": 1.6984126984126985e-05, - "loss": 0.2878, + "loss": 0.2867, "step": 973 }, { "epoch": 2.0856531049250537, - "grad_norm": 0.19389617623346578, + "grad_norm": 0.20785661967310565, "learning_rate": 1.6944444444444446e-05, - "loss": 0.2821, + "loss": 0.2804, "step": 974 }, { "epoch": 2.08779443254818, - "grad_norm": 0.19768747519412655, + "grad_norm": 0.21316354407545085, "learning_rate": 1.6904761904761906e-05, - "loss": 0.2836, + "loss": 0.2825, "step": 975 }, { "epoch": 2.089935760171306, - "grad_norm": 0.19370599491498322, + "grad_norm": 0.20417574340319766, "learning_rate": 1.6865079365079367e-05, - "loss": 0.2712, + "loss": 0.2698, "step": 976 }, { "epoch": 2.0920770877944324, - "grad_norm": 0.20240036491722738, + "grad_norm": 0.20059086132923634, "learning_rate": 1.6825396825396828e-05, - "loss": 0.287, + "loss": 0.2838, "step": 977 }, { "epoch": 2.0942184154175587, - "grad_norm": 0.2036262743697464, + "grad_norm": 0.20806774530665742, "learning_rate": 1.6785714285714285e-05, - "loss": 0.297, + "loss": 0.2954, "step": 978 }, { "epoch": 2.096359743040685, - "grad_norm": 0.20250473360666518, + "grad_norm": 0.21795686396623343, "learning_rate": 1.6746031746031746e-05, - "loss": 0.2777, + "loss": 0.2758, "step": 979 }, { "epoch": 2.0985010706638114, - "grad_norm": 0.20192268740605415, + "grad_norm": 0.20525399917369724, "learning_rate": 1.6706349206349207e-05, - "loss": 0.2864, + "loss": 0.2852, "step": 980 }, { "epoch": 2.1006423982869378, - "grad_norm": 0.19075170756571364, + "grad_norm": 0.20153493188561758, "learning_rate": 1.6666666666666667e-05, - "loss": 0.2701, + "loss": 0.2671, "step": 981 }, { "epoch": 2.102783725910064, - "grad_norm": 0.19697486168056122, + "grad_norm": 0.21682029099052538, "learning_rate": 1.6626984126984128e-05, - "loss": 0.2811, + "loss": 0.2794, "step": 982 }, { "epoch": 2.1049250535331905, - "grad_norm": 0.19096670479918926, + "grad_norm": 0.20982679296246803, "learning_rate": 1.658730158730159e-05, - "loss": 0.2769, + "loss": 0.2755, "step": 983 }, { "epoch": 2.107066381156317, - "grad_norm": 0.19574292129874354, + "grad_norm": 0.20945525288086686, "learning_rate": 1.6547619047619046e-05, - "loss": 0.2701, + "loss": 0.2675, "step": 984 }, { "epoch": 2.109207708779443, - "grad_norm": 0.21404348512022472, + "grad_norm": 0.21376143878454182, "learning_rate": 1.6507936507936507e-05, - "loss": 0.2849, + "loss": 0.2832, "step": 985 }, { "epoch": 2.1113490364025695, - "grad_norm": 0.19567445704922734, + "grad_norm": 0.19436511910516777, "learning_rate": 1.6468253968253968e-05, - "loss": 0.2753, + "loss": 0.2737, "step": 986 }, { "epoch": 2.113490364025696, - "grad_norm": 0.2069043774274054, + "grad_norm": 0.20930636270391903, "learning_rate": 1.642857142857143e-05, - "loss": 0.2776, + "loss": 0.2758, "step": 987 }, { "epoch": 2.1156316916488223, - "grad_norm": 0.22126156756781362, + "grad_norm": 0.23244369003497906, "learning_rate": 1.638888888888889e-05, - "loss": 0.2979, + "loss": 0.296, "step": 988 }, { "epoch": 2.1177730192719486, - "grad_norm": 0.21047914160745446, + "grad_norm": 0.19066256457699418, "learning_rate": 1.634920634920635e-05, - "loss": 0.2759, + "loss": 0.2738, "step": 989 }, { "epoch": 2.119914346895075, - "grad_norm": 0.1866961799813759, + "grad_norm": 0.19911072991641976, "learning_rate": 1.630952380952381e-05, - "loss": 0.2734, + "loss": 0.2715, "step": 990 }, { "epoch": 2.1220556745182013, - "grad_norm": 0.19697846429602314, + "grad_norm": 0.21201959394684514, "learning_rate": 1.626984126984127e-05, - "loss": 0.2932, + "loss": 0.2922, "step": 991 }, { "epoch": 2.1241970021413277, - "grad_norm": 0.1907591280617684, + "grad_norm": 0.1967089705488496, "learning_rate": 1.6230158730158732e-05, - "loss": 0.2743, + "loss": 0.2715, "step": 992 }, { "epoch": 2.126338329764454, - "grad_norm": 0.18785957871532322, + "grad_norm": 0.19369231393643502, "learning_rate": 1.6190476190476193e-05, - "loss": 0.2797, + "loss": 0.2784, "step": 993 }, { "epoch": 2.1284796573875804, - "grad_norm": 0.1881059477765473, + "grad_norm": 0.2040785519443548, "learning_rate": 1.6150793650793654e-05, - "loss": 0.2795, + "loss": 0.2767, "step": 994 }, { "epoch": 2.1306209850107067, - "grad_norm": 0.1895829464393248, + "grad_norm": 0.1898501436472166, "learning_rate": 1.6111111111111115e-05, - "loss": 0.2699, + "loss": 0.2678, "step": 995 }, { "epoch": 2.132762312633833, - "grad_norm": 0.1894894322187353, + "grad_norm": 0.20111950499112383, "learning_rate": 1.6071428571428572e-05, - "loss": 0.2839, + "loss": 0.2823, "step": 996 }, { "epoch": 2.1349036402569594, - "grad_norm": 0.20942109541121104, + "grad_norm": 0.1981498781288381, "learning_rate": 1.6031746031746033e-05, - "loss": 0.276, + "loss": 0.2747, "step": 997 }, { "epoch": 2.137044967880086, - "grad_norm": 0.1929206423206803, + "grad_norm": 0.19271209341880322, "learning_rate": 1.599206349206349e-05, - "loss": 0.2781, + "loss": 0.2752, "step": 998 }, { "epoch": 2.139186295503212, - "grad_norm": 0.20059791418349765, + "grad_norm": 0.2221235505452683, "learning_rate": 1.595238095238095e-05, - "loss": 0.2731, + "loss": 0.2716, "step": 999 }, { "epoch": 2.1413276231263385, - "grad_norm": 0.19734665993628578, + "grad_norm": 0.19713198559950088, "learning_rate": 1.591269841269841e-05, - "loss": 0.2841, + "loss": 0.2812, "step": 1000 }, { "epoch": 2.143468950749465, - "grad_norm": 0.19634335711712297, + "grad_norm": 0.19897355551674525, "learning_rate": 1.5873015873015872e-05, - "loss": 0.2861, + "loss": 0.2846, "step": 1001 }, { "epoch": 2.145610278372591, - "grad_norm": 0.18641688183989846, + "grad_norm": 0.19298432452205258, "learning_rate": 1.5833333333333333e-05, - "loss": 0.2699, + "loss": 0.2682, "step": 1002 }, { "epoch": 2.147751605995717, - "grad_norm": 0.19839241096997418, + "grad_norm": 0.2019146536667124, "learning_rate": 1.5793650793650794e-05, - "loss": 0.2746, + "loss": 0.2735, "step": 1003 }, { "epoch": 2.1498929336188435, - "grad_norm": 0.1903736062839147, + "grad_norm": 0.20154869107354417, "learning_rate": 1.5753968253968255e-05, - "loss": 0.2649, + "loss": 0.2632, "step": 1004 }, { "epoch": 2.15203426124197, - "grad_norm": 0.19577687838702312, + "grad_norm": 0.19363808622473339, "learning_rate": 1.5714285714285715e-05, - "loss": 0.2885, + "loss": 0.2874, "step": 1005 }, { "epoch": 2.154175588865096, - "grad_norm": 0.20084961494956982, + "grad_norm": 0.2046720100698483, "learning_rate": 1.5674603174603176e-05, - "loss": 0.277, + "loss": 0.2754, "step": 1006 }, { "epoch": 2.1563169164882225, - "grad_norm": 0.19044578659203962, + "grad_norm": 0.1930006937085633, "learning_rate": 1.5634920634920637e-05, - "loss": 0.2759, + "loss": 0.2726, "step": 1007 }, { "epoch": 2.158458244111349, - "grad_norm": 0.20168687457620502, + "grad_norm": 0.20734097354646286, "learning_rate": 1.5595238095238098e-05, - "loss": 0.2883, + "loss": 0.2879, "step": 1008 }, { "epoch": 2.1605995717344753, - "grad_norm": 0.1917192379674667, + "grad_norm": 0.19538298745906257, "learning_rate": 1.5555555555555555e-05, - "loss": 0.2822, + "loss": 0.2809, "step": 1009 }, { "epoch": 2.1627408993576016, - "grad_norm": 0.21720443733726488, + "grad_norm": 0.2252535619453128, "learning_rate": 1.5515873015873016e-05, - "loss": 0.287, + "loss": 0.286, "step": 1010 }, { "epoch": 2.164882226980728, - "grad_norm": 0.19059089571719792, + "grad_norm": 0.19426151913359876, "learning_rate": 1.5476190476190476e-05, - "loss": 0.2827, + "loss": 0.2806, "step": 1011 }, { "epoch": 2.1670235546038543, - "grad_norm": 0.20266248908092088, + "grad_norm": 0.20018884891444158, "learning_rate": 1.5436507936507937e-05, - "loss": 0.2944, + "loss": 0.2922, "step": 1012 }, { "epoch": 2.1691648822269807, - "grad_norm": 0.20424509394375523, + "grad_norm": 0.20745084715389592, "learning_rate": 1.5396825396825398e-05, - "loss": 0.2736, + "loss": 0.272, "step": 1013 }, { "epoch": 2.171306209850107, - "grad_norm": 0.19086220079548602, + "grad_norm": 0.2005966941166649, "learning_rate": 1.535714285714286e-05, - "loss": 0.2772, + "loss": 0.2748, "step": 1014 }, { "epoch": 2.1734475374732334, - "grad_norm": 0.19412331734644836, + "grad_norm": 0.1959554222950539, "learning_rate": 1.531746031746032e-05, - "loss": 0.2748, + "loss": 0.2725, "step": 1015 }, { "epoch": 2.1755888650963597, - "grad_norm": 0.20630378133521415, + "grad_norm": 0.2024673027309239, "learning_rate": 1.527777777777778e-05, - "loss": 0.2938, + "loss": 0.2925, "step": 1016 }, { "epoch": 2.177730192719486, - "grad_norm": 0.18492977798415206, + "grad_norm": 0.18968936546486206, "learning_rate": 1.5238095238095241e-05, - "loss": 0.2745, + "loss": 0.2734, "step": 1017 }, { "epoch": 2.1798715203426124, - "grad_norm": 0.2039798887458119, + "grad_norm": 0.21451121125699396, "learning_rate": 1.5198412698412698e-05, - "loss": 0.2755, + "loss": 0.2729, "step": 1018 }, { "epoch": 2.182012847965739, - "grad_norm": 0.20121459942322972, + "grad_norm": 0.20234427279725273, "learning_rate": 1.5158730158730159e-05, - "loss": 0.2862, + "loss": 0.2846, "step": 1019 }, { "epoch": 2.184154175588865, - "grad_norm": 0.1827134805798544, + "grad_norm": 0.19307169286798295, "learning_rate": 1.5119047619047618e-05, - "loss": 0.2718, + "loss": 0.2704, "step": 1020 }, { "epoch": 2.1862955032119915, - "grad_norm": 0.17917411734145225, + "grad_norm": 0.1831847011586216, "learning_rate": 1.5079365079365079e-05, - "loss": 0.2695, + "loss": 0.2682, "step": 1021 }, { "epoch": 2.188436830835118, - "grad_norm": 0.18767988072287312, + "grad_norm": 0.19718660927111226, "learning_rate": 1.503968253968254e-05, - "loss": 0.2684, + "loss": 0.2665, "step": 1022 }, { "epoch": 2.190578158458244, - "grad_norm": 0.1942652275305181, + "grad_norm": 0.21657711095140936, "learning_rate": 1.5e-05, - "loss": 0.2803, + "loss": 0.2793, "step": 1023 }, { "epoch": 2.1927194860813706, - "grad_norm": 0.20926677447772102, + "grad_norm": 0.2085788754213796, "learning_rate": 1.4960317460317461e-05, - "loss": 0.2843, + "loss": 0.2816, "step": 1024 }, { "epoch": 2.194860813704497, - "grad_norm": 0.1991623873969263, + "grad_norm": 0.20682253217524985, "learning_rate": 1.4920634920634922e-05, - "loss": 0.2874, + "loss": 0.2861, "step": 1025 }, { "epoch": 2.1970021413276233, - "grad_norm": 0.18616156346828455, + "grad_norm": 0.19353447072521687, "learning_rate": 1.4880952380952381e-05, - "loss": 0.2836, + "loss": 0.2818, "step": 1026 }, { "epoch": 2.1991434689507496, - "grad_norm": 0.2001544304384233, + "grad_norm": 0.2023000282481551, "learning_rate": 1.4841269841269842e-05, - "loss": 0.292, + "loss": 0.2897, "step": 1027 }, { "epoch": 2.201284796573876, - "grad_norm": 0.19807876601861757, + "grad_norm": 0.2091382321658723, "learning_rate": 1.4801587301587302e-05, - "loss": 0.2815, + "loss": 0.2803, "step": 1028 }, { "epoch": 2.2034261241970023, - "grad_norm": 0.18403372703681253, + "grad_norm": 0.19040214411798756, "learning_rate": 1.4761904761904763e-05, - "loss": 0.2733, + "loss": 0.2709, "step": 1029 }, { "epoch": 2.2055674518201283, - "grad_norm": 0.1886899672546482, + "grad_norm": 0.18805950537913466, "learning_rate": 1.4722222222222224e-05, - "loss": 0.2664, + "loss": 0.2641, "step": 1030 }, { "epoch": 2.207708779443255, - "grad_norm": 0.19490843580765388, + "grad_norm": 0.19809642692965637, "learning_rate": 1.4682539682539683e-05, - "loss": 0.2899, + "loss": 0.2886, "step": 1031 }, { "epoch": 2.209850107066381, - "grad_norm": 0.18904649297785464, + "grad_norm": 0.1912784314191019, "learning_rate": 1.4642857142857144e-05, - "loss": 0.2805, + "loss": 0.2795, "step": 1032 }, { "epoch": 2.2119914346895073, - "grad_norm": 0.18795150882002612, + "grad_norm": 0.1966882118912259, "learning_rate": 1.4603174603174605e-05, - "loss": 0.2855, + "loss": 0.2837, "step": 1033 }, { "epoch": 2.2141327623126337, - "grad_norm": 0.19796365275830283, + "grad_norm": 0.19981226873116584, "learning_rate": 1.4563492063492065e-05, - "loss": 0.2818, + "loss": 0.2798, "step": 1034 }, { "epoch": 2.21627408993576, - "grad_norm": 0.18219404726440058, + "grad_norm": 0.19609944493947298, "learning_rate": 1.4523809523809526e-05, - "loss": 0.287, + "loss": 0.2851, "step": 1035 }, { "epoch": 2.2184154175588864, - "grad_norm": 0.19237435500214886, + "grad_norm": 0.20172541461591478, "learning_rate": 1.4484126984126987e-05, - "loss": 0.2821, + "loss": 0.2806, "step": 1036 }, { "epoch": 2.2205567451820127, - "grad_norm": 0.21247677887677272, + "grad_norm": 0.2184127818693799, "learning_rate": 1.4444444444444444e-05, - "loss": 0.2806, + "loss": 0.2788, "step": 1037 }, { "epoch": 2.222698072805139, - "grad_norm": 0.19613559228915892, + "grad_norm": 0.1934456256040119, "learning_rate": 1.4404761904761905e-05, - "loss": 0.2705, + "loss": 0.2684, "step": 1038 }, { "epoch": 2.2248394004282654, - "grad_norm": 0.20206602453648073, + "grad_norm": 0.2082433040162603, "learning_rate": 1.4365079365079364e-05, - "loss": 0.2936, + "loss": 0.2923, "step": 1039 }, { "epoch": 2.226980728051392, - "grad_norm": 0.21986971551175186, + "grad_norm": 0.22380158655105736, "learning_rate": 1.4325396825396825e-05, - "loss": 0.281, + "loss": 0.2779, "step": 1040 }, { "epoch": 2.229122055674518, - "grad_norm": 0.19544790141801302, + "grad_norm": 0.1942322520767476, "learning_rate": 1.4285714285714285e-05, - "loss": 0.2772, + "loss": 0.2759, "step": 1041 }, { "epoch": 2.2312633832976445, - "grad_norm": 0.187999178943079, + "grad_norm": 0.1974411304770536, "learning_rate": 1.4246031746031746e-05, - "loss": 0.2712, + "loss": 0.2688, "step": 1042 }, { "epoch": 2.233404710920771, - "grad_norm": 0.20155673411774347, + "grad_norm": 0.19709284568411106, "learning_rate": 1.4206349206349207e-05, - "loss": 0.2796, + "loss": 0.2789, "step": 1043 }, { "epoch": 2.235546038543897, - "grad_norm": 0.24201763616546484, + "grad_norm": 0.19313497813772162, "learning_rate": 1.4166666666666668e-05, - "loss": 0.2821, + "loss": 0.28, "step": 1044 }, { "epoch": 2.2376873661670236, - "grad_norm": 0.20108574451228317, + "grad_norm": 0.19756395359956005, "learning_rate": 1.4126984126984127e-05, - "loss": 0.2886, + "loss": 0.287, "step": 1045 }, { "epoch": 2.23982869379015, - "grad_norm": 0.19434223451617216, + "grad_norm": 0.19707117382557698, "learning_rate": 1.4087301587301587e-05, - "loss": 0.2814, + "loss": 0.2793, "step": 1046 }, { "epoch": 2.2419700214132763, - "grad_norm": 0.19496558913111342, + "grad_norm": 0.2185295361546076, "learning_rate": 1.4047619047619048e-05, - "loss": 0.2809, + "loss": 0.2801, "step": 1047 }, { "epoch": 2.2441113490364026, - "grad_norm": 0.20072045682891357, + "grad_norm": 0.1959782383553124, "learning_rate": 1.4007936507936509e-05, - "loss": 0.2791, + "loss": 0.2778, "step": 1048 }, { "epoch": 2.246252676659529, - "grad_norm": 0.19738112557289852, + "grad_norm": 0.20220432730319396, "learning_rate": 1.396825396825397e-05, - "loss": 0.2756, + "loss": 0.2739, "step": 1049 }, { "epoch": 2.2483940042826553, - "grad_norm": 0.18581301381293844, + "grad_norm": 0.203202312939082, "learning_rate": 1.392857142857143e-05, - "loss": 0.2696, + "loss": 0.2686, "step": 1050 }, { "epoch": 2.2505353319057817, - "grad_norm": 0.19262455905174403, + "grad_norm": 0.19629196239471303, "learning_rate": 1.388888888888889e-05, - "loss": 0.2758, + "loss": 0.2741, "step": 1051 }, { "epoch": 2.252676659528908, - "grad_norm": 0.2044076909216322, + "grad_norm": 0.20785913348172835, "learning_rate": 1.384920634920635e-05, - "loss": 0.2759, + "loss": 0.2738, "step": 1052 }, { "epoch": 2.2548179871520344, - "grad_norm": 0.20744550401581244, + "grad_norm": 0.20245154360926115, "learning_rate": 1.3809523809523811e-05, - "loss": 0.2952, + "loss": 0.2944, "step": 1053 }, { "epoch": 2.2569593147751608, - "grad_norm": 0.19946431730972056, + "grad_norm": 0.2096114189668644, "learning_rate": 1.3769841269841272e-05, - "loss": 0.2776, + "loss": 0.2759, "step": 1054 }, { "epoch": 2.259100642398287, - "grad_norm": 0.2061227664972564, + "grad_norm": 0.24320117601366367, "learning_rate": 1.3730158730158733e-05, - "loss": 0.273, + "loss": 0.271, "step": 1055 }, { "epoch": 2.2612419700214135, - "grad_norm": 0.1958943903490819, + "grad_norm": 0.19424837338985246, "learning_rate": 1.3690476190476192e-05, - "loss": 0.281, + "loss": 0.2793, "step": 1056 }, { "epoch": 2.2633832976445394, - "grad_norm": 0.17986444789632078, + "grad_norm": 0.19235543786637513, "learning_rate": 1.365079365079365e-05, - "loss": 0.282, + "loss": 0.2797, "step": 1057 }, { "epoch": 2.265524625267666, - "grad_norm": 0.20331838773796787, + "grad_norm": 0.2014689883501026, "learning_rate": 1.3611111111111111e-05, - "loss": 0.2822, + "loss": 0.2804, "step": 1058 }, { "epoch": 2.267665952890792, - "grad_norm": 0.1854886814182152, + "grad_norm": 0.19388371477246316, "learning_rate": 1.357142857142857e-05, - "loss": 0.2871, + "loss": 0.2857, "step": 1059 }, { "epoch": 2.2698072805139184, - "grad_norm": 0.19038399977635145, + "grad_norm": 0.18600354706155486, "learning_rate": 1.3531746031746031e-05, - "loss": 0.2705, + "loss": 0.268, "step": 1060 }, { "epoch": 2.271948608137045, - "grad_norm": 0.18815130656204335, + "grad_norm": 0.1997457077450587, "learning_rate": 1.3492063492063492e-05, - "loss": 0.2756, + "loss": 0.2744, "step": 1061 }, { "epoch": 2.274089935760171, - "grad_norm": 0.19455215664990538, + "grad_norm": 0.21254455192384775, "learning_rate": 1.3452380952380953e-05, - "loss": 0.283, + "loss": 0.281, "step": 1062 }, { "epoch": 2.2762312633832975, - "grad_norm": 0.20059126614368925, + "grad_norm": 0.19277682702608626, "learning_rate": 1.3412698412698413e-05, - "loss": 0.2785, + "loss": 0.2766, "step": 1063 }, { "epoch": 2.278372591006424, - "grad_norm": 0.20535076737597538, + "grad_norm": 0.19996995603399093, "learning_rate": 1.3373015873015873e-05, - "loss": 0.2827, + "loss": 0.2811, "step": 1064 }, { "epoch": 2.28051391862955, - "grad_norm": 0.19133006544393558, + "grad_norm": 0.2037157346895961, "learning_rate": 1.3333333333333333e-05, - "loss": 0.2876, + "loss": 0.286, "step": 1065 }, { "epoch": 2.2826552462526766, - "grad_norm": 0.18259775238692044, + "grad_norm": 0.2038456192268336, "learning_rate": 1.3293650793650794e-05, - "loss": 0.2778, + "loss": 0.2756, "step": 1066 }, { "epoch": 2.284796573875803, - "grad_norm": 0.18347983126035391, + "grad_norm": 0.19897665841544188, "learning_rate": 1.3253968253968255e-05, - "loss": 0.2761, + "loss": 0.2726, "step": 1067 }, { "epoch": 2.2869379014989293, - "grad_norm": 0.18777852570980433, + "grad_norm": 0.2001072986623387, "learning_rate": 1.3214285714285716e-05, - "loss": 0.2818, + "loss": 0.2795, "step": 1068 }, { "epoch": 2.2890792291220556, - "grad_norm": 0.18410525589405727, + "grad_norm": 0.19517842971444538, "learning_rate": 1.3174603174603176e-05, - "loss": 0.2656, + "loss": 0.2635, "step": 1069 }, { "epoch": 2.291220556745182, - "grad_norm": 0.19035078229116137, + "grad_norm": 0.2068829830909192, "learning_rate": 1.3134920634920635e-05, - "loss": 0.2816, + "loss": 0.2791, "step": 1070 }, { "epoch": 2.2933618843683083, - "grad_norm": 0.20015240192900935, + "grad_norm": 0.21700568586294433, "learning_rate": 1.3095238095238096e-05, - "loss": 0.3031, + "loss": 0.3014, "step": 1071 }, { "epoch": 2.2955032119914347, - "grad_norm": 0.18184078914671584, + "grad_norm": 0.19043215131446242, "learning_rate": 1.3055555555555557e-05, - "loss": 0.2784, + "loss": 0.2772, "step": 1072 }, { "epoch": 2.297644539614561, - "grad_norm": 0.19404174251026468, + "grad_norm": 0.20431273180658213, "learning_rate": 1.3015873015873018e-05, - "loss": 0.2761, + "loss": 0.2738, "step": 1073 }, { "epoch": 2.2997858672376874, - "grad_norm": 0.18586683492531947, + "grad_norm": 0.1971225707821823, "learning_rate": 1.2976190476190478e-05, - "loss": 0.2781, + "loss": 0.2762, "step": 1074 }, { "epoch": 2.3019271948608138, - "grad_norm": 0.18361259801329616, + "grad_norm": 0.19556074151794256, "learning_rate": 1.2936507936507939e-05, - "loss": 0.2588, + "loss": 0.2572, "step": 1075 }, { "epoch": 2.30406852248394, - "grad_norm": 0.19747669576089893, + "grad_norm": 0.20833957821982024, "learning_rate": 1.2896825396825398e-05, - "loss": 0.2819, + "loss": 0.2794, "step": 1076 }, { "epoch": 2.3062098501070665, - "grad_norm": 0.1962675355280158, + "grad_norm": 0.2025740903256515, "learning_rate": 1.2857142857142857e-05, - "loss": 0.2755, + "loss": 0.2733, "step": 1077 }, { "epoch": 2.308351177730193, - "grad_norm": 0.19942980028292878, + "grad_norm": 0.20532981633640338, "learning_rate": 1.2817460317460316e-05, - "loss": 0.2818, + "loss": 0.2801, "step": 1078 }, { "epoch": 2.310492505353319, - "grad_norm": 0.19501376392056294, + "grad_norm": 0.20302693087098408, "learning_rate": 1.2777777777777777e-05, - "loss": 0.2882, + "loss": 0.2855, "step": 1079 }, { "epoch": 2.3126338329764455, - "grad_norm": 0.18868853481419254, + "grad_norm": 0.19316097914518776, "learning_rate": 1.2738095238095238e-05, - "loss": 0.2762, + "loss": 0.2744, "step": 1080 }, { "epoch": 2.314775160599572, - "grad_norm": 0.18259220767781897, + "grad_norm": 0.1890481893959948, "learning_rate": 1.2698412698412699e-05, - "loss": 0.2862, + "loss": 0.2858, "step": 1081 }, { "epoch": 2.3169164882226982, - "grad_norm": 0.20057460578986203, + "grad_norm": 0.20709172274897492, "learning_rate": 1.265873015873016e-05, - "loss": 0.289, + "loss": 0.287, "step": 1082 }, { "epoch": 2.3190578158458246, - "grad_norm": 0.20457960878239465, + "grad_norm": 0.21179564472663326, "learning_rate": 1.261904761904762e-05, - "loss": 0.2855, + "loss": 0.2837, "step": 1083 }, { "epoch": 2.3211991434689505, - "grad_norm": 0.19996856442993224, + "grad_norm": 0.2076511560969458, "learning_rate": 1.2579365079365079e-05, - "loss": 0.2831, + "loss": 0.2809, "step": 1084 }, { "epoch": 2.3233404710920773, - "grad_norm": 0.19193162471753858, + "grad_norm": 0.2053185208412779, "learning_rate": 1.253968253968254e-05, - "loss": 0.2788, + "loss": 0.2778, "step": 1085 }, { "epoch": 2.325481798715203, - "grad_norm": 0.20129531906043394, + "grad_norm": 0.19958273635866866, "learning_rate": 1.25e-05, - "loss": 0.2653, + "loss": 0.2637, "step": 1086 }, { "epoch": 2.3276231263383296, - "grad_norm": 0.19714902241567792, + "grad_norm": 0.2089718737763175, "learning_rate": 1.2460317460317461e-05, - "loss": 0.2898, + "loss": 0.2871, "step": 1087 }, { "epoch": 2.329764453961456, - "grad_norm": 0.20515273593194386, + "grad_norm": 0.2051818133160847, "learning_rate": 1.2420634920634922e-05, - "loss": 0.2977, + "loss": 0.2973, "step": 1088 }, { "epoch": 2.3319057815845823, - "grad_norm": 0.19580178817727573, + "grad_norm": 0.6114154780985942, "learning_rate": 1.2380952380952381e-05, - "loss": 0.2802, + "loss": 0.2961, "step": 1089 }, { "epoch": 2.3340471092077086, - "grad_norm": 0.17768572531273108, + "grad_norm": 0.1835556937496075, "learning_rate": 1.2341269841269842e-05, - "loss": 0.2703, + "loss": 0.269, "step": 1090 }, { "epoch": 2.336188436830835, - "grad_norm": 0.1956542549620933, + "grad_norm": 0.20213777732489752, "learning_rate": 1.2301587301587301e-05, - "loss": 0.2757, + "loss": 0.2747, "step": 1091 }, { "epoch": 2.3383297644539613, - "grad_norm": 0.20127910379978708, + "grad_norm": 0.21427750205781732, "learning_rate": 1.2261904761904762e-05, - "loss": 0.2837, + "loss": 0.2811, "step": 1092 }, { "epoch": 2.3404710920770877, - "grad_norm": 0.20060227683373613, + "grad_norm": 0.2087757066721972, "learning_rate": 1.2222222222222222e-05, - "loss": 0.2802, + "loss": 0.2787, "step": 1093 }, { "epoch": 2.342612419700214, - "grad_norm": 0.19431031021153464, + "grad_norm": 0.18757584192245175, "learning_rate": 1.2182539682539683e-05, - "loss": 0.291, + "loss": 0.2889, "step": 1094 }, { "epoch": 2.3447537473233404, - "grad_norm": 0.19260339789181197, + "grad_norm": 0.20608037222439926, "learning_rate": 1.2142857142857144e-05, - "loss": 0.2802, + "loss": 0.2789, "step": 1095 }, { "epoch": 2.3468950749464668, - "grad_norm": 0.20821596374744575, + "grad_norm": 0.21935230178468215, "learning_rate": 1.2103174603174603e-05, - "loss": 0.2866, + "loss": 0.285, "step": 1096 }, { "epoch": 2.349036402569593, - "grad_norm": 0.20675441129158606, + "grad_norm": 0.21133802393480924, "learning_rate": 1.2063492063492064e-05, - "loss": 0.2926, + "loss": 0.2899, "step": 1097 }, { "epoch": 2.3511777301927195, - "grad_norm": 0.1968034926711773, + "grad_norm": 0.18791527322276846, "learning_rate": 1.2023809523809525e-05, - "loss": 0.2757, + "loss": 0.2742, "step": 1098 }, { "epoch": 2.353319057815846, - "grad_norm": 0.18488353953473421, + "grad_norm": 0.1971862704976824, "learning_rate": 1.1984126984126985e-05, - "loss": 0.2868, + "loss": 0.2847, "step": 1099 }, { "epoch": 2.355460385438972, - "grad_norm": 0.20338110721794142, + "grad_norm": 0.21189728780479716, "learning_rate": 1.1944444444444446e-05, - "loss": 0.2882, + "loss": 0.2872, "step": 1100 }, { "epoch": 2.3576017130620985, - "grad_norm": 0.19166668181356114, + "grad_norm": 0.19227294980450937, "learning_rate": 1.1904761904761905e-05, - "loss": 0.292, + "loss": 0.2907, "step": 1101 }, { "epoch": 2.359743040685225, - "grad_norm": 0.1927455443301824, + "grad_norm": 0.20548532335329017, "learning_rate": 1.1865079365079366e-05, - "loss": 0.2649, + "loss": 0.2643, "step": 1102 }, { "epoch": 2.3618843683083512, - "grad_norm": 0.17477212497660993, + "grad_norm": 0.18375693623718486, "learning_rate": 1.1825396825396825e-05, - "loss": 0.2752, + "loss": 0.2735, "step": 1103 }, { "epoch": 2.3640256959314776, - "grad_norm": 0.19987372705216191, + "grad_norm": 0.21294408502746626, "learning_rate": 1.1785714285714286e-05, - "loss": 0.2819, + "loss": 0.281, "step": 1104 }, { "epoch": 2.366167023554604, - "grad_norm": 0.1922206507729967, + "grad_norm": 0.19727873974409793, "learning_rate": 1.1746031746031746e-05, - "loss": 0.2823, + "loss": 0.281, "step": 1105 }, { "epoch": 2.3683083511777303, - "grad_norm": 0.18451313459925092, + "grad_norm": 0.18895600615844696, "learning_rate": 1.1706349206349207e-05, - "loss": 0.2858, + "loss": 0.2852, "step": 1106 }, { "epoch": 2.3704496788008567, - "grad_norm": 0.194347680132915, + "grad_norm": 0.19703744794029157, "learning_rate": 1.1666666666666668e-05, - "loss": 0.2885, + "loss": 0.2856, "step": 1107 }, { "epoch": 2.372591006423983, - "grad_norm": 0.19502504685509406, + "grad_norm": 0.20958076853632984, "learning_rate": 1.1626984126984129e-05, - "loss": 0.2818, + "loss": 0.279, "step": 1108 }, { "epoch": 2.3747323340471094, - "grad_norm": 0.18898431759295656, + "grad_norm": 0.19332254165250848, "learning_rate": 1.1587301587301588e-05, - "loss": 0.2904, + "loss": 0.2894, "step": 1109 }, { "epoch": 2.3768736616702357, - "grad_norm": 0.19153164662259894, + "grad_norm": 0.19661719515205164, "learning_rate": 1.1547619047619048e-05, - "loss": 0.2726, + "loss": 0.2702, "step": 1110 }, { "epoch": 2.3790149892933616, - "grad_norm": 0.18268604277337688, + "grad_norm": 0.1907420302248947, "learning_rate": 1.1507936507936508e-05, - "loss": 0.279, + "loss": 0.2771, "step": 1111 }, { "epoch": 2.3811563169164884, - "grad_norm": 0.17533082071419925, + "grad_norm": 0.18023957300416488, "learning_rate": 1.1468253968253968e-05, - "loss": 0.2732, + "loss": 0.2717, "step": 1112 }, { "epoch": 2.3832976445396143, - "grad_norm": 0.2003790102740343, + "grad_norm": 0.2072993037766396, "learning_rate": 1.1428571428571429e-05, - "loss": 0.2951, + "loss": 0.2933, "step": 1113 }, { "epoch": 2.385438972162741, - "grad_norm": 0.18705857015717448, + "grad_norm": 0.1904556433015721, "learning_rate": 1.138888888888889e-05, - "loss": 0.2793, + "loss": 0.2777, "step": 1114 }, { "epoch": 2.387580299785867, - "grad_norm": 0.1868482525264962, + "grad_norm": 0.190065817649398, "learning_rate": 1.1349206349206349e-05, - "loss": 0.2747, + "loss": 0.2737, "step": 1115 }, { "epoch": 2.3897216274089934, - "grad_norm": 0.18021169330552053, + "grad_norm": 0.18734473596445086, "learning_rate": 1.130952380952381e-05, - "loss": 0.2812, + "loss": 0.2798, "step": 1116 }, { "epoch": 2.3918629550321198, - "grad_norm": 0.19301415468138103, + "grad_norm": 0.20310161931125695, "learning_rate": 1.126984126984127e-05, - "loss": 0.2886, + "loss": 0.2862, "step": 1117 }, { "epoch": 2.394004282655246, - "grad_norm": 0.19072636775125612, + "grad_norm": 0.19855963578116342, "learning_rate": 1.1230158730158731e-05, - "loss": 0.2806, + "loss": 0.2784, "step": 1118 }, { "epoch": 2.3961456102783725, - "grad_norm": 0.1840802920208615, + "grad_norm": 0.189077523042222, "learning_rate": 1.1190476190476192e-05, - "loss": 0.2879, + "loss": 0.286, "step": 1119 }, { "epoch": 2.398286937901499, - "grad_norm": 0.18258129644157975, + "grad_norm": 0.18894803290615364, "learning_rate": 1.1150793650793653e-05, - "loss": 0.2803, + "loss": 0.2779, "step": 1120 }, { "epoch": 2.400428265524625, - "grad_norm": 0.1827848542873828, + "grad_norm": 0.2015116046355913, "learning_rate": 1.1111111111111112e-05, - "loss": 0.2881, + "loss": 0.2865, "step": 1121 }, { "epoch": 2.4025695931477515, - "grad_norm": 0.18900064907660674, + "grad_norm": 0.19437606708029698, "learning_rate": 1.107142857142857e-05, - "loss": 0.2938, + "loss": 0.2933, "step": 1122 }, { "epoch": 2.404710920770878, - "grad_norm": 0.18804850484657887, + "grad_norm": 0.18852414654829355, "learning_rate": 1.1031746031746031e-05, - "loss": 0.2858, + "loss": 0.284, "step": 1123 }, { "epoch": 2.4068522483940042, - "grad_norm": 0.2006000850638827, + "grad_norm": 0.1879656318380588, "learning_rate": 1.0992063492063492e-05, - "loss": 0.2876, + "loss": 0.2874, "step": 1124 }, { "epoch": 2.4089935760171306, - "grad_norm": 0.18714684164420253, + "grad_norm": 0.19791655089067275, "learning_rate": 1.0952380952380953e-05, - "loss": 0.2803, + "loss": 0.2786, "step": 1125 }, { "epoch": 2.411134903640257, - "grad_norm": 0.19792928506355778, + "grad_norm": 0.20226850687509584, "learning_rate": 1.0912698412698414e-05, - "loss": 0.2936, + "loss": 0.2922, "step": 1126 }, { "epoch": 2.4132762312633833, - "grad_norm": 0.18865247304598853, + "grad_norm": 0.18552400783172202, "learning_rate": 1.0873015873015874e-05, - "loss": 0.2778, + "loss": 0.2757, "step": 1127 }, { "epoch": 2.4154175588865097, - "grad_norm": 0.1929275031551357, + "grad_norm": 0.1837825345997371, "learning_rate": 1.0833333333333334e-05, - "loss": 0.2777, + "loss": 0.2764, "step": 1128 }, { "epoch": 2.417558886509636, - "grad_norm": 0.1806973386093782, + "grad_norm": 0.19528894541437605, "learning_rate": 1.0793650793650794e-05, - "loss": 0.2763, + "loss": 0.2742, "step": 1129 }, { "epoch": 2.4197002141327624, - "grad_norm": 0.19706536820552648, + "grad_norm": 0.21464260349692135, "learning_rate": 1.0753968253968255e-05, - "loss": 0.2883, + "loss": 0.2873, "step": 1130 }, { "epoch": 2.4218415417558887, - "grad_norm": 0.1910372018652057, + "grad_norm": 0.1945459121129588, "learning_rate": 1.0714285714285714e-05, - "loss": 0.2837, + "loss": 0.2821, "step": 1131 }, { "epoch": 2.423982869379015, - "grad_norm": 0.1788916292232767, + "grad_norm": 0.18478419433751667, "learning_rate": 1.0674603174603175e-05, - "loss": 0.2841, + "loss": 0.2826, "step": 1132 }, { "epoch": 2.4261241970021414, - "grad_norm": 0.18659149187977309, + "grad_norm": 0.20225910020937835, "learning_rate": 1.0634920634920636e-05, - "loss": 0.2894, + "loss": 0.2873, "step": 1133 }, { "epoch": 2.428265524625268, - "grad_norm": 0.1925123938840746, + "grad_norm": 0.20292439805906615, "learning_rate": 1.0595238095238096e-05, - "loss": 0.2838, + "loss": 0.282, "step": 1134 }, { "epoch": 2.430406852248394, - "grad_norm": 0.1776215701785684, + "grad_norm": 0.18473437227064216, "learning_rate": 1.0555555555555555e-05, - "loss": 0.2707, + "loss": 0.269, "step": 1135 }, { "epoch": 2.4325481798715205, - "grad_norm": 0.20843365977457115, + "grad_norm": 0.211980884226004, "learning_rate": 1.0515873015873016e-05, - "loss": 0.2892, + "loss": 0.2882, "step": 1136 }, { "epoch": 2.434689507494647, - "grad_norm": 0.19764138815197937, + "grad_norm": 0.22535605642238996, "learning_rate": 1.0476190476190477e-05, - "loss": 0.2648, + "loss": 0.265, "step": 1137 }, { "epoch": 2.436830835117773, - "grad_norm": 0.19252613410306155, + "grad_norm": 0.1991542202837908, "learning_rate": 1.0436507936507938e-05, - "loss": 0.2911, + "loss": 0.2887, "step": 1138 }, { "epoch": 2.4389721627408996, - "grad_norm": 0.20085600997800981, + "grad_norm": 0.2072336431629644, "learning_rate": 1.0396825396825398e-05, - "loss": 0.2826, + "loss": 0.2805, "step": 1139 }, { "epoch": 2.4411134903640255, - "grad_norm": 0.188049018268479, + "grad_norm": 0.20543914654797396, "learning_rate": 1.0357142857142859e-05, - "loss": 0.2911, + "loss": 0.2905, "step": 1140 }, { "epoch": 2.4432548179871523, - "grad_norm": 0.19493824847548077, + "grad_norm": 0.20157112476161199, "learning_rate": 1.0317460317460318e-05, - "loss": 0.286, + "loss": 0.2843, "step": 1141 }, { "epoch": 2.445396145610278, - "grad_norm": 0.191504777727128, + "grad_norm": 0.1916041907370502, "learning_rate": 1.0277777777777777e-05, - "loss": 0.2933, + "loss": 0.2916, "step": 1142 }, { "epoch": 2.4475374732334045, - "grad_norm": 0.19108814910860578, + "grad_norm": 0.19261294432915077, "learning_rate": 1.0238095238095238e-05, - "loss": 0.2785, + "loss": 0.2766, "step": 1143 }, { "epoch": 2.449678800856531, - "grad_norm": 0.20799246077044664, + "grad_norm": 0.21519445528802897, "learning_rate": 1.0198412698412699e-05, - "loss": 0.2837, + "loss": 0.2819, "step": 1144 }, { "epoch": 2.4518201284796572, - "grad_norm": 0.1824764184302531, + "grad_norm": 0.19022925356343928, "learning_rate": 1.015873015873016e-05, - "loss": 0.2652, + "loss": 0.2633, "step": 1145 }, { "epoch": 2.4539614561027836, - "grad_norm": 0.1824775079826765, + "grad_norm": 0.18842920918262623, "learning_rate": 1.011904761904762e-05, - "loss": 0.2716, + "loss": 0.2701, "step": 1146 }, { "epoch": 2.45610278372591, - "grad_norm": 0.1932203896888414, + "grad_norm": 0.18644571326970072, "learning_rate": 1.007936507936508e-05, - "loss": 0.276, + "loss": 0.2747, "step": 1147 }, { "epoch": 2.4582441113490363, - "grad_norm": 0.2007871128283347, + "grad_norm": 0.20125355865381975, "learning_rate": 1.003968253968254e-05, - "loss": 0.264, + "loss": 0.2627, "step": 1148 }, { "epoch": 2.4603854389721627, - "grad_norm": 0.21083133706351928, + "grad_norm": 0.21999175240537394, "learning_rate": 1e-05, - "loss": 0.2816, + "loss": 0.2805, "step": 1149 }, { "epoch": 2.462526766595289, - "grad_norm": 0.19278202499501207, + "grad_norm": 0.19787370820057412, "learning_rate": 9.96031746031746e-06, - "loss": 0.2823, + "loss": 0.2811, "step": 1150 }, { "epoch": 2.4646680942184154, - "grad_norm": 0.18797686590775003, + "grad_norm": 0.18955007917593744, "learning_rate": 9.92063492063492e-06, - "loss": 0.2879, + "loss": 0.286, "step": 1151 }, { "epoch": 2.4668094218415417, - "grad_norm": 0.20454699746081098, + "grad_norm": 0.20431736362786296, "learning_rate": 9.880952380952381e-06, - "loss": 0.2672, + "loss": 0.2657, "step": 1152 }, { "epoch": 2.468950749464668, - "grad_norm": 0.18367484130511327, + "grad_norm": 0.19320732235445437, "learning_rate": 9.841269841269842e-06, - "loss": 0.2806, + "loss": 0.2791, "step": 1153 }, { "epoch": 2.4710920770877944, - "grad_norm": 0.1831686319686948, + "grad_norm": 0.19183959802084546, "learning_rate": 9.801587301587301e-06, - "loss": 0.273, + "loss": 0.2702, "step": 1154 }, { "epoch": 2.473233404710921, - "grad_norm": 0.19694402234544842, + "grad_norm": 0.19805919853745266, "learning_rate": 9.761904761904762e-06, - "loss": 0.2878, + "loss": 0.2858, "step": 1155 }, { "epoch": 2.475374732334047, - "grad_norm": 0.19408589237490334, + "grad_norm": 0.1907652749236508, "learning_rate": 9.722222222222223e-06, - "loss": 0.2943, + "loss": 0.294, "step": 1156 }, { "epoch": 2.4775160599571735, - "grad_norm": 0.19445588500969652, + "grad_norm": 0.19770176710293932, "learning_rate": 9.682539682539683e-06, - "loss": 0.2784, + "loss": 0.2759, "step": 1157 }, { "epoch": 2.4796573875803, - "grad_norm": 0.17963888726600663, + "grad_norm": 0.19222146084786232, "learning_rate": 9.642857142857144e-06, - "loss": 0.2752, + "loss": 0.2732, "step": 1158 }, { "epoch": 2.481798715203426, - "grad_norm": 0.19347132870813594, + "grad_norm": 0.20359350613339589, "learning_rate": 9.603174603174605e-06, - "loss": 0.2867, + "loss": 0.2858, "step": 1159 }, { "epoch": 2.4839400428265526, - "grad_norm": 0.18730325675923679, + "grad_norm": 0.19032755422213457, "learning_rate": 9.563492063492064e-06, - "loss": 0.2762, + "loss": 0.2742, "step": 1160 }, { "epoch": 2.486081370449679, - "grad_norm": 0.18460049863260458, + "grad_norm": 0.19335063670418315, "learning_rate": 9.523809523809523e-06, - "loss": 0.2872, + "loss": 0.2859, "step": 1161 }, { "epoch": 2.4882226980728053, - "grad_norm": 0.1907698896483033, + "grad_norm": 0.19897996740846366, "learning_rate": 9.484126984126984e-06, - "loss": 0.287, + "loss": 0.2857, "step": 1162 }, { "epoch": 2.4903640256959316, - "grad_norm": 0.18810661893439526, + "grad_norm": 0.1941351806336689, "learning_rate": 9.444444444444445e-06, - "loss": 0.2797, + "loss": 0.2784, "step": 1163 }, { "epoch": 2.492505353319058, - "grad_norm": 0.19192431651136277, + "grad_norm": 0.19765019351850296, "learning_rate": 9.404761904761905e-06, - "loss": 0.2779, + "loss": 0.2761, "step": 1164 }, { "epoch": 2.4946466809421843, - "grad_norm": 0.1791506541350854, + "grad_norm": 0.18516337216937107, "learning_rate": 9.365079365079366e-06, - "loss": 0.278, + "loss": 0.2764, "step": 1165 }, { "epoch": 2.4967880085653107, - "grad_norm": 0.18069237791730375, + "grad_norm": 0.18907608503624435, "learning_rate": 9.325396825396827e-06, - "loss": 0.2792, + "loss": 0.277, "step": 1166 }, { "epoch": 2.4989293361884366, - "grad_norm": 0.18412348076309296, + "grad_norm": 0.1941807441287669, "learning_rate": 9.285714285714286e-06, - "loss": 0.2771, + "loss": 0.275, "step": 1167 }, { "epoch": 2.5010706638115634, - "grad_norm": 0.1813880522758016, + "grad_norm": 0.18811473358774278, "learning_rate": 9.246031746031747e-06, - "loss": 0.2835, + "loss": 0.2825, "step": 1168 }, { "epoch": 2.5032119914346893, - "grad_norm": 0.19273095158091466, + "grad_norm": 0.19866610507866173, "learning_rate": 9.206349206349207e-06, - "loss": 0.2923, + "loss": 0.2903, "step": 1169 }, { "epoch": 2.505353319057816, - "grad_norm": 0.18337449464589062, + "grad_norm": 0.18733861692093362, "learning_rate": 9.166666666666666e-06, - "loss": 0.2884, + "loss": 0.2873, "step": 1170 }, { "epoch": 2.507494646680942, - "grad_norm": 0.19024259739381238, + "grad_norm": 0.20268747557365815, "learning_rate": 9.126984126984127e-06, - "loss": 0.2956, + "loss": 0.2948, "step": 1171 }, { "epoch": 2.5096359743040684, - "grad_norm": 0.19901572512340374, + "grad_norm": 0.1996977559423424, "learning_rate": 9.087301587301588e-06, - "loss": 0.2737, + "loss": 0.2713, "step": 1172 }, { "epoch": 2.5117773019271947, - "grad_norm": 0.194998468534562, + "grad_norm": 0.1996076333163632, "learning_rate": 9.047619047619047e-06, - "loss": 0.28, + "loss": 0.2779, "step": 1173 }, { "epoch": 2.513918629550321, - "grad_norm": 0.18412933977392476, + "grad_norm": 0.19011932221335692, "learning_rate": 9.007936507936508e-06, - "loss": 0.2819, + "loss": 0.2803, "step": 1174 }, { "epoch": 2.5160599571734474, - "grad_norm": 0.18374938776839936, + "grad_norm": 0.18599656098112288, "learning_rate": 8.968253968253968e-06, - "loss": 0.2813, + "loss": 0.2791, "step": 1175 }, { "epoch": 2.518201284796574, - "grad_norm": 0.18751207889857766, + "grad_norm": 0.19678433899725203, "learning_rate": 8.92857142857143e-06, - "loss": 0.2831, + "loss": 0.2822, "step": 1176 }, { "epoch": 2.5203426124197, - "grad_norm": 0.17923076174542152, + "grad_norm": 0.1846108520466212, "learning_rate": 8.88888888888889e-06, - "loss": 0.2682, + "loss": 0.266, "step": 1177 }, { "epoch": 2.5224839400428265, - "grad_norm": 0.18035308954107676, + "grad_norm": 0.18960292191925746, "learning_rate": 8.84920634920635e-06, - "loss": 0.2677, + "loss": 0.2662, "step": 1178 }, { "epoch": 2.524625267665953, - "grad_norm": 0.22973038098447218, + "grad_norm": 0.2200193235205345, "learning_rate": 8.80952380952381e-06, - "loss": 0.2874, + "loss": 0.2854, "step": 1179 }, { "epoch": 2.526766595289079, - "grad_norm": 0.18325732719083584, + "grad_norm": 0.18902634217465988, "learning_rate": 8.769841269841269e-06, - "loss": 0.271, + "loss": 0.2687, "step": 1180 }, { "epoch": 2.5289079229122056, - "grad_norm": 0.1838482554089651, + "grad_norm": 0.19098504833889798, "learning_rate": 8.73015873015873e-06, - "loss": 0.2743, + "loss": 0.2735, "step": 1181 }, { "epoch": 2.531049250535332, - "grad_norm": 0.18469422748009917, + "grad_norm": 0.19175998158414528, "learning_rate": 8.69047619047619e-06, - "loss": 0.2809, + "loss": 0.2785, "step": 1182 }, { "epoch": 2.5331905781584583, - "grad_norm": 0.19824918591422117, + "grad_norm": 0.20056232550718314, "learning_rate": 8.650793650793651e-06, - "loss": 0.2956, + "loss": 0.2941, "step": 1183 }, { "epoch": 2.5353319057815846, - "grad_norm": 0.18477009428383825, + "grad_norm": 0.19066315286775598, "learning_rate": 8.611111111111112e-06, - "loss": 0.2746, + "loss": 0.2731, "step": 1184 }, { "epoch": 2.537473233404711, - "grad_norm": 0.18367643368070655, + "grad_norm": 0.18331446647935762, "learning_rate": 8.571428571428573e-06, - "loss": 0.2779, + "loss": 0.2758, "step": 1185 }, { "epoch": 2.5396145610278373, - "grad_norm": 0.18552020064619668, + "grad_norm": 0.184471748142892, "learning_rate": 8.531746031746032e-06, - "loss": 0.2726, + "loss": 0.2703, "step": 1186 }, { "epoch": 2.5417558886509637, - "grad_norm": 0.18823754582760785, + "grad_norm": 0.18934407240554624, "learning_rate": 8.492063492063492e-06, - "loss": 0.2936, + "loss": 0.2917, "step": 1187 }, { "epoch": 2.54389721627409, - "grad_norm": 0.1838357043381974, + "grad_norm": 0.1839570377909689, "learning_rate": 8.452380952380953e-06, - "loss": 0.2754, + "loss": 0.2733, "step": 1188 }, { "epoch": 2.5460385438972164, - "grad_norm": 0.1930931282449352, + "grad_norm": 0.19505246832240355, "learning_rate": 8.412698412698414e-06, - "loss": 0.2848, + "loss": 0.2821, "step": 1189 }, { "epoch": 2.5481798715203428, - "grad_norm": 0.18519908919348946, + "grad_norm": 0.1868170944308521, "learning_rate": 8.373015873015873e-06, - "loss": 0.2776, + "loss": 0.2756, "step": 1190 }, { "epoch": 2.550321199143469, - "grad_norm": 0.18754273175059186, + "grad_norm": 0.1863338672371585, "learning_rate": 8.333333333333334e-06, - "loss": 0.2799, + "loss": 0.2779, "step": 1191 }, { "epoch": 2.552462526766595, - "grad_norm": 0.18545398781411668, + "grad_norm": 0.18694030749510335, "learning_rate": 8.293650793650794e-06, - "loss": 0.2651, + "loss": 0.2623, "step": 1192 }, { "epoch": 2.554603854389722, - "grad_norm": 0.1847349009972924, + "grad_norm": 0.18346739599692435, "learning_rate": 8.253968253968254e-06, - "loss": 0.2714, + "loss": 0.2692, "step": 1193 }, { "epoch": 2.5567451820128477, - "grad_norm": 0.19268643145654266, + "grad_norm": 0.193938387145013, "learning_rate": 8.214285714285714e-06, - "loss": 0.279, + "loss": 0.2762, "step": 1194 }, { "epoch": 2.5588865096359745, - "grad_norm": 0.19778277420764503, + "grad_norm": 0.1968770084022562, "learning_rate": 8.174603174603175e-06, - "loss": 0.2778, + "loss": 0.2749, "step": 1195 }, { "epoch": 2.5610278372591004, - "grad_norm": 0.1998660395641265, + "grad_norm": 0.2012872958431149, "learning_rate": 8.134920634920636e-06, - "loss": 0.2864, + "loss": 0.2841, "step": 1196 }, { "epoch": 2.5631691648822272, - "grad_norm": 0.19294158536685677, + "grad_norm": 0.19648015971972235, "learning_rate": 8.095238095238097e-06, - "loss": 0.2696, + "loss": 0.2677, "step": 1197 }, { "epoch": 2.565310492505353, - "grad_norm": 0.18786548806483286, + "grad_norm": 0.18810436672950626, "learning_rate": 8.055555555555557e-06, - "loss": 0.2692, + "loss": 0.2668, "step": 1198 }, { "epoch": 2.5674518201284795, - "grad_norm": 0.19390591817924888, + "grad_norm": 0.19172953088721956, "learning_rate": 8.015873015873016e-06, - "loss": 0.278, + "loss": 0.2754, "step": 1199 }, { "epoch": 2.569593147751606, - "grad_norm": 0.18907409735630998, + "grad_norm": 0.2022573936763731, "learning_rate": 7.976190476190475e-06, - "loss": 0.2885, + "loss": 0.2878, "step": 1200 }, { "epoch": 2.571734475374732, - "grad_norm": 0.20258943279374564, + "grad_norm": 0.20158630502809705, "learning_rate": 7.936507936507936e-06, - "loss": 0.303, + "loss": 0.3009, "step": 1201 }, { "epoch": 2.5738758029978586, - "grad_norm": 0.19454350482759478, + "grad_norm": 0.19444900388460162, "learning_rate": 7.896825396825397e-06, - "loss": 0.2739, + "loss": 0.2723, "step": 1202 }, { "epoch": 2.576017130620985, - "grad_norm": 0.18958864960792593, + "grad_norm": 0.19393996035159938, "learning_rate": 7.857142857142858e-06, - "loss": 0.2804, + "loss": 0.2784, "step": 1203 }, { "epoch": 2.5781584582441113, - "grad_norm": 0.1982183376800514, + "grad_norm": 0.20388456521131612, "learning_rate": 7.817460317460318e-06, - "loss": 0.2991, + "loss": 0.2972, "step": 1204 }, { "epoch": 2.5802997858672376, - "grad_norm": 0.1913275931410259, + "grad_norm": 0.19046236974876576, "learning_rate": 7.777777777777777e-06, - "loss": 0.2888, + "loss": 0.2866, "step": 1205 }, { "epoch": 2.582441113490364, - "grad_norm": 0.21019225544474518, + "grad_norm": 0.21198070199178753, "learning_rate": 7.738095238095238e-06, - "loss": 0.2887, + "loss": 0.2874, "step": 1206 }, { "epoch": 2.5845824411134903, - "grad_norm": 0.2042220303342691, + "grad_norm": 0.20591590293289885, "learning_rate": 7.698412698412699e-06, - "loss": 0.3031, + "loss": 0.3017, "step": 1207 }, { "epoch": 2.5867237687366167, - "grad_norm": 0.19819365750871445, + "grad_norm": 0.20766349326464448, "learning_rate": 7.65873015873016e-06, - "loss": 0.2896, + "loss": 0.2874, "step": 1208 }, { "epoch": 2.588865096359743, - "grad_norm": 0.18837665341574075, + "grad_norm": 0.18365533590321947, "learning_rate": 7.6190476190476205e-06, - "loss": 0.2775, + "loss": 0.2761, "step": 1209 }, { "epoch": 2.5910064239828694, - "grad_norm": 0.18886359577025771, + "grad_norm": 0.19032688283851512, "learning_rate": 7.5793650793650795e-06, - "loss": 0.2718, + "loss": 0.2708, "step": 1210 }, { "epoch": 2.5931477516059958, - "grad_norm": 0.19164944758078695, + "grad_norm": 0.1980622035151324, "learning_rate": 7.5396825396825394e-06, - "loss": 0.2705, + "loss": 0.269, "step": 1211 }, { "epoch": 2.595289079229122, - "grad_norm": 0.1946688680184309, + "grad_norm": 0.1983391081538549, "learning_rate": 7.5e-06, - "loss": 0.2817, + "loss": 0.2796, "step": 1212 }, { "epoch": 2.5974304068522485, - "grad_norm": 0.17995484096025638, + "grad_norm": 0.18057379441303084, "learning_rate": 7.460317460317461e-06, - "loss": 0.2763, + "loss": 0.2746, "step": 1213 }, { "epoch": 2.599571734475375, - "grad_norm": 0.18827882235831994, + "grad_norm": 0.19068010050148992, "learning_rate": 7.420634920634921e-06, - "loss": 0.2649, + "loss": 0.263, "step": 1214 }, { "epoch": 2.601713062098501, - "grad_norm": 0.1911172359855126, + "grad_norm": 0.18991511897664065, "learning_rate": 7.380952380952382e-06, - "loss": 0.2785, + "loss": 0.2773, "step": 1215 }, { "epoch": 2.6038543897216275, - "grad_norm": 0.20732711252193384, + "grad_norm": 0.19991447304073107, "learning_rate": 7.3412698412698415e-06, - "loss": 0.2956, + "loss": 0.2946, "step": 1216 }, { "epoch": 2.605995717344754, - "grad_norm": 0.19615616622963164, + "grad_norm": 0.20077693323441242, "learning_rate": 7.301587301587302e-06, - "loss": 0.2961, + "loss": 0.2952, "step": 1217 }, { "epoch": 2.6081370449678802, - "grad_norm": 0.18728761641057254, + "grad_norm": 0.1910164898967557, "learning_rate": 7.261904761904763e-06, - "loss": 0.2884, + "loss": 0.2871, "step": 1218 }, { "epoch": 2.6102783725910066, - "grad_norm": 0.18873348095093828, + "grad_norm": 0.19760453195164532, "learning_rate": 7.222222222222222e-06, - "loss": 0.2788, + "loss": 0.276, "step": 1219 }, { "epoch": 2.612419700214133, - "grad_norm": 0.1895538111432152, + "grad_norm": 0.19107969100185188, "learning_rate": 7.182539682539682e-06, - "loss": 0.2874, + "loss": 0.2857, "step": 1220 }, { "epoch": 2.614561027837259, - "grad_norm": 0.17912222034312614, + "grad_norm": 0.18522026162274652, "learning_rate": 7.142857142857143e-06, - "loss": 0.2712, + "loss": 0.269, "step": 1221 }, { "epoch": 2.6167023554603857, - "grad_norm": 0.18895587187642374, + "grad_norm": 0.1940614696572163, "learning_rate": 7.1031746031746035e-06, - "loss": 0.2777, + "loss": 0.2755, "step": 1222 }, { "epoch": 2.6188436830835116, - "grad_norm": 0.18078738265388186, + "grad_norm": 0.19575778265780672, "learning_rate": 7.063492063492063e-06, - "loss": 0.282, + "loss": 0.2807, "step": 1223 }, { "epoch": 2.6209850107066384, - "grad_norm": 0.1761591551782532, + "grad_norm": 0.18786133539258287, "learning_rate": 7.023809523809524e-06, - "loss": 0.2746, + "loss": 0.2726, "step": 1224 }, { "epoch": 2.6231263383297643, - "grad_norm": 0.18492923876395373, + "grad_norm": 0.18567364878354917, "learning_rate": 6.984126984126985e-06, - "loss": 0.2758, + "loss": 0.2737, "step": 1225 }, { "epoch": 2.6252676659528906, - "grad_norm": 0.18920352772469154, + "grad_norm": 0.18207760252968563, "learning_rate": 6.944444444444445e-06, - "loss": 0.2711, + "loss": 0.2692, "step": 1226 }, { "epoch": 2.627408993576017, - "grad_norm": 0.18550091010732825, + "grad_norm": 0.190042944624669, "learning_rate": 6.9047619047619055e-06, - "loss": 0.2697, + "loss": 0.2677, "step": 1227 }, { "epoch": 2.6295503211991433, - "grad_norm": 0.1785541730706682, + "grad_norm": 0.18201431307145552, "learning_rate": 6.865079365079366e-06, - "loss": 0.2763, + "loss": 0.2741, "step": 1228 }, { "epoch": 2.6316916488222697, - "grad_norm": 0.17857492350218257, + "grad_norm": 0.1865727217496232, "learning_rate": 6.825396825396825e-06, - "loss": 0.2691, + "loss": 0.2671, "step": 1229 }, { "epoch": 2.633832976445396, - "grad_norm": 0.1753222130027031, + "grad_norm": 0.1801997882324314, "learning_rate": 6.785714285714285e-06, - "loss": 0.2717, + "loss": 0.2699, "step": 1230 }, { "epoch": 2.6359743040685224, - "grad_norm": 0.19278231524559503, + "grad_norm": 0.19389188154337386, "learning_rate": 6.746031746031746e-06, - "loss": 0.2784, + "loss": 0.2761, "step": 1231 }, { "epoch": 2.6381156316916488, - "grad_norm": 0.20070264553264905, + "grad_norm": 0.1876099016700767, "learning_rate": 6.706349206349207e-06, - "loss": 0.2987, + "loss": 0.2974, "step": 1232 }, { "epoch": 2.640256959314775, - "grad_norm": 0.2010595496902971, + "grad_norm": 0.19121391315236033, "learning_rate": 6.666666666666667e-06, - "loss": 0.3015, + "loss": 0.2988, "step": 1233 }, { "epoch": 2.6423982869379015, - "grad_norm": 0.18108631640949424, + "grad_norm": 0.19053822839952972, "learning_rate": 6.626984126984127e-06, - "loss": 0.2814, + "loss": 0.2797, "step": 1234 }, { "epoch": 2.644539614561028, - "grad_norm": 0.18555134865906892, + "grad_norm": 0.18505965289729664, "learning_rate": 6.587301587301588e-06, - "loss": 0.2677, + "loss": 0.2666, "step": 1235 }, { "epoch": 2.646680942184154, - "grad_norm": 0.18862946658278595, + "grad_norm": 0.1893938662916577, "learning_rate": 6.547619047619048e-06, - "loss": 0.2738, + "loss": 0.2719, "step": 1236 }, { "epoch": 2.6488222698072805, - "grad_norm": 0.1859370117993973, + "grad_norm": 0.1938288347509404, "learning_rate": 6.507936507936509e-06, - "loss": 0.2845, + "loss": 0.2823, "step": 1237 }, { "epoch": 2.650963597430407, - "grad_norm": 0.18716469379849815, + "grad_norm": 0.19032293804902783, "learning_rate": 6.4682539682539696e-06, - "loss": 0.2716, + "loss": 0.2687, "step": 1238 }, { "epoch": 2.6531049250535332, - "grad_norm": 0.18552514906798132, + "grad_norm": 0.1799378115721631, "learning_rate": 6.428571428571429e-06, - "loss": 0.2742, + "loss": 0.2708, "step": 1239 }, { "epoch": 2.6552462526766596, - "grad_norm": 0.19953825231892838, + "grad_norm": 0.19301414441267256, "learning_rate": 6.3888888888888885e-06, - "loss": 0.2964, + "loss": 0.2952, "step": 1240 }, { "epoch": 2.657387580299786, - "grad_norm": 0.18682163268591973, + "grad_norm": 0.1849662307056679, "learning_rate": 6.349206349206349e-06, - "loss": 0.2692, + "loss": 0.2668, "step": 1241 }, { "epoch": 2.6595289079229123, - "grad_norm": 0.1794385339174016, + "grad_norm": 0.18055776145465033, "learning_rate": 6.30952380952381e-06, - "loss": 0.2703, + "loss": 0.2688, "step": 1242 }, { "epoch": 2.6616702355460387, - "grad_norm": 0.1887882853913073, + "grad_norm": 0.1886120175112293, "learning_rate": 6.26984126984127e-06, - "loss": 0.2874, + "loss": 0.2858, "step": 1243 }, { "epoch": 2.663811563169165, - "grad_norm": 0.1872385299129515, + "grad_norm": 0.18653293808215643, "learning_rate": 6.230158730158731e-06, - "loss": 0.2863, + "loss": 0.2848, "step": 1244 }, { "epoch": 2.6659528907922914, - "grad_norm": 0.1819046482781277, + "grad_norm": 0.1843692042836253, "learning_rate": 6.190476190476191e-06, - "loss": 0.2705, + "loss": 0.2688, "step": 1245 }, { "epoch": 2.6680942184154177, - "grad_norm": 0.19114242870168724, + "grad_norm": 0.19256050930680374, "learning_rate": 6.1507936507936505e-06, - "loss": 0.2782, + "loss": 0.276, "step": 1246 }, { "epoch": 2.670235546038544, - "grad_norm": 0.19244904517702394, + "grad_norm": 0.1854008621346679, "learning_rate": 6.111111111111111e-06, - "loss": 0.2683, + "loss": 0.265, "step": 1247 }, { "epoch": 2.67237687366167, - "grad_norm": 0.18118655863654376, + "grad_norm": 0.19104484469553526, "learning_rate": 6.071428571428572e-06, - "loss": 0.2776, + "loss": 0.2758, "step": 1248 }, { "epoch": 2.674518201284797, - "grad_norm": 0.1895477899880136, + "grad_norm": 0.18892651998233595, "learning_rate": 6.031746031746032e-06, - "loss": 0.2955, + "loss": 0.2929, "step": 1249 }, { "epoch": 2.6766595289079227, - "grad_norm": 0.2037032863019117, + "grad_norm": 0.21181690679734574, "learning_rate": 5.992063492063493e-06, - "loss": 0.278, + "loss": 0.2766, "step": 1250 }, { "epoch": 2.6788008565310495, - "grad_norm": 0.215090909915821, + "grad_norm": 0.2257805250104584, "learning_rate": 5.9523809523809525e-06, - "loss": 0.2779, + "loss": 0.2767, "step": 1251 }, { "epoch": 2.6809421841541754, - "grad_norm": 0.1806317361792395, + "grad_norm": 0.18618064006808122, "learning_rate": 5.9126984126984124e-06, - "loss": 0.279, + "loss": 0.2768, "step": 1252 }, { "epoch": 2.683083511777302, - "grad_norm": 0.17940601877117468, + "grad_norm": 0.17853160960610476, "learning_rate": 5.873015873015873e-06, - "loss": 0.2834, + "loss": 0.2819, "step": 1253 }, { "epoch": 2.685224839400428, - "grad_norm": 0.181650657819111, + "grad_norm": 0.1853256859343023, "learning_rate": 5.833333333333334e-06, - "loss": 0.2867, + "loss": 0.285, "step": 1254 }, { "epoch": 2.6873661670235545, - "grad_norm": 0.19030036855437318, + "grad_norm": 0.20073292717441923, "learning_rate": 5.793650793650794e-06, - "loss": 0.2742, + "loss": 0.2717, "step": 1255 }, { "epoch": 2.689507494646681, - "grad_norm": 0.18217728969404254, + "grad_norm": 0.18795175480989068, "learning_rate": 5.753968253968254e-06, - "loss": 0.2798, + "loss": 0.2785, "step": 1256 }, { "epoch": 2.691648822269807, - "grad_norm": 0.17476001635638663, + "grad_norm": 0.1776226753510265, "learning_rate": 5.7142857142857145e-06, - "loss": 0.2736, + "loss": 0.2713, "step": 1257 }, { "epoch": 2.6937901498929335, - "grad_norm": 0.16928638733834248, + "grad_norm": 0.17299646249891704, "learning_rate": 5.674603174603174e-06, - "loss": 0.2695, + "loss": 0.2671, "step": 1258 }, { "epoch": 2.69593147751606, - "grad_norm": 0.17143125336122542, + "grad_norm": 0.1740946943196013, "learning_rate": 5.634920634920635e-06, - "loss": 0.2746, + "loss": 0.2724, "step": 1259 }, { "epoch": 2.6980728051391862, - "grad_norm": 0.17538867599551303, + "grad_norm": 0.17712448583628357, "learning_rate": 5.595238095238096e-06, - "loss": 0.2829, + "loss": 0.281, "step": 1260 }, { "epoch": 2.7002141327623126, - "grad_norm": 0.17983290126938117, + "grad_norm": 0.18442023632973145, "learning_rate": 5.555555555555556e-06, - "loss": 0.2702, + "loss": 0.2691, "step": 1261 }, { "epoch": 2.702355460385439, - "grad_norm": 0.17677089972588253, + "grad_norm": 0.1813575399048717, "learning_rate": 5.515873015873016e-06, - "loss": 0.2843, + "loss": 0.2823, "step": 1262 }, { "epoch": 2.7044967880085653, - "grad_norm": 0.1844485018306519, + "grad_norm": 0.18738697864152037, "learning_rate": 5.4761904761904765e-06, - "loss": 0.2813, + "loss": 0.2791, "step": 1263 }, { "epoch": 2.7066381156316917, - "grad_norm": 0.18659859307800958, + "grad_norm": 0.18736026207972067, "learning_rate": 5.436507936507937e-06, - "loss": 0.2795, + "loss": 0.2777, "step": 1264 }, { "epoch": 2.708779443254818, - "grad_norm": 0.17342140616127946, + "grad_norm": 0.1769303559004464, "learning_rate": 5.396825396825397e-06, - "loss": 0.2905, + "loss": 0.2891, "step": 1265 }, { "epoch": 2.7109207708779444, - "grad_norm": 0.1751230736265568, + "grad_norm": 0.18034406343578302, "learning_rate": 5.357142857142857e-06, - "loss": 0.2769, + "loss": 0.2749, "step": 1266 }, { "epoch": 2.7130620985010707, - "grad_norm": 0.18568051901207977, + "grad_norm": 0.18273873357915627, "learning_rate": 5.317460317460318e-06, - "loss": 0.2744, + "loss": 0.2725, "step": 1267 }, { "epoch": 2.715203426124197, - "grad_norm": 0.18188150911305687, + "grad_norm": 0.19359646334285074, "learning_rate": 5.277777777777778e-06, - "loss": 0.2811, + "loss": 0.2784, "step": 1268 }, { "epoch": 2.7173447537473234, - "grad_norm": 0.18160082816537443, + "grad_norm": 0.18794519478477986, "learning_rate": 5.2380952380952384e-06, - "loss": 0.2776, + "loss": 0.275, "step": 1269 }, { "epoch": 2.71948608137045, - "grad_norm": 0.178758160602488, + "grad_norm": 0.18233700769386646, "learning_rate": 5.198412698412699e-06, - "loss": 0.2651, + "loss": 0.2625, "step": 1270 }, { "epoch": 2.721627408993576, - "grad_norm": 0.17965972231368907, + "grad_norm": 0.1870952848168329, "learning_rate": 5.158730158730159e-06, - "loss": 0.2818, + "loss": 0.2792, "step": 1271 }, { "epoch": 2.7237687366167025, - "grad_norm": 0.19233800455601202, + "grad_norm": 0.19371058024908325, "learning_rate": 5.119047619047619e-06, - "loss": 0.2899, + "loss": 0.2885, "step": 1272 }, { "epoch": 2.725910064239829, - "grad_norm": 0.2193361858358134, + "grad_norm": 0.22178418486723533, "learning_rate": 5.07936507936508e-06, - "loss": 0.3, + "loss": 0.2994, "step": 1273 }, { "epoch": 2.728051391862955, - "grad_norm": 0.19832192608275423, + "grad_norm": 0.2033197772755529, "learning_rate": 5.03968253968254e-06, - "loss": 0.2809, + "loss": 0.2798, "step": 1274 }, { "epoch": 2.730192719486081, - "grad_norm": 0.18205252478668657, + "grad_norm": 0.18764851342867586, "learning_rate": 5e-06, - "loss": 0.276, + "loss": 0.2739, "step": 1275 }, { "epoch": 2.732334047109208, - "grad_norm": 0.18256411369477107, + "grad_norm": 0.18851110490571485, "learning_rate": 4.96031746031746e-06, - "loss": 0.2827, + "loss": 0.2807, "step": 1276 }, { "epoch": 2.734475374732334, - "grad_norm": 0.174235840356806, + "grad_norm": 0.1860752013848117, "learning_rate": 4.920634920634921e-06, - "loss": 0.2824, + "loss": 0.2808, "step": 1277 }, { "epoch": 2.7366167023554606, - "grad_norm": 0.18107414207263575, + "grad_norm": 0.1913864953828912, "learning_rate": 4.880952380952381e-06, - "loss": 0.287, + "loss": 0.2857, "step": 1278 }, { "epoch": 2.7387580299785865, - "grad_norm": 0.1843871245288681, + "grad_norm": 0.18956585829358527, "learning_rate": 4.841269841269842e-06, - "loss": 0.2948, + "loss": 0.2929, "step": 1279 }, { "epoch": 2.7408993576017133, - "grad_norm": 0.18283457450751056, + "grad_norm": 0.18168087559276616, "learning_rate": 4.8015873015873025e-06, - "loss": 0.2657, + "loss": 0.2639, "step": 1280 }, { "epoch": 2.7430406852248392, - "grad_norm": 0.1802921251741436, + "grad_norm": 0.18483118773286475, "learning_rate": 4.7619047619047615e-06, - "loss": 0.2813, + "loss": 0.2796, "step": 1281 }, { "epoch": 2.7451820128479656, - "grad_norm": 0.17894436649475992, + "grad_norm": 0.18660137821522485, "learning_rate": 4.722222222222222e-06, - "loss": 0.2775, + "loss": 0.2767, "step": 1282 }, { "epoch": 2.747323340471092, - "grad_norm": 0.1744670375408823, + "grad_norm": 0.1870410587872287, "learning_rate": 4.682539682539683e-06, - "loss": 0.2629, + "loss": 0.2608, "step": 1283 }, { "epoch": 2.7494646680942183, - "grad_norm": 0.18257816801514468, + "grad_norm": 0.18934420087113804, "learning_rate": 4.642857142857143e-06, - "loss": 0.2803, + "loss": 0.2776, "step": 1284 }, { "epoch": 2.7516059957173447, - "grad_norm": 0.19392465112933904, + "grad_norm": 0.1944710962243241, "learning_rate": 4.603174603174604e-06, - "loss": 0.2836, + "loss": 0.2819, "step": 1285 }, { "epoch": 2.753747323340471, - "grad_norm": 0.18598309828859388, + "grad_norm": 0.18507420980354902, "learning_rate": 4.563492063492064e-06, - "loss": 0.2703, + "loss": 0.2675, "step": 1286 }, { "epoch": 2.7558886509635974, - "grad_norm": 0.1844599683925985, + "grad_norm": 0.18671295615639139, "learning_rate": 4.5238095238095235e-06, - "loss": 0.2928, + "loss": 0.2919, "step": 1287 }, { "epoch": 2.7580299785867237, - "grad_norm": 0.17453928950018807, + "grad_norm": 0.1820586621003967, "learning_rate": 4.484126984126984e-06, - "loss": 0.2772, + "loss": 0.276, "step": 1288 }, { "epoch": 2.76017130620985, - "grad_norm": 0.1825967298341021, + "grad_norm": 0.18863107638734553, "learning_rate": 4.444444444444445e-06, - "loss": 0.2837, + "loss": 0.2812, "step": 1289 }, { "epoch": 2.7623126338329764, - "grad_norm": 0.17501549592815271, + "grad_norm": 0.1886951420800492, "learning_rate": 4.404761904761905e-06, - "loss": 0.2708, + "loss": 0.2684, "step": 1290 }, { "epoch": 2.764453961456103, - "grad_norm": 0.17613103586860301, + "grad_norm": 0.18624344207988158, "learning_rate": 4.365079365079365e-06, - "loss": 0.2812, + "loss": 0.28, "step": 1291 }, { "epoch": 2.766595289079229, - "grad_norm": 0.18330117899392398, + "grad_norm": 0.18959188951160139, "learning_rate": 4.3253968253968256e-06, - "loss": 0.2606, + "loss": 0.2583, "step": 1292 }, { "epoch": 2.7687366167023555, - "grad_norm": 0.18120028818225953, + "grad_norm": 0.18636470840156413, "learning_rate": 4.285714285714286e-06, - "loss": 0.284, + "loss": 0.2816, "step": 1293 }, { "epoch": 2.770877944325482, - "grad_norm": 0.17679996406713563, + "grad_norm": 0.18043083965620976, "learning_rate": 4.246031746031746e-06, - "loss": 0.2772, + "loss": 0.2754, "step": 1294 }, { "epoch": 2.773019271948608, - "grad_norm": 0.17472312513614138, + "grad_norm": 0.18037368839844325, "learning_rate": 4.206349206349207e-06, - "loss": 0.281, + "loss": 0.279, "step": 1295 }, { "epoch": 2.7751605995717346, - "grad_norm": 0.17058631787650447, + "grad_norm": 0.1823817127914405, "learning_rate": 4.166666666666667e-06, - "loss": 0.2867, + "loss": 0.2854, "step": 1296 }, { "epoch": 2.777301927194861, - "grad_norm": 0.1724204672936674, + "grad_norm": 0.18058730214933147, "learning_rate": 4.126984126984127e-06, - "loss": 0.2753, + "loss": 0.2734, "step": 1297 }, { "epoch": 2.7794432548179873, - "grad_norm": 0.1783817683336482, + "grad_norm": 0.1879979042288386, "learning_rate": 4.0873015873015875e-06, - "loss": 0.2734, + "loss": 0.2717, "step": 1298 }, { "epoch": 2.7815845824411136, - "grad_norm": 0.1768963927750415, + "grad_norm": 0.18341480124861972, "learning_rate": 4.047619047619048e-06, - "loss": 0.286, + "loss": 0.2836, "step": 1299 }, { "epoch": 2.78372591006424, - "grad_norm": 0.1785831360758202, + "grad_norm": 0.18324668375068373, "learning_rate": 4.007936507936508e-06, - "loss": 0.288, + "loss": 0.286, "step": 1300 }, { "epoch": 2.7858672376873663, - "grad_norm": 0.1973578170792123, + "grad_norm": 0.1965187754215459, "learning_rate": 3.968253968253968e-06, - "loss": 0.2851, + "loss": 0.2823, "step": 1301 }, { "epoch": 2.7880085653104922, - "grad_norm": 0.17966086659509484, + "grad_norm": 0.18573889524253487, "learning_rate": 3.928571428571429e-06, - "loss": 0.2764, + "loss": 0.2751, "step": 1302 }, { "epoch": 2.790149892933619, - "grad_norm": 0.18832071036234518, + "grad_norm": 0.19265696353956446, "learning_rate": 3.888888888888889e-06, - "loss": 0.2722, + "loss": 0.2699, "step": 1303 }, { "epoch": 2.792291220556745, - "grad_norm": 0.17616870935040177, + "grad_norm": 0.19781851093500513, "learning_rate": 3.8492063492063495e-06, - "loss": 0.2799, + "loss": 0.2779, "step": 1304 }, { "epoch": 2.7944325481798717, - "grad_norm": 0.18136072174228499, + "grad_norm": 0.18360741758477603, "learning_rate": 3.8095238095238102e-06, - "loss": 0.28, + "loss": 0.2787, "step": 1305 }, { "epoch": 2.7965738758029977, - "grad_norm": 0.19260217036977118, + "grad_norm": 0.1862707572751917, "learning_rate": 3.7698412698412697e-06, - "loss": 0.2795, + "loss": 0.2774, "step": 1306 }, { "epoch": 2.7987152034261245, - "grad_norm": 0.17281606530646676, + "grad_norm": 0.17847216017441006, "learning_rate": 3.7301587301587305e-06, - "loss": 0.2694, + "loss": 0.2669, "step": 1307 }, { "epoch": 2.8008565310492504, - "grad_norm": 0.17227995573571817, + "grad_norm": 0.1743513654767412, "learning_rate": 3.690476190476191e-06, - "loss": 0.2642, + "loss": 0.2616, "step": 1308 }, { "epoch": 2.8029978586723767, - "grad_norm": 0.17354690029249242, + "grad_norm": 0.17678834999880497, "learning_rate": 3.650793650793651e-06, - "loss": 0.2743, + "loss": 0.2729, "step": 1309 }, { "epoch": 2.805139186295503, - "grad_norm": 0.18596920138748899, + "grad_norm": 0.18147041110133913, "learning_rate": 3.611111111111111e-06, - "loss": 0.2666, + "loss": 0.265, "step": 1310 }, { "epoch": 2.8072805139186294, - "grad_norm": 0.19988273525925654, + "grad_norm": 0.19719201740239473, "learning_rate": 3.5714285714285714e-06, - "loss": 0.2939, + "loss": 0.2923, "step": 1311 }, { "epoch": 2.809421841541756, - "grad_norm": 0.18393678561262797, + "grad_norm": 0.1876418916737588, "learning_rate": 3.5317460317460317e-06, - "loss": 0.2927, + "loss": 0.2909, "step": 1312 }, { "epoch": 2.811563169164882, - "grad_norm": 0.18018691120657057, + "grad_norm": 0.18222086562470918, "learning_rate": 3.4920634920634924e-06, - "loss": 0.2664, + "loss": 0.2643, "step": 1313 }, { "epoch": 2.8137044967880085, - "grad_norm": 0.18530712590206283, + "grad_norm": 0.18368483933597352, "learning_rate": 3.4523809523809528e-06, - "loss": 0.277, + "loss": 0.2758, "step": 1314 }, { "epoch": 2.815845824411135, - "grad_norm": 0.18226232243626736, + "grad_norm": 0.1837715941113332, "learning_rate": 3.4126984126984127e-06, - "loss": 0.281, + "loss": 0.2787, "step": 1315 }, { "epoch": 2.817987152034261, - "grad_norm": 0.19031200668984619, + "grad_norm": 0.19049420307445103, "learning_rate": 3.373015873015873e-06, - "loss": 0.2866, + "loss": 0.2847, "step": 1316 }, { "epoch": 2.8201284796573876, - "grad_norm": 0.1746390900856267, + "grad_norm": 0.17835342017317368, "learning_rate": 3.3333333333333333e-06, - "loss": 0.2805, + "loss": 0.2786, "step": 1317 }, { "epoch": 2.822269807280514, - "grad_norm": 0.17409052183537008, + "grad_norm": 0.17787960285957102, "learning_rate": 3.293650793650794e-06, - "loss": 0.2748, + "loss": 0.2727, "step": 1318 }, { "epoch": 2.8244111349036403, - "grad_norm": 0.1782862674279465, + "grad_norm": 0.17862229300209337, "learning_rate": 3.2539682539682544e-06, - "loss": 0.271, + "loss": 0.2695, "step": 1319 }, { "epoch": 2.8265524625267666, - "grad_norm": 0.18225874553079588, + "grad_norm": 0.1901757951555972, "learning_rate": 3.2142857142857143e-06, - "loss": 0.2602, + "loss": 0.2575, "step": 1320 }, { "epoch": 2.828693790149893, - "grad_norm": 0.17783389559399063, + "grad_norm": 0.18103134737351187, "learning_rate": 3.1746031746031746e-06, - "loss": 0.2762, + "loss": 0.2746, "step": 1321 }, { "epoch": 2.8308351177730193, - "grad_norm": 0.17855207309001997, + "grad_norm": 0.18702283268180547, "learning_rate": 3.134920634920635e-06, - "loss": 0.2872, + "loss": 0.2861, "step": 1322 }, { "epoch": 2.8329764453961457, - "grad_norm": 0.17935885821626021, + "grad_norm": 0.17858016179861205, "learning_rate": 3.0952380952380953e-06, - "loss": 0.2794, + "loss": 0.2767, "step": 1323 }, { "epoch": 2.835117773019272, - "grad_norm": 0.18232652024256538, + "grad_norm": 0.18620061639621535, "learning_rate": 3.0555555555555556e-06, - "loss": 0.2831, + "loss": 0.2815, "step": 1324 }, { "epoch": 2.8372591006423984, - "grad_norm": 0.17948962840127805, + "grad_norm": 0.1814788975276678, "learning_rate": 3.015873015873016e-06, - "loss": 0.2776, + "loss": 0.2767, "step": 1325 }, { "epoch": 2.8394004282655247, - "grad_norm": 0.18464576865432494, + "grad_norm": 0.18778455877330474, "learning_rate": 2.9761904761904763e-06, - "loss": 0.2941, + "loss": 0.2928, "step": 1326 }, { "epoch": 2.841541755888651, - "grad_norm": 0.17028939672073803, + "grad_norm": 0.17098509883295968, "learning_rate": 2.9365079365079366e-06, - "loss": 0.2672, + "loss": 0.2648, "step": 1327 }, { "epoch": 2.8436830835117775, - "grad_norm": 0.16916196038993978, + "grad_norm": 0.17520171745909632, "learning_rate": 2.896825396825397e-06, - "loss": 0.2741, + "loss": 0.2724, "step": 1328 }, { "epoch": 2.8458244111349034, - "grad_norm": 0.175350985222415, + "grad_norm": 0.17536544537693816, "learning_rate": 2.8571428571428573e-06, - "loss": 0.2796, + "loss": 0.2783, "step": 1329 }, { "epoch": 2.84796573875803, - "grad_norm": 0.17515671194072407, + "grad_norm": 0.17658589120185325, "learning_rate": 2.8174603174603176e-06, - "loss": 0.2775, + "loss": 0.2758, "step": 1330 }, { "epoch": 2.850107066381156, - "grad_norm": 0.17259697407853658, + "grad_norm": 0.17567307855848976, "learning_rate": 2.777777777777778e-06, - "loss": 0.2688, + "loss": 0.2668, "step": 1331 }, { "epoch": 2.852248394004283, - "grad_norm": 0.1739659486779704, + "grad_norm": 0.1806128444041324, "learning_rate": 2.7380952380952382e-06, - "loss": 0.2842, + "loss": 0.2835, "step": 1332 }, { "epoch": 2.854389721627409, - "grad_norm": 0.1897848574714334, + "grad_norm": 0.18946086509767565, "learning_rate": 2.6984126984126986e-06, - "loss": 0.2736, + "loss": 0.271, "step": 1333 }, { "epoch": 2.8565310492505356, - "grad_norm": 0.17285981110921295, + "grad_norm": 0.17379093474956162, "learning_rate": 2.658730158730159e-06, - "loss": 0.2689, + "loss": 0.2673, "step": 1334 }, { "epoch": 2.8586723768736615, - "grad_norm": 0.18566863960917998, + "grad_norm": 0.18610236327347768, "learning_rate": 2.6190476190476192e-06, - "loss": 0.2656, + "loss": 0.2635, "step": 1335 }, { "epoch": 2.860813704496788, - "grad_norm": 0.17490830341679464, + "grad_norm": 0.1773051015057536, "learning_rate": 2.5793650793650795e-06, - "loss": 0.282, + "loss": 0.2814, "step": 1336 }, { "epoch": 2.862955032119914, - "grad_norm": 0.16847205946436153, + "grad_norm": 0.16809841728125763, "learning_rate": 2.53968253968254e-06, - "loss": 0.2686, + "loss": 0.267, "step": 1337 }, { "epoch": 2.8650963597430406, - "grad_norm": 0.167671663902774, + "grad_norm": 0.1737833908046207, "learning_rate": 2.5e-06, - "loss": 0.2723, + "loss": 0.2701, "step": 1338 }, { "epoch": 2.867237687366167, - "grad_norm": 0.17505967809467687, + "grad_norm": 0.1748499028349341, "learning_rate": 2.4603174603174605e-06, - "loss": 0.2741, + "loss": 0.2723, "step": 1339 }, { "epoch": 2.8693790149892933, - "grad_norm": 0.18340786011909083, + "grad_norm": 0.182639754025226, "learning_rate": 2.420634920634921e-06, - "loss": 0.2929, + "loss": 0.2914, "step": 1340 }, { "epoch": 2.8715203426124196, - "grad_norm": 0.17940501576635812, + "grad_norm": 0.17803450959486153, "learning_rate": 2.3809523809523808e-06, - "loss": 0.2714, + "loss": 0.2689, "step": 1341 }, { "epoch": 2.873661670235546, - "grad_norm": 0.182484237285869, + "grad_norm": 0.1812081036704342, "learning_rate": 2.3412698412698415e-06, - "loss": 0.275, + "loss": 0.2725, "step": 1342 }, { "epoch": 2.8758029978586723, - "grad_norm": 0.18020814887479553, + "grad_norm": 0.17858190828209497, "learning_rate": 2.301587301587302e-06, - "loss": 0.2825, + "loss": 0.2809, "step": 1343 }, { "epoch": 2.8779443254817987, - "grad_norm": 0.17660806936645942, + "grad_norm": 0.17897541722080942, "learning_rate": 2.2619047619047617e-06, - "loss": 0.285, + "loss": 0.2827, "step": 1344 }, { "epoch": 2.880085653104925, - "grad_norm": 0.17617046995854874, + "grad_norm": 0.18620447467008244, "learning_rate": 2.2222222222222225e-06, - "loss": 0.2754, + "loss": 0.2739, "step": 1345 }, { "epoch": 2.8822269807280514, - "grad_norm": 0.1744513421687539, + "grad_norm": 0.17922254926198172, "learning_rate": 2.1825396825396824e-06, - "loss": 0.2951, + "loss": 0.2937, "step": 1346 }, { "epoch": 2.8843683083511777, - "grad_norm": 0.17176452515074214, + "grad_norm": 0.1733438564135853, "learning_rate": 2.142857142857143e-06, - "loss": 0.2668, + "loss": 0.2648, "step": 1347 }, { "epoch": 2.886509635974304, - "grad_norm": 0.17904296673736972, + "grad_norm": 0.18332268371523366, "learning_rate": 2.1031746031746035e-06, - "loss": 0.2701, + "loss": 0.2678, "step": 1348 }, { "epoch": 2.8886509635974305, - "grad_norm": 0.17647016428567208, + "grad_norm": 0.18120143996921909, "learning_rate": 2.0634920634920634e-06, - "loss": 0.2716, + "loss": 0.2703, "step": 1349 }, { "epoch": 2.890792291220557, - "grad_norm": 0.17235816276714014, + "grad_norm": 0.17603741176248058, "learning_rate": 2.023809523809524e-06, - "loss": 0.265, + "loss": 0.2634, "step": 1350 }, { "epoch": 2.892933618843683, - "grad_norm": 0.18658165461489193, + "grad_norm": 0.18073214958209138, "learning_rate": 1.984126984126984e-06, - "loss": 0.2799, + "loss": 0.2776, "step": 1351 }, { "epoch": 2.8950749464668095, - "grad_norm": 0.17186186390295335, + "grad_norm": 0.1765653541500017, "learning_rate": 1.9444444444444444e-06, - "loss": 0.2714, + "loss": 0.27, "step": 1352 }, { "epoch": 2.897216274089936, - "grad_norm": 0.17654423828340407, + "grad_norm": 0.17852720128484673, "learning_rate": 1.9047619047619051e-06, - "loss": 0.2748, + "loss": 0.2731, "step": 1353 }, { "epoch": 2.8993576017130622, - "grad_norm": 0.17779611321463967, + "grad_norm": 0.17920803523071938, "learning_rate": 1.8650793650793652e-06, - "loss": 0.2817, + "loss": 0.2803, "step": 1354 }, { "epoch": 2.9014989293361886, - "grad_norm": 0.16906995082417278, + "grad_norm": 0.17112784325767613, "learning_rate": 1.8253968253968256e-06, - "loss": 0.2779, + "loss": 0.2753, "step": 1355 }, { "epoch": 2.903640256959315, - "grad_norm": 0.1795175517079968, + "grad_norm": 0.18339378168166692, "learning_rate": 1.7857142857142857e-06, - "loss": 0.275, + "loss": 0.273, "step": 1356 }, { "epoch": 2.9057815845824413, - "grad_norm": 0.24069812689827963, + "grad_norm": 0.1984295346113488, "learning_rate": 1.7460317460317462e-06, - "loss": 0.2842, + "loss": 0.2796, "step": 1357 }, { "epoch": 2.907922912205567, - "grad_norm": 0.18261102094972528, + "grad_norm": 0.18306750932125437, "learning_rate": 1.7063492063492063e-06, - "loss": 0.2863, + "loss": 0.2844, "step": 1358 }, { "epoch": 2.910064239828694, - "grad_norm": 0.17916467521323348, + "grad_norm": 0.18197706558527013, "learning_rate": 1.6666666666666667e-06, - "loss": 0.2703, + "loss": 0.2685, "step": 1359 }, { "epoch": 2.91220556745182, - "grad_norm": 0.1684177935210992, + "grad_norm": 0.17423685740773412, "learning_rate": 1.6269841269841272e-06, - "loss": 0.2633, + "loss": 0.2614, "step": 1360 }, { "epoch": 2.9143468950749467, - "grad_norm": 0.17482795526886843, + "grad_norm": 0.17655186248410631, "learning_rate": 1.5873015873015873e-06, - "loss": 0.2733, + "loss": 0.27, "step": 1361 }, { "epoch": 2.9164882226980726, - "grad_norm": 0.17189677626587546, + "grad_norm": 0.17672011279618857, "learning_rate": 1.5476190476190476e-06, - "loss": 0.2786, + "loss": 0.2768, "step": 1362 }, { "epoch": 2.9186295503211994, - "grad_norm": 0.1793311737015449, + "grad_norm": 0.184344574950677, "learning_rate": 1.507936507936508e-06, - "loss": 0.3015, + "loss": 0.3001, "step": 1363 }, { "epoch": 2.9207708779443253, - "grad_norm": 0.1751443843257153, + "grad_norm": 0.17385175676196593, "learning_rate": 1.4682539682539683e-06, - "loss": 0.2701, + "loss": 0.2687, "step": 1364 }, { "epoch": 2.9229122055674517, - "grad_norm": 0.17885380423708533, + "grad_norm": 0.17861379692559234, "learning_rate": 1.4285714285714286e-06, - "loss": 0.2732, + "loss": 0.2719, "step": 1365 }, { "epoch": 2.925053533190578, - "grad_norm": 0.17570565566220792, + "grad_norm": 0.177882396776031, "learning_rate": 1.388888888888889e-06, - "loss": 0.2797, + "loss": 0.2772, "step": 1366 }, { "epoch": 2.9271948608137044, - "grad_norm": 0.1665971208193023, + "grad_norm": 0.16826587147555336, "learning_rate": 1.3492063492063493e-06, - "loss": 0.2726, + "loss": 0.2701, "step": 1367 }, { "epoch": 2.9293361884368307, - "grad_norm": 0.18707229378818965, + "grad_norm": 0.18519355648741595, "learning_rate": 1.3095238095238096e-06, - "loss": 0.3001, + "loss": 0.2992, "step": 1368 }, { "epoch": 2.931477516059957, - "grad_norm": 0.172875108848975, + "grad_norm": 0.1782680704296424, "learning_rate": 1.26984126984127e-06, - "loss": 0.2817, + "loss": 0.2801, "step": 1369 }, { "epoch": 2.9336188436830835, - "grad_norm": 0.17022686353646632, + "grad_norm": 0.17474930461615157, "learning_rate": 1.2301587301587303e-06, - "loss": 0.2776, + "loss": 0.2766, "step": 1370 }, { "epoch": 2.93576017130621, - "grad_norm": 0.17967995503693934, + "grad_norm": 0.18245581683863532, "learning_rate": 1.1904761904761904e-06, - "loss": 0.2683, + "loss": 0.2662, "step": 1371 }, { "epoch": 2.937901498929336, - "grad_norm": 0.17708372843749126, + "grad_norm": 0.17789694691861707, "learning_rate": 1.150793650793651e-06, - "loss": 0.2908, + "loss": 0.2885, "step": 1372 }, { "epoch": 2.9400428265524625, - "grad_norm": 0.16830115974661627, + "grad_norm": 0.1687387330562206, "learning_rate": 1.1111111111111112e-06, - "loss": 0.2561, + "loss": 0.2537, "step": 1373 }, { "epoch": 2.942184154175589, - "grad_norm": 0.17929164665415462, + "grad_norm": 0.18142938523732535, "learning_rate": 1.0714285714285716e-06, - "loss": 0.2714, + "loss": 0.2695, "step": 1374 }, { "epoch": 2.9443254817987152, - "grad_norm": 0.16840343711745376, + "grad_norm": 0.16915728914832637, "learning_rate": 1.0317460317460317e-06, - "loss": 0.2671, + "loss": 0.2641, "step": 1375 }, { "epoch": 2.9464668094218416, - "grad_norm": 0.17040496165809252, + "grad_norm": 0.1780681979316246, "learning_rate": 9.92063492063492e-07, - "loss": 0.2711, + "loss": 0.2693, "step": 1376 }, { "epoch": 2.948608137044968, - "grad_norm": 0.17750648853336623, + "grad_norm": 0.178537434553442, "learning_rate": 9.523809523809526e-07, - "loss": 0.2661, + "loss": 0.2635, "step": 1377 }, { "epoch": 2.9507494646680943, - "grad_norm": 0.17641859608757207, + "grad_norm": 0.17999898864703598, "learning_rate": 9.126984126984128e-07, - "loss": 0.282, + "loss": 0.2797, "step": 1378 }, { "epoch": 2.9528907922912206, - "grad_norm": 0.18027367970879757, + "grad_norm": 0.18157617617292318, "learning_rate": 8.730158730158731e-07, - "loss": 0.2795, + "loss": 0.2778, "step": 1379 }, { "epoch": 2.955032119914347, - "grad_norm": 0.1770116936691827, + "grad_norm": 0.180109121804787, "learning_rate": 8.333333333333333e-07, - "loss": 0.2846, + "loss": 0.2822, "step": 1380 }, { "epoch": 2.9571734475374734, - "grad_norm": 0.1718006942921136, + "grad_norm": 0.17585603613458994, "learning_rate": 7.936507936507937e-07, - "loss": 0.2812, + "loss": 0.2796, "step": 1381 }, { "epoch": 2.9593147751605997, - "grad_norm": 0.1770273339812149, + "grad_norm": 0.178180131081556, "learning_rate": 7.53968253968254e-07, - "loss": 0.2896, + "loss": 0.288, "step": 1382 }, { "epoch": 2.961456102783726, - "grad_norm": 0.17902070118474797, + "grad_norm": 0.174756761248787, "learning_rate": 7.142857142857143e-07, - "loss": 0.2978, + "loss": 0.2963, "step": 1383 }, { "epoch": 2.9635974304068524, - "grad_norm": 0.17533054931108796, + "grad_norm": 0.17507554933400857, "learning_rate": 6.746031746031746e-07, - "loss": 0.2769, + "loss": 0.2743, "step": 1384 }, { "epoch": 2.9657387580299783, - "grad_norm": 0.16993493071735802, + "grad_norm": 0.17237340079111105, "learning_rate": 6.34920634920635e-07, - "loss": 0.2827, + "loss": 0.2812, "step": 1385 }, { "epoch": 2.967880085653105, - "grad_norm": 0.17684972171074492, + "grad_norm": 0.17800514856704938, "learning_rate": 5.952380952380952e-07, - "loss": 0.2917, + "loss": 0.2901, "step": 1386 }, { "epoch": 2.970021413276231, - "grad_norm": 0.17897804806977222, + "grad_norm": 0.17518183885385935, "learning_rate": 5.555555555555556e-07, - "loss": 0.2834, + "loss": 0.2817, "step": 1387 }, { "epoch": 2.972162740899358, - "grad_norm": 0.19557715172028603, + "grad_norm": 0.19209259654412175, "learning_rate": 5.158730158730158e-07, - "loss": 0.3017, + "loss": 0.2979, "step": 1388 }, { "epoch": 2.9743040685224837, - "grad_norm": 0.17627182121205032, + "grad_norm": 0.18038864732034635, "learning_rate": 4.761904761904763e-07, - "loss": 0.2767, + "loss": 0.2758, "step": 1389 }, { "epoch": 2.9764453961456105, - "grad_norm": 0.17369058443780255, + "grad_norm": 0.17455696050193842, "learning_rate": 4.3650793650793655e-07, - "loss": 0.2798, + "loss": 0.2783, "step": 1390 }, { "epoch": 2.9785867237687365, - "grad_norm": 0.1740941045189046, + "grad_norm": 0.17453553372749056, "learning_rate": 3.9682539682539683e-07, - "loss": 0.2683, + "loss": 0.2662, "step": 1391 }, { "epoch": 2.980728051391863, - "grad_norm": 0.17915684223215836, + "grad_norm": 0.17577598361771224, "learning_rate": 3.5714285714285716e-07, - "loss": 0.2873, + "loss": 0.2856, "step": 1392 }, { "epoch": 2.982869379014989, - "grad_norm": 0.1679332636215031, + "grad_norm": 0.16869697010362938, "learning_rate": 3.174603174603175e-07, - "loss": 0.2745, + "loss": 0.2733, "step": 1393 }, { "epoch": 2.9850107066381155, - "grad_norm": 0.17729248198099018, + "grad_norm": 0.1776277204829826, "learning_rate": 2.777777777777778e-07, - "loss": 0.2756, + "loss": 0.2734, "step": 1394 }, { "epoch": 2.987152034261242, - "grad_norm": 0.16814830396556607, + "grad_norm": 0.17086305942740904, "learning_rate": 2.3809523809523814e-07, - "loss": 0.2713, + "loss": 0.2697, "step": 1395 }, { "epoch": 2.9892933618843682, - "grad_norm": 0.1765432649703903, + "grad_norm": 0.18182256735600724, "learning_rate": 1.9841269841269841e-07, - "loss": 0.2875, + "loss": 0.2862, "step": 1396 }, { "epoch": 2.9914346895074946, - "grad_norm": 0.1741654729465487, + "grad_norm": 0.1740762738414256, "learning_rate": 1.5873015873015874e-07, - "loss": 0.2909, + "loss": 0.2888, "step": 1397 }, { "epoch": 2.993576017130621, - "grad_norm": 0.1709774041473495, + "grad_norm": 0.1737350102448509, "learning_rate": 1.1904761904761907e-07, - "loss": 0.2796, + "loss": 0.2777, "step": 1398 }, { "epoch": 2.9957173447537473, - "grad_norm": 0.1642668435658554, + "grad_norm": 0.16946025736659603, "learning_rate": 7.936507936507937e-08, - "loss": 0.267, + "loss": 0.2651, "step": 1399 }, { "epoch": 2.9978586723768736, - "grad_norm": 0.17312311494944055, + "grad_norm": 0.1722572338000261, "learning_rate": 3.9682539682539686e-08, - "loss": 0.2641, + "loss": 0.261, "step": 1400 }, { "epoch": 3.0, - "grad_norm": 0.17221087745505198, + "grad_norm": 0.17642900461088157, "learning_rate": 0.0, - "loss": 0.2679, + "loss": 0.2644, "step": 1401 }, { "epoch": 3.0, "step": 1401, "total_flos": 1.5578375880118895e+19, - "train_loss": 0.39975321637486494, - "train_runtime": 116840.5239, - "train_samples_per_second": 0.192, - "train_steps_per_second": 0.012 + "train_loss": 0.4592563231913725, + "train_runtime": 43286.824, + "train_samples_per_second": 0.518, + "train_steps_per_second": 0.032 } ], "logging_steps": 1,