diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14532 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 2070, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0014492753623188406, + "grad_norm": 48.2703755078483, + "learning_rate": 2.4154589371980677e-07, + "loss": 11.412, + "step": 1 + }, + { + "epoch": 0.002898550724637681, + "grad_norm": 47.846609744441345, + "learning_rate": 4.830917874396135e-07, + "loss": 11.3179, + "step": 2 + }, + { + "epoch": 0.004347826086956522, + "grad_norm": 43.327843265354325, + "learning_rate": 7.246376811594203e-07, + "loss": 11.7074, + "step": 3 + }, + { + "epoch": 0.005797101449275362, + "grad_norm": 45.996221072245085, + "learning_rate": 9.66183574879227e-07, + "loss": 11.5688, + "step": 4 + }, + { + "epoch": 0.007246376811594203, + "grad_norm": 44.80245082664995, + "learning_rate": 1.2077294685990338e-06, + "loss": 11.5525, + "step": 5 + }, + { + "epoch": 0.008695652173913044, + "grad_norm": 43.7912268141257, + "learning_rate": 1.4492753623188406e-06, + "loss": 11.6803, + "step": 6 + }, + { + "epoch": 0.010144927536231883, + "grad_norm": 46.19726966065695, + "learning_rate": 1.6908212560386474e-06, + "loss": 11.4856, + "step": 7 + }, + { + "epoch": 0.011594202898550725, + "grad_norm": 49.683280956555635, + "learning_rate": 1.932367149758454e-06, + "loss": 11.2487, + "step": 8 + }, + { + "epoch": 0.013043478260869565, + "grad_norm": 49.5432312831397, + "learning_rate": 2.173913043478261e-06, + "loss": 11.3125, + "step": 9 + }, + { + "epoch": 0.014492753623188406, + "grad_norm": 170.48068034517684, + "learning_rate": 2.4154589371980677e-06, + "loss": 10.4208, + "step": 10 + }, + { + "epoch": 0.015942028985507246, + "grad_norm": 74.68814324825067, + "learning_rate": 2.6570048309178746e-06, + "loss": 10.1322, + "step": 11 + }, + { + "epoch": 0.017391304347826087, + "grad_norm": 76.52431188574639, + "learning_rate": 2.898550724637681e-06, + "loss": 9.677, + "step": 12 + }, + { + "epoch": 0.01884057971014493, + "grad_norm": 96.8305482953425, + "learning_rate": 3.140096618357488e-06, + "loss": 6.0848, + "step": 13 + }, + { + "epoch": 0.020289855072463767, + "grad_norm": 79.49651285333789, + "learning_rate": 3.3816425120772947e-06, + "loss": 5.2455, + "step": 14 + }, + { + "epoch": 0.021739130434782608, + "grad_norm": 70.92706207793557, + "learning_rate": 3.6231884057971017e-06, + "loss": 4.555, + "step": 15 + }, + { + "epoch": 0.02318840579710145, + "grad_norm": 52.332573285132604, + "learning_rate": 3.864734299516908e-06, + "loss": 3.6889, + "step": 16 + }, + { + "epoch": 0.02463768115942029, + "grad_norm": 41.41102395269734, + "learning_rate": 4.106280193236716e-06, + "loss": 3.0651, + "step": 17 + }, + { + "epoch": 0.02608695652173913, + "grad_norm": 7.477039232121631, + "learning_rate": 4.347826086956522e-06, + "loss": 1.6705, + "step": 18 + }, + { + "epoch": 0.02753623188405797, + "grad_norm": 6.077802039052816, + "learning_rate": 4.589371980676329e-06, + "loss": 1.7725, + "step": 19 + }, + { + "epoch": 0.028985507246376812, + "grad_norm": 5.2869318809884875, + "learning_rate": 4.830917874396135e-06, + "loss": 1.6962, + "step": 20 + }, + { + "epoch": 0.030434782608695653, + "grad_norm": 4.168623701247975, + "learning_rate": 5.072463768115943e-06, + "loss": 1.4515, + "step": 21 + }, + { + "epoch": 0.03188405797101449, + "grad_norm": 3.1618053994702673, + "learning_rate": 5.314009661835749e-06, + "loss": 1.4379, + "step": 22 + }, + { + "epoch": 0.03333333333333333, + "grad_norm": 2.8417104654582968, + "learning_rate": 5.555555555555556e-06, + "loss": 1.6453, + "step": 23 + }, + { + "epoch": 0.034782608695652174, + "grad_norm": 2.2491818177379757, + "learning_rate": 5.797101449275362e-06, + "loss": 1.6312, + "step": 24 + }, + { + "epoch": 0.036231884057971016, + "grad_norm": 1.5951285511686666, + "learning_rate": 6.038647342995169e-06, + "loss": 1.2319, + "step": 25 + }, + { + "epoch": 0.03768115942028986, + "grad_norm": 1.7838476788934974, + "learning_rate": 6.280193236714976e-06, + "loss": 1.4211, + "step": 26 + }, + { + "epoch": 0.0391304347826087, + "grad_norm": 4.11606765432239, + "learning_rate": 6.521739130434783e-06, + "loss": 1.2726, + "step": 27 + }, + { + "epoch": 0.04057971014492753, + "grad_norm": 1.1922376154128103, + "learning_rate": 6.7632850241545894e-06, + "loss": 1.3099, + "step": 28 + }, + { + "epoch": 0.042028985507246375, + "grad_norm": 0.8946299762295795, + "learning_rate": 7.004830917874397e-06, + "loss": 1.0997, + "step": 29 + }, + { + "epoch": 0.043478260869565216, + "grad_norm": 0.9128998822679193, + "learning_rate": 7.246376811594203e-06, + "loss": 1.1738, + "step": 30 + }, + { + "epoch": 0.04492753623188406, + "grad_norm": 0.8261962791615375, + "learning_rate": 7.48792270531401e-06, + "loss": 1.1265, + "step": 31 + }, + { + "epoch": 0.0463768115942029, + "grad_norm": 0.7259010338295714, + "learning_rate": 7.729468599033817e-06, + "loss": 1.0761, + "step": 32 + }, + { + "epoch": 0.04782608695652174, + "grad_norm": 0.8144150263048314, + "learning_rate": 7.971014492753623e-06, + "loss": 1.1418, + "step": 33 + }, + { + "epoch": 0.04927536231884058, + "grad_norm": 0.6225984168130394, + "learning_rate": 8.212560386473431e-06, + "loss": 1.0675, + "step": 34 + }, + { + "epoch": 0.050724637681159424, + "grad_norm": 0.6957019400861381, + "learning_rate": 8.454106280193238e-06, + "loss": 1.1917, + "step": 35 + }, + { + "epoch": 0.05217391304347826, + "grad_norm": 0.6473409624233152, + "learning_rate": 8.695652173913044e-06, + "loss": 0.9325, + "step": 36 + }, + { + "epoch": 0.0536231884057971, + "grad_norm": 0.6816748318384368, + "learning_rate": 8.93719806763285e-06, + "loss": 1.2146, + "step": 37 + }, + { + "epoch": 0.05507246376811594, + "grad_norm": 0.6277804431834189, + "learning_rate": 9.178743961352658e-06, + "loss": 1.0025, + "step": 38 + }, + { + "epoch": 0.05652173913043478, + "grad_norm": 0.5421011053554157, + "learning_rate": 9.420289855072464e-06, + "loss": 1.0601, + "step": 39 + }, + { + "epoch": 0.057971014492753624, + "grad_norm": 0.5106889084966296, + "learning_rate": 9.66183574879227e-06, + "loss": 0.9464, + "step": 40 + }, + { + "epoch": 0.059420289855072465, + "grad_norm": 0.5149567843509089, + "learning_rate": 9.903381642512077e-06, + "loss": 1.0016, + "step": 41 + }, + { + "epoch": 0.06086956521739131, + "grad_norm": 0.5161093222444725, + "learning_rate": 1.0144927536231885e-05, + "loss": 0.9742, + "step": 42 + }, + { + "epoch": 0.06231884057971015, + "grad_norm": 0.4872914232682372, + "learning_rate": 1.0386473429951692e-05, + "loss": 0.9586, + "step": 43 + }, + { + "epoch": 0.06376811594202898, + "grad_norm": 0.5184224626160103, + "learning_rate": 1.0628019323671499e-05, + "loss": 0.9966, + "step": 44 + }, + { + "epoch": 0.06521739130434782, + "grad_norm": 0.43452221873592267, + "learning_rate": 1.0869565217391305e-05, + "loss": 1.0099, + "step": 45 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.48744214274446906, + "learning_rate": 1.1111111111111112e-05, + "loss": 0.9639, + "step": 46 + }, + { + "epoch": 0.06811594202898551, + "grad_norm": 7.312818671660048, + "learning_rate": 1.1352657004830918e-05, + "loss": 0.995, + "step": 47 + }, + { + "epoch": 0.06956521739130435, + "grad_norm": 0.541581425101709, + "learning_rate": 1.1594202898550725e-05, + "loss": 0.9325, + "step": 48 + }, + { + "epoch": 0.07101449275362319, + "grad_norm": 0.4503734798990069, + "learning_rate": 1.1835748792270531e-05, + "loss": 0.868, + "step": 49 + }, + { + "epoch": 0.07246376811594203, + "grad_norm": 0.4886820292319067, + "learning_rate": 1.2077294685990338e-05, + "loss": 0.9599, + "step": 50 + }, + { + "epoch": 0.07391304347826087, + "grad_norm": 0.42811723185783795, + "learning_rate": 1.2318840579710146e-05, + "loss": 0.8892, + "step": 51 + }, + { + "epoch": 0.07536231884057971, + "grad_norm": 0.42226359715459455, + "learning_rate": 1.2560386473429953e-05, + "loss": 0.9332, + "step": 52 + }, + { + "epoch": 0.07681159420289856, + "grad_norm": 0.41341607893203236, + "learning_rate": 1.2801932367149761e-05, + "loss": 0.8111, + "step": 53 + }, + { + "epoch": 0.0782608695652174, + "grad_norm": 0.407434689365239, + "learning_rate": 1.3043478260869566e-05, + "loss": 0.9552, + "step": 54 + }, + { + "epoch": 0.07971014492753623, + "grad_norm": 0.3510103557036314, + "learning_rate": 1.3285024154589374e-05, + "loss": 0.9455, + "step": 55 + }, + { + "epoch": 0.08115942028985507, + "grad_norm": 0.4560728778977622, + "learning_rate": 1.3526570048309179e-05, + "loss": 0.8894, + "step": 56 + }, + { + "epoch": 0.08260869565217391, + "grad_norm": 0.36250775032726457, + "learning_rate": 1.3768115942028985e-05, + "loss": 0.7497, + "step": 57 + }, + { + "epoch": 0.08405797101449275, + "grad_norm": 0.3762659087593294, + "learning_rate": 1.4009661835748794e-05, + "loss": 0.8474, + "step": 58 + }, + { + "epoch": 0.08550724637681159, + "grad_norm": 0.36500198106850623, + "learning_rate": 1.4251207729468599e-05, + "loss": 0.8414, + "step": 59 + }, + { + "epoch": 0.08695652173913043, + "grad_norm": 0.3293408454439321, + "learning_rate": 1.4492753623188407e-05, + "loss": 0.7616, + "step": 60 + }, + { + "epoch": 0.08840579710144927, + "grad_norm": 0.3510265716577906, + "learning_rate": 1.4734299516908212e-05, + "loss": 0.847, + "step": 61 + }, + { + "epoch": 0.08985507246376812, + "grad_norm": 0.3348155244843317, + "learning_rate": 1.497584541062802e-05, + "loss": 0.7797, + "step": 62 + }, + { + "epoch": 0.09130434782608696, + "grad_norm": 0.35285060969891996, + "learning_rate": 1.5217391304347828e-05, + "loss": 0.9457, + "step": 63 + }, + { + "epoch": 0.0927536231884058, + "grad_norm": 0.34433082535838777, + "learning_rate": 1.5458937198067633e-05, + "loss": 0.9697, + "step": 64 + }, + { + "epoch": 0.09420289855072464, + "grad_norm": 0.9324600519344617, + "learning_rate": 1.570048309178744e-05, + "loss": 0.9696, + "step": 65 + }, + { + "epoch": 0.09565217391304348, + "grad_norm": 0.331145083306618, + "learning_rate": 1.5942028985507246e-05, + "loss": 0.8468, + "step": 66 + }, + { + "epoch": 0.09710144927536232, + "grad_norm": 0.3959920805542888, + "learning_rate": 1.6183574879227054e-05, + "loss": 0.9872, + "step": 67 + }, + { + "epoch": 0.09855072463768116, + "grad_norm": 0.3398180883831577, + "learning_rate": 1.6425120772946863e-05, + "loss": 0.9103, + "step": 68 + }, + { + "epoch": 0.1, + "grad_norm": 0.3507626031123879, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.9097, + "step": 69 + }, + { + "epoch": 0.10144927536231885, + "grad_norm": 0.31255794956434924, + "learning_rate": 1.6908212560386476e-05, + "loss": 0.7564, + "step": 70 + }, + { + "epoch": 0.10289855072463767, + "grad_norm": 0.37074114591156665, + "learning_rate": 1.714975845410628e-05, + "loss": 0.8267, + "step": 71 + }, + { + "epoch": 0.10434782608695652, + "grad_norm": 0.3656485623827102, + "learning_rate": 1.739130434782609e-05, + "loss": 0.8924, + "step": 72 + }, + { + "epoch": 0.10579710144927536, + "grad_norm": 0.3226469968742567, + "learning_rate": 1.7632850241545894e-05, + "loss": 0.6622, + "step": 73 + }, + { + "epoch": 0.1072463768115942, + "grad_norm": 0.33712867700768034, + "learning_rate": 1.78743961352657e-05, + "loss": 0.7811, + "step": 74 + }, + { + "epoch": 0.10869565217391304, + "grad_norm": 0.34247268410396536, + "learning_rate": 1.8115942028985507e-05, + "loss": 0.7057, + "step": 75 + }, + { + "epoch": 0.11014492753623188, + "grad_norm": 0.3354659334989842, + "learning_rate": 1.8357487922705315e-05, + "loss": 0.8827, + "step": 76 + }, + { + "epoch": 0.11159420289855072, + "grad_norm": 0.3070922646052218, + "learning_rate": 1.859903381642512e-05, + "loss": 0.7294, + "step": 77 + }, + { + "epoch": 0.11304347826086956, + "grad_norm": 0.3138658882523287, + "learning_rate": 1.8840579710144928e-05, + "loss": 0.6904, + "step": 78 + }, + { + "epoch": 0.1144927536231884, + "grad_norm": 1.5216350994227146, + "learning_rate": 1.9082125603864733e-05, + "loss": 0.803, + "step": 79 + }, + { + "epoch": 0.11594202898550725, + "grad_norm": 0.3382431707970964, + "learning_rate": 1.932367149758454e-05, + "loss": 0.7899, + "step": 80 + }, + { + "epoch": 0.11739130434782609, + "grad_norm": 0.296008590542212, + "learning_rate": 1.956521739130435e-05, + "loss": 0.8161, + "step": 81 + }, + { + "epoch": 0.11884057971014493, + "grad_norm": 0.3689599195710936, + "learning_rate": 1.9806763285024154e-05, + "loss": 0.873, + "step": 82 + }, + { + "epoch": 0.12028985507246377, + "grad_norm": 0.3121552852325846, + "learning_rate": 2.0048309178743963e-05, + "loss": 0.7462, + "step": 83 + }, + { + "epoch": 0.12173913043478261, + "grad_norm": 0.3548758204484305, + "learning_rate": 2.028985507246377e-05, + "loss": 0.9198, + "step": 84 + }, + { + "epoch": 0.12318840579710146, + "grad_norm": 0.30541661182285873, + "learning_rate": 2.0531400966183576e-05, + "loss": 0.8544, + "step": 85 + }, + { + "epoch": 0.1246376811594203, + "grad_norm": 0.29774802882201346, + "learning_rate": 2.0772946859903384e-05, + "loss": 0.7522, + "step": 86 + }, + { + "epoch": 0.12608695652173912, + "grad_norm": 0.33320845974248064, + "learning_rate": 2.101449275362319e-05, + "loss": 0.7726, + "step": 87 + }, + { + "epoch": 0.12753623188405797, + "grad_norm": 0.305867939123411, + "learning_rate": 2.1256038647342997e-05, + "loss": 0.6971, + "step": 88 + }, + { + "epoch": 0.1289855072463768, + "grad_norm": 0.3687626183800535, + "learning_rate": 2.1497584541062805e-05, + "loss": 0.7586, + "step": 89 + }, + { + "epoch": 0.13043478260869565, + "grad_norm": 0.40655909646647576, + "learning_rate": 2.173913043478261e-05, + "loss": 0.86, + "step": 90 + }, + { + "epoch": 0.1318840579710145, + "grad_norm": 0.3184064183198976, + "learning_rate": 2.198067632850242e-05, + "loss": 0.7743, + "step": 91 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 4.585823826697241, + "learning_rate": 2.2222222222222223e-05, + "loss": 0.8626, + "step": 92 + }, + { + "epoch": 0.13478260869565217, + "grad_norm": 0.3487306040411324, + "learning_rate": 2.246376811594203e-05, + "loss": 0.8454, + "step": 93 + }, + { + "epoch": 0.13623188405797101, + "grad_norm": 0.37267592174262365, + "learning_rate": 2.2705314009661836e-05, + "loss": 0.8255, + "step": 94 + }, + { + "epoch": 0.13768115942028986, + "grad_norm": 0.32986815030036337, + "learning_rate": 2.294685990338164e-05, + "loss": 0.7607, + "step": 95 + }, + { + "epoch": 0.1391304347826087, + "grad_norm": 0.36689670642754957, + "learning_rate": 2.318840579710145e-05, + "loss": 0.7595, + "step": 96 + }, + { + "epoch": 0.14057971014492754, + "grad_norm": 0.3924500092697602, + "learning_rate": 2.3429951690821258e-05, + "loss": 0.7641, + "step": 97 + }, + { + "epoch": 0.14202898550724638, + "grad_norm": 0.3678298491507766, + "learning_rate": 2.3671497584541063e-05, + "loss": 0.8308, + "step": 98 + }, + { + "epoch": 0.14347826086956522, + "grad_norm": 0.4714024681413323, + "learning_rate": 2.391304347826087e-05, + "loss": 0.6769, + "step": 99 + }, + { + "epoch": 0.14492753623188406, + "grad_norm": 0.4412843647168353, + "learning_rate": 2.4154589371980676e-05, + "loss": 0.9089, + "step": 100 + }, + { + "epoch": 0.1463768115942029, + "grad_norm": 0.3625778040519065, + "learning_rate": 2.4396135265700484e-05, + "loss": 0.7376, + "step": 101 + }, + { + "epoch": 0.14782608695652175, + "grad_norm": 0.4200692042032697, + "learning_rate": 2.4637681159420292e-05, + "loss": 0.7893, + "step": 102 + }, + { + "epoch": 0.1492753623188406, + "grad_norm": 5.300388535004034, + "learning_rate": 2.4879227053140097e-05, + "loss": 0.7625, + "step": 103 + }, + { + "epoch": 0.15072463768115943, + "grad_norm": 0.5161561357287162, + "learning_rate": 2.5120772946859905e-05, + "loss": 0.8625, + "step": 104 + }, + { + "epoch": 0.15217391304347827, + "grad_norm": 0.35899298752423286, + "learning_rate": 2.5362318840579714e-05, + "loss": 0.7075, + "step": 105 + }, + { + "epoch": 0.1536231884057971, + "grad_norm": 0.436644448478288, + "learning_rate": 2.5603864734299522e-05, + "loss": 0.6964, + "step": 106 + }, + { + "epoch": 0.15507246376811595, + "grad_norm": 0.35851332585975537, + "learning_rate": 2.5845410628019323e-05, + "loss": 0.7348, + "step": 107 + }, + { + "epoch": 0.1565217391304348, + "grad_norm": 0.3838152624994046, + "learning_rate": 2.608695652173913e-05, + "loss": 0.8987, + "step": 108 + }, + { + "epoch": 0.15797101449275364, + "grad_norm": 0.3343603849395053, + "learning_rate": 2.632850241545894e-05, + "loss": 0.6829, + "step": 109 + }, + { + "epoch": 0.15942028985507245, + "grad_norm": 0.35962622420671014, + "learning_rate": 2.6570048309178748e-05, + "loss": 0.7414, + "step": 110 + }, + { + "epoch": 0.1608695652173913, + "grad_norm": 0.3757160134118775, + "learning_rate": 2.6811594202898553e-05, + "loss": 0.7767, + "step": 111 + }, + { + "epoch": 0.16231884057971013, + "grad_norm": 0.3469599966973019, + "learning_rate": 2.7053140096618358e-05, + "loss": 0.7929, + "step": 112 + }, + { + "epoch": 0.16376811594202897, + "grad_norm": 0.43902244288916725, + "learning_rate": 2.7294685990338166e-05, + "loss": 0.6906, + "step": 113 + }, + { + "epoch": 0.16521739130434782, + "grad_norm": 0.36775564724113863, + "learning_rate": 2.753623188405797e-05, + "loss": 0.8779, + "step": 114 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.4188102948097823, + "learning_rate": 2.777777777777778e-05, + "loss": 0.6696, + "step": 115 + }, + { + "epoch": 0.1681159420289855, + "grad_norm": 0.3387805292980607, + "learning_rate": 2.8019323671497587e-05, + "loss": 0.7831, + "step": 116 + }, + { + "epoch": 0.16956521739130434, + "grad_norm": 0.3598515255578289, + "learning_rate": 2.826086956521739e-05, + "loss": 0.7654, + "step": 117 + }, + { + "epoch": 0.17101449275362318, + "grad_norm": 0.39027009772473525, + "learning_rate": 2.8502415458937197e-05, + "loss": 0.6693, + "step": 118 + }, + { + "epoch": 0.17246376811594202, + "grad_norm": 0.3094539572612201, + "learning_rate": 2.8743961352657005e-05, + "loss": 0.6016, + "step": 119 + }, + { + "epoch": 0.17391304347826086, + "grad_norm": 0.42058909226444235, + "learning_rate": 2.8985507246376814e-05, + "loss": 0.6962, + "step": 120 + }, + { + "epoch": 0.1753623188405797, + "grad_norm": 0.34853589375868377, + "learning_rate": 2.9227053140096622e-05, + "loss": 0.7216, + "step": 121 + }, + { + "epoch": 0.17681159420289855, + "grad_norm": 0.4868735037868742, + "learning_rate": 2.9468599033816423e-05, + "loss": 0.7466, + "step": 122 + }, + { + "epoch": 0.1782608695652174, + "grad_norm": 0.4185621046394828, + "learning_rate": 2.971014492753623e-05, + "loss": 0.701, + "step": 123 + }, + { + "epoch": 0.17971014492753623, + "grad_norm": 0.4197568084080103, + "learning_rate": 2.995169082125604e-05, + "loss": 0.7263, + "step": 124 + }, + { + "epoch": 0.18115942028985507, + "grad_norm": 0.4657689848243366, + "learning_rate": 3.0193236714975848e-05, + "loss": 0.654, + "step": 125 + }, + { + "epoch": 0.1826086956521739, + "grad_norm": 0.37793581378328184, + "learning_rate": 3.0434782608695656e-05, + "loss": 0.6564, + "step": 126 + }, + { + "epoch": 0.18405797101449275, + "grad_norm": 0.434674876337351, + "learning_rate": 3.067632850241546e-05, + "loss": 0.7689, + "step": 127 + }, + { + "epoch": 0.1855072463768116, + "grad_norm": 0.48900991299205354, + "learning_rate": 3.0917874396135266e-05, + "loss": 0.7707, + "step": 128 + }, + { + "epoch": 0.18695652173913044, + "grad_norm": 0.5447607203343757, + "learning_rate": 3.1159420289855074e-05, + "loss": 0.6964, + "step": 129 + }, + { + "epoch": 0.18840579710144928, + "grad_norm": 0.4881199429761279, + "learning_rate": 3.140096618357488e-05, + "loss": 0.6979, + "step": 130 + }, + { + "epoch": 0.18985507246376812, + "grad_norm": 3.6888624601856974, + "learning_rate": 3.164251207729469e-05, + "loss": 0.7116, + "step": 131 + }, + { + "epoch": 0.19130434782608696, + "grad_norm": 0.8067506542863169, + "learning_rate": 3.188405797101449e-05, + "loss": 0.684, + "step": 132 + }, + { + "epoch": 0.1927536231884058, + "grad_norm": 0.4346182081032241, + "learning_rate": 3.21256038647343e-05, + "loss": 0.7532, + "step": 133 + }, + { + "epoch": 0.19420289855072465, + "grad_norm": 0.6270064201343746, + "learning_rate": 3.236714975845411e-05, + "loss": 0.6986, + "step": 134 + }, + { + "epoch": 0.1956521739130435, + "grad_norm": 0.4928401828984155, + "learning_rate": 3.260869565217392e-05, + "loss": 0.7408, + "step": 135 + }, + { + "epoch": 0.19710144927536233, + "grad_norm": 0.47674959216585067, + "learning_rate": 3.2850241545893725e-05, + "loss": 0.6962, + "step": 136 + }, + { + "epoch": 0.19855072463768117, + "grad_norm": 0.5262194433847822, + "learning_rate": 3.3091787439613533e-05, + "loss": 0.6421, + "step": 137 + }, + { + "epoch": 0.2, + "grad_norm": 0.5736351844030817, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.7716, + "step": 138 + }, + { + "epoch": 0.20144927536231885, + "grad_norm": 0.5025434605529145, + "learning_rate": 3.357487922705314e-05, + "loss": 0.7522, + "step": 139 + }, + { + "epoch": 0.2028985507246377, + "grad_norm": 0.5074291606372959, + "learning_rate": 3.381642512077295e-05, + "loss": 0.8226, + "step": 140 + }, + { + "epoch": 0.20434782608695654, + "grad_norm": 0.3974703015679785, + "learning_rate": 3.405797101449276e-05, + "loss": 0.7535, + "step": 141 + }, + { + "epoch": 0.20579710144927535, + "grad_norm": 0.45467619929714587, + "learning_rate": 3.429951690821256e-05, + "loss": 0.7729, + "step": 142 + }, + { + "epoch": 0.2072463768115942, + "grad_norm": 0.4084485817949974, + "learning_rate": 3.454106280193237e-05, + "loss": 0.7303, + "step": 143 + }, + { + "epoch": 0.20869565217391303, + "grad_norm": 0.41159007002019965, + "learning_rate": 3.478260869565218e-05, + "loss": 0.7145, + "step": 144 + }, + { + "epoch": 0.21014492753623187, + "grad_norm": 0.4973545025240137, + "learning_rate": 3.502415458937198e-05, + "loss": 0.7747, + "step": 145 + }, + { + "epoch": 0.21159420289855072, + "grad_norm": 0.3734469069038325, + "learning_rate": 3.526570048309179e-05, + "loss": 0.6809, + "step": 146 + }, + { + "epoch": 0.21304347826086956, + "grad_norm": 0.4058026991714768, + "learning_rate": 3.5507246376811596e-05, + "loss": 0.662, + "step": 147 + }, + { + "epoch": 0.2144927536231884, + "grad_norm": 1.0006350138668159, + "learning_rate": 3.57487922705314e-05, + "loss": 0.7111, + "step": 148 + }, + { + "epoch": 0.21594202898550724, + "grad_norm": 0.36130905180297707, + "learning_rate": 3.5990338164251205e-05, + "loss": 0.8164, + "step": 149 + }, + { + "epoch": 0.21739130434782608, + "grad_norm": 0.4391335573453258, + "learning_rate": 3.6231884057971014e-05, + "loss": 0.6806, + "step": 150 + }, + { + "epoch": 0.21884057971014492, + "grad_norm": 0.36378332102331984, + "learning_rate": 3.647342995169082e-05, + "loss": 0.672, + "step": 151 + }, + { + "epoch": 0.22028985507246376, + "grad_norm": 0.36408409425201227, + "learning_rate": 3.671497584541063e-05, + "loss": 0.6756, + "step": 152 + }, + { + "epoch": 0.2217391304347826, + "grad_norm": 0.3645988836273659, + "learning_rate": 3.695652173913043e-05, + "loss": 0.5685, + "step": 153 + }, + { + "epoch": 0.22318840579710145, + "grad_norm": 0.42301410248495724, + "learning_rate": 3.719806763285024e-05, + "loss": 0.6425, + "step": 154 + }, + { + "epoch": 0.2246376811594203, + "grad_norm": 0.363740011337443, + "learning_rate": 3.743961352657005e-05, + "loss": 0.6702, + "step": 155 + }, + { + "epoch": 0.22608695652173913, + "grad_norm": 0.4045601971692223, + "learning_rate": 3.7681159420289856e-05, + "loss": 0.6256, + "step": 156 + }, + { + "epoch": 0.22753623188405797, + "grad_norm": 0.41143862371080964, + "learning_rate": 3.7922705314009665e-05, + "loss": 0.7695, + "step": 157 + }, + { + "epoch": 0.2289855072463768, + "grad_norm": 0.3951097674703999, + "learning_rate": 3.8164251207729466e-05, + "loss": 0.7124, + "step": 158 + }, + { + "epoch": 0.23043478260869565, + "grad_norm": 0.3831302126355593, + "learning_rate": 3.8405797101449274e-05, + "loss": 0.746, + "step": 159 + }, + { + "epoch": 0.2318840579710145, + "grad_norm": 0.37142247727064054, + "learning_rate": 3.864734299516908e-05, + "loss": 0.726, + "step": 160 + }, + { + "epoch": 0.23333333333333334, + "grad_norm": 0.35813121793484276, + "learning_rate": 3.888888888888889e-05, + "loss": 0.597, + "step": 161 + }, + { + "epoch": 0.23478260869565218, + "grad_norm": 0.44255790588313515, + "learning_rate": 3.91304347826087e-05, + "loss": 0.6954, + "step": 162 + }, + { + "epoch": 0.23623188405797102, + "grad_norm": 0.3727301373909844, + "learning_rate": 3.937198067632851e-05, + "loss": 0.689, + "step": 163 + }, + { + "epoch": 0.23768115942028986, + "grad_norm": 0.48116777299429975, + "learning_rate": 3.961352657004831e-05, + "loss": 0.7812, + "step": 164 + }, + { + "epoch": 0.2391304347826087, + "grad_norm": 0.3455365306611197, + "learning_rate": 3.985507246376812e-05, + "loss": 0.7201, + "step": 165 + }, + { + "epoch": 0.24057971014492754, + "grad_norm": 0.48059498778631815, + "learning_rate": 4.0096618357487925e-05, + "loss": 0.7336, + "step": 166 + }, + { + "epoch": 0.24202898550724639, + "grad_norm": 0.3664497120347036, + "learning_rate": 4.0338164251207733e-05, + "loss": 0.6416, + "step": 167 + }, + { + "epoch": 0.24347826086956523, + "grad_norm": 0.5063263979771799, + "learning_rate": 4.057971014492754e-05, + "loss": 0.6918, + "step": 168 + }, + { + "epoch": 0.24492753623188407, + "grad_norm": 0.4636598172748528, + "learning_rate": 4.082125603864734e-05, + "loss": 0.7861, + "step": 169 + }, + { + "epoch": 0.2463768115942029, + "grad_norm": 0.5939118920382572, + "learning_rate": 4.106280193236715e-05, + "loss": 0.731, + "step": 170 + }, + { + "epoch": 0.24782608695652175, + "grad_norm": 0.70082387818902, + "learning_rate": 4.130434782608696e-05, + "loss": 0.7278, + "step": 171 + }, + { + "epoch": 0.2492753623188406, + "grad_norm": 0.4638036883744111, + "learning_rate": 4.154589371980677e-05, + "loss": 0.6726, + "step": 172 + }, + { + "epoch": 0.25072463768115943, + "grad_norm": 0.48095533230230175, + "learning_rate": 4.1787439613526576e-05, + "loss": 0.6673, + "step": 173 + }, + { + "epoch": 0.25217391304347825, + "grad_norm": 0.4690837413533762, + "learning_rate": 4.202898550724638e-05, + "loss": 0.653, + "step": 174 + }, + { + "epoch": 0.2536231884057971, + "grad_norm": 0.4458627462261377, + "learning_rate": 4.2270531400966186e-05, + "loss": 0.7793, + "step": 175 + }, + { + "epoch": 0.25507246376811593, + "grad_norm": 0.4189884078798217, + "learning_rate": 4.2512077294685994e-05, + "loss": 0.7243, + "step": 176 + }, + { + "epoch": 0.2565217391304348, + "grad_norm": 0.4651076969193504, + "learning_rate": 4.27536231884058e-05, + "loss": 0.6708, + "step": 177 + }, + { + "epoch": 0.2579710144927536, + "grad_norm": 0.42064317748362867, + "learning_rate": 4.299516908212561e-05, + "loss": 0.6885, + "step": 178 + }, + { + "epoch": 0.2594202898550725, + "grad_norm": 0.57387087269488, + "learning_rate": 4.323671497584541e-05, + "loss": 0.7926, + "step": 179 + }, + { + "epoch": 0.2608695652173913, + "grad_norm": 0.5215829127359812, + "learning_rate": 4.347826086956522e-05, + "loss": 0.7087, + "step": 180 + }, + { + "epoch": 0.26231884057971017, + "grad_norm": 0.5244092130860797, + "learning_rate": 4.371980676328503e-05, + "loss": 0.6009, + "step": 181 + }, + { + "epoch": 0.263768115942029, + "grad_norm": 0.40034582164953714, + "learning_rate": 4.396135265700484e-05, + "loss": 0.7362, + "step": 182 + }, + { + "epoch": 0.26521739130434785, + "grad_norm": 0.5219605123880174, + "learning_rate": 4.4202898550724645e-05, + "loss": 0.6995, + "step": 183 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.47609730352611157, + "learning_rate": 4.4444444444444447e-05, + "loss": 0.7462, + "step": 184 + }, + { + "epoch": 0.26811594202898553, + "grad_norm": 0.4763490286301335, + "learning_rate": 4.4685990338164255e-05, + "loss": 0.7037, + "step": 185 + }, + { + "epoch": 0.26956521739130435, + "grad_norm": 0.4654352184388157, + "learning_rate": 4.492753623188406e-05, + "loss": 0.7397, + "step": 186 + }, + { + "epoch": 0.2710144927536232, + "grad_norm": 0.5309678745932885, + "learning_rate": 4.5169082125603865e-05, + "loss": 0.7948, + "step": 187 + }, + { + "epoch": 0.27246376811594203, + "grad_norm": 0.4357391853306629, + "learning_rate": 4.541062801932367e-05, + "loss": 0.7201, + "step": 188 + }, + { + "epoch": 0.27391304347826084, + "grad_norm": 0.495806282088296, + "learning_rate": 4.565217391304348e-05, + "loss": 0.7081, + "step": 189 + }, + { + "epoch": 0.2753623188405797, + "grad_norm": 0.4734024179586802, + "learning_rate": 4.589371980676328e-05, + "loss": 0.5961, + "step": 190 + }, + { + "epoch": 0.2768115942028985, + "grad_norm": 0.5213770810315379, + "learning_rate": 4.613526570048309e-05, + "loss": 0.6825, + "step": 191 + }, + { + "epoch": 0.2782608695652174, + "grad_norm": 0.45246037733085614, + "learning_rate": 4.63768115942029e-05, + "loss": 0.6037, + "step": 192 + }, + { + "epoch": 0.2797101449275362, + "grad_norm": 0.6271967081898506, + "learning_rate": 4.661835748792271e-05, + "loss": 0.7358, + "step": 193 + }, + { + "epoch": 0.2811594202898551, + "grad_norm": 0.857140335440062, + "learning_rate": 4.6859903381642516e-05, + "loss": 0.7949, + "step": 194 + }, + { + "epoch": 0.2826086956521739, + "grad_norm": 0.9127639840782331, + "learning_rate": 4.710144927536232e-05, + "loss": 0.6545, + "step": 195 + }, + { + "epoch": 0.28405797101449276, + "grad_norm": 0.4959655662040491, + "learning_rate": 4.7342995169082125e-05, + "loss": 0.6034, + "step": 196 + }, + { + "epoch": 0.2855072463768116, + "grad_norm": 0.5178854616939212, + "learning_rate": 4.7584541062801933e-05, + "loss": 0.6657, + "step": 197 + }, + { + "epoch": 0.28695652173913044, + "grad_norm": 0.5052219461873171, + "learning_rate": 4.782608695652174e-05, + "loss": 0.6443, + "step": 198 + }, + { + "epoch": 0.28840579710144926, + "grad_norm": 0.4614518423922908, + "learning_rate": 4.806763285024155e-05, + "loss": 0.7269, + "step": 199 + }, + { + "epoch": 0.2898550724637681, + "grad_norm": 0.4586802080175379, + "learning_rate": 4.830917874396135e-05, + "loss": 0.7065, + "step": 200 + }, + { + "epoch": 0.29130434782608694, + "grad_norm": 0.4666949971627673, + "learning_rate": 4.855072463768116e-05, + "loss": 0.6757, + "step": 201 + }, + { + "epoch": 0.2927536231884058, + "grad_norm": 0.622849855674404, + "learning_rate": 4.879227053140097e-05, + "loss": 0.7404, + "step": 202 + }, + { + "epoch": 0.2942028985507246, + "grad_norm": 0.621911112221847, + "learning_rate": 4.9033816425120776e-05, + "loss": 0.7203, + "step": 203 + }, + { + "epoch": 0.2956521739130435, + "grad_norm": 0.42462020266152434, + "learning_rate": 4.9275362318840584e-05, + "loss": 0.7221, + "step": 204 + }, + { + "epoch": 0.2971014492753623, + "grad_norm": 0.7157909547004069, + "learning_rate": 4.9516908212560386e-05, + "loss": 0.7178, + "step": 205 + }, + { + "epoch": 0.2985507246376812, + "grad_norm": 0.47540747142242634, + "learning_rate": 4.9758454106280194e-05, + "loss": 0.687, + "step": 206 + }, + { + "epoch": 0.3, + "grad_norm": 0.5356933660212242, + "learning_rate": 5e-05, + "loss": 0.5966, + "step": 207 + }, + { + "epoch": 0.30144927536231886, + "grad_norm": 3.875707624149591, + "learning_rate": 4.9973161567364465e-05, + "loss": 0.7651, + "step": 208 + }, + { + "epoch": 0.30289855072463767, + "grad_norm": 0.7348013333692176, + "learning_rate": 4.9946323134728935e-05, + "loss": 0.7256, + "step": 209 + }, + { + "epoch": 0.30434782608695654, + "grad_norm": 0.552654765255837, + "learning_rate": 4.99194847020934e-05, + "loss": 0.6954, + "step": 210 + }, + { + "epoch": 0.30579710144927535, + "grad_norm": 0.7049833226099168, + "learning_rate": 4.989264626945787e-05, + "loss": 0.7656, + "step": 211 + }, + { + "epoch": 0.3072463768115942, + "grad_norm": 0.5935672598341638, + "learning_rate": 4.986580783682233e-05, + "loss": 0.6958, + "step": 212 + }, + { + "epoch": 0.30869565217391304, + "grad_norm": 0.6495195288245664, + "learning_rate": 4.98389694041868e-05, + "loss": 0.6997, + "step": 213 + }, + { + "epoch": 0.3101449275362319, + "grad_norm": 0.7912017825608941, + "learning_rate": 4.981213097155126e-05, + "loss": 0.7651, + "step": 214 + }, + { + "epoch": 0.3115942028985507, + "grad_norm": 0.6081697492207544, + "learning_rate": 4.978529253891573e-05, + "loss": 0.741, + "step": 215 + }, + { + "epoch": 0.3130434782608696, + "grad_norm": 0.5237190794202718, + "learning_rate": 4.9758454106280194e-05, + "loss": 0.6351, + "step": 216 + }, + { + "epoch": 0.3144927536231884, + "grad_norm": 0.6124946170096812, + "learning_rate": 4.9731615673644664e-05, + "loss": 0.6809, + "step": 217 + }, + { + "epoch": 0.3159420289855073, + "grad_norm": 0.5039329545767305, + "learning_rate": 4.9704777241009126e-05, + "loss": 0.6485, + "step": 218 + }, + { + "epoch": 0.3173913043478261, + "grad_norm": 0.54854349900959, + "learning_rate": 4.967793880837359e-05, + "loss": 0.7133, + "step": 219 + }, + { + "epoch": 0.3188405797101449, + "grad_norm": 0.5217856945206872, + "learning_rate": 4.965110037573806e-05, + "loss": 0.78, + "step": 220 + }, + { + "epoch": 0.32028985507246377, + "grad_norm": 0.5466245482758063, + "learning_rate": 4.962426194310252e-05, + "loss": 0.7367, + "step": 221 + }, + { + "epoch": 0.3217391304347826, + "grad_norm": 0.7282345105417176, + "learning_rate": 4.959742351046699e-05, + "loss": 0.6954, + "step": 222 + }, + { + "epoch": 0.32318840579710145, + "grad_norm": 0.516419359601005, + "learning_rate": 4.9570585077831454e-05, + "loss": 0.7519, + "step": 223 + }, + { + "epoch": 0.32463768115942027, + "grad_norm": 0.5211427796582248, + "learning_rate": 4.954374664519592e-05, + "loss": 0.6564, + "step": 224 + }, + { + "epoch": 0.32608695652173914, + "grad_norm": 0.42978384225909405, + "learning_rate": 4.9516908212560386e-05, + "loss": 0.687, + "step": 225 + }, + { + "epoch": 0.32753623188405795, + "grad_norm": 0.44939955255847236, + "learning_rate": 4.9490069779924855e-05, + "loss": 0.6175, + "step": 226 + }, + { + "epoch": 0.3289855072463768, + "grad_norm": 0.4805257834794986, + "learning_rate": 4.946323134728932e-05, + "loss": 0.5891, + "step": 227 + }, + { + "epoch": 0.33043478260869563, + "grad_norm": 0.5560744932815476, + "learning_rate": 4.943639291465378e-05, + "loss": 0.7353, + "step": 228 + }, + { + "epoch": 0.3318840579710145, + "grad_norm": 0.3961756192417976, + "learning_rate": 4.940955448201825e-05, + "loss": 0.6504, + "step": 229 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.5300190615241616, + "learning_rate": 4.938271604938271e-05, + "loss": 0.6147, + "step": 230 + }, + { + "epoch": 0.3347826086956522, + "grad_norm": 0.4281542999977893, + "learning_rate": 4.935587761674719e-05, + "loss": 0.7576, + "step": 231 + }, + { + "epoch": 0.336231884057971, + "grad_norm": 0.6674575417873713, + "learning_rate": 4.932903918411165e-05, + "loss": 0.7837, + "step": 232 + }, + { + "epoch": 0.33768115942028987, + "grad_norm": 0.5981661539007347, + "learning_rate": 4.930220075147612e-05, + "loss": 0.6746, + "step": 233 + }, + { + "epoch": 0.3391304347826087, + "grad_norm": 0.41904669721076937, + "learning_rate": 4.9275362318840584e-05, + "loss": 0.6213, + "step": 234 + }, + { + "epoch": 0.34057971014492755, + "grad_norm": 0.5140483469439551, + "learning_rate": 4.924852388620505e-05, + "loss": 0.7331, + "step": 235 + }, + { + "epoch": 0.34202898550724636, + "grad_norm": 0.4612364066752335, + "learning_rate": 4.922168545356952e-05, + "loss": 0.6362, + "step": 236 + }, + { + "epoch": 0.34347826086956523, + "grad_norm": 0.4498508963425885, + "learning_rate": 4.919484702093398e-05, + "loss": 0.6164, + "step": 237 + }, + { + "epoch": 0.34492753623188405, + "grad_norm": 0.5586129985408278, + "learning_rate": 4.916800858829845e-05, + "loss": 0.7438, + "step": 238 + }, + { + "epoch": 0.3463768115942029, + "grad_norm": 0.5030472932879021, + "learning_rate": 4.914117015566291e-05, + "loss": 0.7163, + "step": 239 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 0.4559732003400084, + "learning_rate": 4.911433172302738e-05, + "loss": 0.5819, + "step": 240 + }, + { + "epoch": 0.3492753623188406, + "grad_norm": 0.4878474979693718, + "learning_rate": 4.9087493290391844e-05, + "loss": 0.6758, + "step": 241 + }, + { + "epoch": 0.3507246376811594, + "grad_norm": 0.6615032146067258, + "learning_rate": 4.9060654857756313e-05, + "loss": 0.7171, + "step": 242 + }, + { + "epoch": 0.3521739130434783, + "grad_norm": 0.5930208248990055, + "learning_rate": 4.9033816425120776e-05, + "loss": 0.7232, + "step": 243 + }, + { + "epoch": 0.3536231884057971, + "grad_norm": 0.48395298139388776, + "learning_rate": 4.9006977992485246e-05, + "loss": 0.769, + "step": 244 + }, + { + "epoch": 0.35507246376811596, + "grad_norm": 0.5502774529332332, + "learning_rate": 4.898013955984971e-05, + "loss": 0.5951, + "step": 245 + }, + { + "epoch": 0.3565217391304348, + "grad_norm": 0.6227102239291648, + "learning_rate": 4.895330112721417e-05, + "loss": 0.6301, + "step": 246 + }, + { + "epoch": 0.35797101449275365, + "grad_norm": 0.4496872881250766, + "learning_rate": 4.892646269457864e-05, + "loss": 0.7381, + "step": 247 + }, + { + "epoch": 0.35942028985507246, + "grad_norm": 0.3902252305394972, + "learning_rate": 4.8899624261943103e-05, + "loss": 0.6779, + "step": 248 + }, + { + "epoch": 0.36086956521739133, + "grad_norm": 0.4953935450077986, + "learning_rate": 4.887278582930757e-05, + "loss": 0.6582, + "step": 249 + }, + { + "epoch": 0.36231884057971014, + "grad_norm": 0.44386606193131384, + "learning_rate": 4.8845947396672036e-05, + "loss": 0.6119, + "step": 250 + }, + { + "epoch": 0.36376811594202896, + "grad_norm": 1.957189039544279, + "learning_rate": 4.8819108964036505e-05, + "loss": 0.6398, + "step": 251 + }, + { + "epoch": 0.3652173913043478, + "grad_norm": 0.47286371385458287, + "learning_rate": 4.879227053140097e-05, + "loss": 0.6352, + "step": 252 + }, + { + "epoch": 0.36666666666666664, + "grad_norm": 0.49049957412021783, + "learning_rate": 4.876543209876544e-05, + "loss": 0.6178, + "step": 253 + }, + { + "epoch": 0.3681159420289855, + "grad_norm": 0.6559768674111185, + "learning_rate": 4.87385936661299e-05, + "loss": 0.6945, + "step": 254 + }, + { + "epoch": 0.3695652173913043, + "grad_norm": 0.6203105736124627, + "learning_rate": 4.871175523349436e-05, + "loss": 0.6242, + "step": 255 + }, + { + "epoch": 0.3710144927536232, + "grad_norm": 0.5057308068799966, + "learning_rate": 4.868491680085883e-05, + "loss": 0.608, + "step": 256 + }, + { + "epoch": 0.372463768115942, + "grad_norm": 0.4602451484685541, + "learning_rate": 4.8658078368223295e-05, + "loss": 0.6752, + "step": 257 + }, + { + "epoch": 0.3739130434782609, + "grad_norm": 0.5132163941533325, + "learning_rate": 4.8631239935587765e-05, + "loss": 0.6914, + "step": 258 + }, + { + "epoch": 0.3753623188405797, + "grad_norm": 0.5450742277436593, + "learning_rate": 4.860440150295223e-05, + "loss": 0.65, + "step": 259 + }, + { + "epoch": 0.37681159420289856, + "grad_norm": 0.47203214991964443, + "learning_rate": 4.85775630703167e-05, + "loss": 0.6754, + "step": 260 + }, + { + "epoch": 0.3782608695652174, + "grad_norm": 0.5761359721857935, + "learning_rate": 4.855072463768116e-05, + "loss": 0.7498, + "step": 261 + }, + { + "epoch": 0.37971014492753624, + "grad_norm": 0.417069826454048, + "learning_rate": 4.852388620504563e-05, + "loss": 0.7171, + "step": 262 + }, + { + "epoch": 0.38115942028985506, + "grad_norm": 0.35694630092374025, + "learning_rate": 4.849704777241009e-05, + "loss": 0.6517, + "step": 263 + }, + { + "epoch": 0.3826086956521739, + "grad_norm": 0.4651244997425789, + "learning_rate": 4.847020933977456e-05, + "loss": 0.6868, + "step": 264 + }, + { + "epoch": 0.38405797101449274, + "grad_norm": 0.4028313222745874, + "learning_rate": 4.8443370907139024e-05, + "loss": 0.7097, + "step": 265 + }, + { + "epoch": 0.3855072463768116, + "grad_norm": 0.4437827058159613, + "learning_rate": 4.841653247450349e-05, + "loss": 0.6755, + "step": 266 + }, + { + "epoch": 0.3869565217391304, + "grad_norm": 0.4244846215615247, + "learning_rate": 4.8389694041867956e-05, + "loss": 0.6864, + "step": 267 + }, + { + "epoch": 0.3884057971014493, + "grad_norm": 0.44310012910623525, + "learning_rate": 4.836285560923242e-05, + "loss": 0.6454, + "step": 268 + }, + { + "epoch": 0.3898550724637681, + "grad_norm": 0.4757642543300501, + "learning_rate": 4.833601717659689e-05, + "loss": 0.6639, + "step": 269 + }, + { + "epoch": 0.391304347826087, + "grad_norm": 0.565885386171182, + "learning_rate": 4.830917874396135e-05, + "loss": 0.799, + "step": 270 + }, + { + "epoch": 0.3927536231884058, + "grad_norm": 0.4100126299263926, + "learning_rate": 4.828234031132582e-05, + "loss": 0.6586, + "step": 271 + }, + { + "epoch": 0.39420289855072466, + "grad_norm": 0.4565019540369247, + "learning_rate": 4.8255501878690284e-05, + "loss": 0.6809, + "step": 272 + }, + { + "epoch": 0.39565217391304347, + "grad_norm": 0.4554139091690649, + "learning_rate": 4.822866344605475e-05, + "loss": 0.7114, + "step": 273 + }, + { + "epoch": 0.39710144927536234, + "grad_norm": 0.4724351999778902, + "learning_rate": 4.8201825013419216e-05, + "loss": 0.611, + "step": 274 + }, + { + "epoch": 0.39855072463768115, + "grad_norm": 0.5207385550926253, + "learning_rate": 4.817498658078368e-05, + "loss": 0.721, + "step": 275 + }, + { + "epoch": 0.4, + "grad_norm": 0.43019224883693835, + "learning_rate": 4.814814814814815e-05, + "loss": 0.7154, + "step": 276 + }, + { + "epoch": 0.40144927536231884, + "grad_norm": 0.522469033072573, + "learning_rate": 4.812130971551262e-05, + "loss": 0.7264, + "step": 277 + }, + { + "epoch": 0.4028985507246377, + "grad_norm": 0.4695809052920177, + "learning_rate": 4.809447128287709e-05, + "loss": 0.5917, + "step": 278 + }, + { + "epoch": 0.4043478260869565, + "grad_norm": 0.5312579249251256, + "learning_rate": 4.806763285024155e-05, + "loss": 0.6322, + "step": 279 + }, + { + "epoch": 0.4057971014492754, + "grad_norm": 0.6103259840908285, + "learning_rate": 4.804079441760602e-05, + "loss": 0.7255, + "step": 280 + }, + { + "epoch": 0.4072463768115942, + "grad_norm": 0.5509964882780453, + "learning_rate": 4.801395598497048e-05, + "loss": 0.6679, + "step": 281 + }, + { + "epoch": 0.40869565217391307, + "grad_norm": 0.6573044315013673, + "learning_rate": 4.7987117552334945e-05, + "loss": 0.6357, + "step": 282 + }, + { + "epoch": 0.4101449275362319, + "grad_norm": 0.5910903249453899, + "learning_rate": 4.7960279119699414e-05, + "loss": 0.5394, + "step": 283 + }, + { + "epoch": 0.4115942028985507, + "grad_norm": 0.47832362737303147, + "learning_rate": 4.793344068706388e-05, + "loss": 0.6773, + "step": 284 + }, + { + "epoch": 0.41304347826086957, + "grad_norm": 0.5548655349336106, + "learning_rate": 4.790660225442835e-05, + "loss": 0.6962, + "step": 285 + }, + { + "epoch": 0.4144927536231884, + "grad_norm": 0.4534853918571052, + "learning_rate": 4.787976382179281e-05, + "loss": 0.7101, + "step": 286 + }, + { + "epoch": 0.41594202898550725, + "grad_norm": 0.42141712239602674, + "learning_rate": 4.785292538915728e-05, + "loss": 0.6164, + "step": 287 + }, + { + "epoch": 0.41739130434782606, + "grad_norm": 0.5303006892740134, + "learning_rate": 4.782608695652174e-05, + "loss": 0.6017, + "step": 288 + }, + { + "epoch": 0.41884057971014493, + "grad_norm": 0.46211348435989263, + "learning_rate": 4.779924852388621e-05, + "loss": 0.6643, + "step": 289 + }, + { + "epoch": 0.42028985507246375, + "grad_norm": 0.5422397830559342, + "learning_rate": 4.7772410091250674e-05, + "loss": 0.6748, + "step": 290 + }, + { + "epoch": 0.4217391304347826, + "grad_norm": 0.45566020860996564, + "learning_rate": 4.7745571658615143e-05, + "loss": 0.5797, + "step": 291 + }, + { + "epoch": 0.42318840579710143, + "grad_norm": 0.7046915013882183, + "learning_rate": 4.7718733225979606e-05, + "loss": 0.6902, + "step": 292 + }, + { + "epoch": 0.4246376811594203, + "grad_norm": 0.5594416692480499, + "learning_rate": 4.769189479334407e-05, + "loss": 0.7175, + "step": 293 + }, + { + "epoch": 0.4260869565217391, + "grad_norm": 0.4916116646480955, + "learning_rate": 4.766505636070854e-05, + "loss": 0.5814, + "step": 294 + }, + { + "epoch": 0.427536231884058, + "grad_norm": 0.45352777639895914, + "learning_rate": 4.7638217928073e-05, + "loss": 0.6896, + "step": 295 + }, + { + "epoch": 0.4289855072463768, + "grad_norm": 0.5149754279803409, + "learning_rate": 4.761137949543747e-05, + "loss": 0.6643, + "step": 296 + }, + { + "epoch": 0.43043478260869567, + "grad_norm": 0.43351576199759795, + "learning_rate": 4.7584541062801933e-05, + "loss": 0.6787, + "step": 297 + }, + { + "epoch": 0.4318840579710145, + "grad_norm": 0.41889843156070783, + "learning_rate": 4.75577026301664e-05, + "loss": 0.5754, + "step": 298 + }, + { + "epoch": 0.43333333333333335, + "grad_norm": 0.5025185648234128, + "learning_rate": 4.7530864197530866e-05, + "loss": 0.7761, + "step": 299 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 0.4910170844080093, + "learning_rate": 4.7504025764895335e-05, + "loss": 0.6271, + "step": 300 + }, + { + "epoch": 0.43623188405797103, + "grad_norm": 0.4459249234150752, + "learning_rate": 4.74771873322598e-05, + "loss": 0.5621, + "step": 301 + }, + { + "epoch": 0.43768115942028984, + "grad_norm": 0.4130240042045532, + "learning_rate": 4.745034889962426e-05, + "loss": 0.6075, + "step": 302 + }, + { + "epoch": 0.4391304347826087, + "grad_norm": 0.5339425697493924, + "learning_rate": 4.742351046698873e-05, + "loss": 0.5985, + "step": 303 + }, + { + "epoch": 0.4405797101449275, + "grad_norm": 0.5662260384248383, + "learning_rate": 4.739667203435319e-05, + "loss": 0.7069, + "step": 304 + }, + { + "epoch": 0.4420289855072464, + "grad_norm": 0.5289530183900609, + "learning_rate": 4.736983360171766e-05, + "loss": 0.6578, + "step": 305 + }, + { + "epoch": 0.4434782608695652, + "grad_norm": 0.5621056697561709, + "learning_rate": 4.7342995169082125e-05, + "loss": 0.5806, + "step": 306 + }, + { + "epoch": 0.4449275362318841, + "grad_norm": 0.5028453643236157, + "learning_rate": 4.7316156736446595e-05, + "loss": 0.5989, + "step": 307 + }, + { + "epoch": 0.4463768115942029, + "grad_norm": 0.48129891568938116, + "learning_rate": 4.728931830381106e-05, + "loss": 0.6438, + "step": 308 + }, + { + "epoch": 0.44782608695652176, + "grad_norm": 0.5199119699799553, + "learning_rate": 4.726247987117553e-05, + "loss": 0.6594, + "step": 309 + }, + { + "epoch": 0.4492753623188406, + "grad_norm": 0.463370177698594, + "learning_rate": 4.723564143853999e-05, + "loss": 0.7021, + "step": 310 + }, + { + "epoch": 0.45072463768115945, + "grad_norm": 0.41751731613436227, + "learning_rate": 4.720880300590446e-05, + "loss": 0.6408, + "step": 311 + }, + { + "epoch": 0.45217391304347826, + "grad_norm": 0.41222115997598185, + "learning_rate": 4.718196457326892e-05, + "loss": 0.6486, + "step": 312 + }, + { + "epoch": 0.45362318840579713, + "grad_norm": 0.41762118003102466, + "learning_rate": 4.7155126140633385e-05, + "loss": 0.7012, + "step": 313 + }, + { + "epoch": 0.45507246376811594, + "grad_norm": 0.47083847264116285, + "learning_rate": 4.7128287707997854e-05, + "loss": 0.5933, + "step": 314 + }, + { + "epoch": 0.45652173913043476, + "grad_norm": 0.4385295480312083, + "learning_rate": 4.710144927536232e-05, + "loss": 0.5925, + "step": 315 + }, + { + "epoch": 0.4579710144927536, + "grad_norm": 0.47338682594708603, + "learning_rate": 4.7074610842726786e-05, + "loss": 0.6403, + "step": 316 + }, + { + "epoch": 0.45942028985507244, + "grad_norm": 0.449155602405838, + "learning_rate": 4.704777241009125e-05, + "loss": 0.6025, + "step": 317 + }, + { + "epoch": 0.4608695652173913, + "grad_norm": 0.4516855897509766, + "learning_rate": 4.702093397745572e-05, + "loss": 0.5849, + "step": 318 + }, + { + "epoch": 0.4623188405797101, + "grad_norm": 0.5606454482640691, + "learning_rate": 4.699409554482018e-05, + "loss": 0.6841, + "step": 319 + }, + { + "epoch": 0.463768115942029, + "grad_norm": 0.5070294712602395, + "learning_rate": 4.696725711218465e-05, + "loss": 0.6293, + "step": 320 + }, + { + "epoch": 0.4652173913043478, + "grad_norm": 0.37187122216983104, + "learning_rate": 4.6940418679549114e-05, + "loss": 0.595, + "step": 321 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.683077057258226, + "learning_rate": 4.691358024691358e-05, + "loss": 0.5657, + "step": 322 + }, + { + "epoch": 0.4681159420289855, + "grad_norm": 0.4701558290739009, + "learning_rate": 4.6886741814278046e-05, + "loss": 0.6412, + "step": 323 + }, + { + "epoch": 0.46956521739130436, + "grad_norm": 0.4085454812990897, + "learning_rate": 4.6859903381642516e-05, + "loss": 0.6615, + "step": 324 + }, + { + "epoch": 0.47101449275362317, + "grad_norm": 0.5311673769504186, + "learning_rate": 4.6833064949006985e-05, + "loss": 0.6267, + "step": 325 + }, + { + "epoch": 0.47246376811594204, + "grad_norm": 0.423024811889357, + "learning_rate": 4.680622651637145e-05, + "loss": 0.6453, + "step": 326 + }, + { + "epoch": 0.47391304347826085, + "grad_norm": 0.4560564760332426, + "learning_rate": 4.677938808373592e-05, + "loss": 0.6633, + "step": 327 + }, + { + "epoch": 0.4753623188405797, + "grad_norm": 0.36439217196706425, + "learning_rate": 4.675254965110038e-05, + "loss": 0.6128, + "step": 328 + }, + { + "epoch": 0.47681159420289854, + "grad_norm": 0.3919705007269534, + "learning_rate": 4.672571121846484e-05, + "loss": 0.6057, + "step": 329 + }, + { + "epoch": 0.4782608695652174, + "grad_norm": 0.3876474762734424, + "learning_rate": 4.669887278582931e-05, + "loss": 0.5812, + "step": 330 + }, + { + "epoch": 0.4797101449275362, + "grad_norm": 0.8537368376487238, + "learning_rate": 4.6672034353193775e-05, + "loss": 0.6831, + "step": 331 + }, + { + "epoch": 0.4811594202898551, + "grad_norm": 0.4126095686701784, + "learning_rate": 4.6645195920558245e-05, + "loss": 0.683, + "step": 332 + }, + { + "epoch": 0.4826086956521739, + "grad_norm": 0.4837042105681288, + "learning_rate": 4.661835748792271e-05, + "loss": 0.551, + "step": 333 + }, + { + "epoch": 0.48405797101449277, + "grad_norm": 0.43451962804846805, + "learning_rate": 4.659151905528718e-05, + "loss": 0.5953, + "step": 334 + }, + { + "epoch": 0.4855072463768116, + "grad_norm": 0.4254152211545657, + "learning_rate": 4.656468062265164e-05, + "loss": 0.6145, + "step": 335 + }, + { + "epoch": 0.48695652173913045, + "grad_norm": 0.5081602183421405, + "learning_rate": 4.653784219001611e-05, + "loss": 0.6101, + "step": 336 + }, + { + "epoch": 0.48840579710144927, + "grad_norm": 0.3677242041378604, + "learning_rate": 4.651100375738057e-05, + "loss": 0.628, + "step": 337 + }, + { + "epoch": 0.48985507246376814, + "grad_norm": 0.5736410512077793, + "learning_rate": 4.648416532474504e-05, + "loss": 0.7567, + "step": 338 + }, + { + "epoch": 0.49130434782608695, + "grad_norm": 0.3961758433255439, + "learning_rate": 4.6457326892109504e-05, + "loss": 0.5232, + "step": 339 + }, + { + "epoch": 0.4927536231884058, + "grad_norm": 0.6078832397392898, + "learning_rate": 4.643048845947397e-05, + "loss": 0.7044, + "step": 340 + }, + { + "epoch": 0.49420289855072463, + "grad_norm": 0.39627760424098896, + "learning_rate": 4.6403650026838436e-05, + "loss": 0.6417, + "step": 341 + }, + { + "epoch": 0.4956521739130435, + "grad_norm": 0.4734334069120371, + "learning_rate": 4.63768115942029e-05, + "loss": 0.6287, + "step": 342 + }, + { + "epoch": 0.4971014492753623, + "grad_norm": 0.46243156968178983, + "learning_rate": 4.634997316156737e-05, + "loss": 0.6103, + "step": 343 + }, + { + "epoch": 0.4985507246376812, + "grad_norm": 0.5433190757897186, + "learning_rate": 4.632313472893183e-05, + "loss": 0.5938, + "step": 344 + }, + { + "epoch": 0.5, + "grad_norm": 0.4559700829884716, + "learning_rate": 4.62962962962963e-05, + "loss": 0.679, + "step": 345 + }, + { + "epoch": 0.5014492753623189, + "grad_norm": 0.49482650146755597, + "learning_rate": 4.6269457863660764e-05, + "loss": 0.5917, + "step": 346 + }, + { + "epoch": 0.5028985507246376, + "grad_norm": 0.5029576885090551, + "learning_rate": 4.624261943102523e-05, + "loss": 0.5109, + "step": 347 + }, + { + "epoch": 0.5043478260869565, + "grad_norm": 0.3794583756550058, + "learning_rate": 4.6215780998389696e-05, + "loss": 0.6882, + "step": 348 + }, + { + "epoch": 0.5057971014492754, + "grad_norm": 0.49466579544121436, + "learning_rate": 4.6188942565754165e-05, + "loss": 0.5764, + "step": 349 + }, + { + "epoch": 0.5072463768115942, + "grad_norm": 0.4141689512026932, + "learning_rate": 4.616210413311863e-05, + "loss": 0.6417, + "step": 350 + }, + { + "epoch": 0.508695652173913, + "grad_norm": 0.4572258866434217, + "learning_rate": 4.613526570048309e-05, + "loss": 0.6427, + "step": 351 + }, + { + "epoch": 0.5101449275362319, + "grad_norm": 0.5119906485408763, + "learning_rate": 4.610842726784756e-05, + "loss": 0.6396, + "step": 352 + }, + { + "epoch": 0.5115942028985507, + "grad_norm": 0.43916535973559384, + "learning_rate": 4.608158883521202e-05, + "loss": 0.5667, + "step": 353 + }, + { + "epoch": 0.5130434782608696, + "grad_norm": 0.5789964597820626, + "learning_rate": 4.605475040257649e-05, + "loss": 0.5616, + "step": 354 + }, + { + "epoch": 0.5144927536231884, + "grad_norm": 0.3933092714425148, + "learning_rate": 4.6027911969940955e-05, + "loss": 0.603, + "step": 355 + }, + { + "epoch": 0.5159420289855072, + "grad_norm": 0.5437635404178742, + "learning_rate": 4.6001073537305425e-05, + "loss": 0.6244, + "step": 356 + }, + { + "epoch": 0.5173913043478261, + "grad_norm": 0.4887636002020938, + "learning_rate": 4.597423510466989e-05, + "loss": 0.6635, + "step": 357 + }, + { + "epoch": 0.518840579710145, + "grad_norm": 0.4363998278608295, + "learning_rate": 4.594739667203436e-05, + "loss": 0.5896, + "step": 358 + }, + { + "epoch": 0.5202898550724637, + "grad_norm": 0.4598576444127417, + "learning_rate": 4.592055823939882e-05, + "loss": 0.6361, + "step": 359 + }, + { + "epoch": 0.5217391304347826, + "grad_norm": 0.34761717274590787, + "learning_rate": 4.589371980676328e-05, + "loss": 0.5961, + "step": 360 + }, + { + "epoch": 0.5231884057971015, + "grad_norm": 0.4251983395550634, + "learning_rate": 4.586688137412775e-05, + "loss": 0.6047, + "step": 361 + }, + { + "epoch": 0.5246376811594203, + "grad_norm": 0.38041849327413135, + "learning_rate": 4.5840042941492215e-05, + "loss": 0.581, + "step": 362 + }, + { + "epoch": 0.5260869565217391, + "grad_norm": 0.3769564394943478, + "learning_rate": 4.5813204508856684e-05, + "loss": 0.5916, + "step": 363 + }, + { + "epoch": 0.527536231884058, + "grad_norm": 0.4060225292587437, + "learning_rate": 4.578636607622115e-05, + "loss": 0.6497, + "step": 364 + }, + { + "epoch": 0.5289855072463768, + "grad_norm": 0.3752767903004976, + "learning_rate": 4.5759527643585617e-05, + "loss": 0.6158, + "step": 365 + }, + { + "epoch": 0.5304347826086957, + "grad_norm": 0.39868767186559706, + "learning_rate": 4.573268921095008e-05, + "loss": 0.637, + "step": 366 + }, + { + "epoch": 0.5318840579710145, + "grad_norm": 0.4305347328957555, + "learning_rate": 4.570585077831455e-05, + "loss": 0.6199, + "step": 367 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.39277659041458757, + "learning_rate": 4.567901234567901e-05, + "loss": 0.6041, + "step": 368 + }, + { + "epoch": 0.5347826086956522, + "grad_norm": 0.4749667444026662, + "learning_rate": 4.565217391304348e-05, + "loss": 0.6974, + "step": 369 + }, + { + "epoch": 0.5362318840579711, + "grad_norm": 0.4391867067280921, + "learning_rate": 4.5625335480407944e-05, + "loss": 0.5768, + "step": 370 + }, + { + "epoch": 0.5376811594202898, + "grad_norm": 0.5090096020968206, + "learning_rate": 4.559849704777241e-05, + "loss": 0.5445, + "step": 371 + }, + { + "epoch": 0.5391304347826087, + "grad_norm": 0.4011524373138279, + "learning_rate": 4.557165861513688e-05, + "loss": 0.5444, + "step": 372 + }, + { + "epoch": 0.5405797101449276, + "grad_norm": 0.4558341949312885, + "learning_rate": 4.5544820182501346e-05, + "loss": 0.6478, + "step": 373 + }, + { + "epoch": 0.5420289855072464, + "grad_norm": 0.4568966746754348, + "learning_rate": 4.5517981749865815e-05, + "loss": 0.614, + "step": 374 + }, + { + "epoch": 0.5434782608695652, + "grad_norm": 0.4700753115968392, + "learning_rate": 4.549114331723028e-05, + "loss": 0.759, + "step": 375 + }, + { + "epoch": 0.5449275362318841, + "grad_norm": 0.3998862234622315, + "learning_rate": 4.546430488459474e-05, + "loss": 0.6338, + "step": 376 + }, + { + "epoch": 0.5463768115942029, + "grad_norm": 0.43440146434893584, + "learning_rate": 4.543746645195921e-05, + "loss": 0.5947, + "step": 377 + }, + { + "epoch": 0.5478260869565217, + "grad_norm": 0.3648968901901745, + "learning_rate": 4.541062801932367e-05, + "loss": 0.5506, + "step": 378 + }, + { + "epoch": 0.5492753623188406, + "grad_norm": 0.48579400196764083, + "learning_rate": 4.538378958668814e-05, + "loss": 0.6168, + "step": 379 + }, + { + "epoch": 0.5507246376811594, + "grad_norm": 0.7342657082099426, + "learning_rate": 4.5356951154052605e-05, + "loss": 0.6539, + "step": 380 + }, + { + "epoch": 0.5521739130434783, + "grad_norm": 0.45241959724408615, + "learning_rate": 4.5330112721417075e-05, + "loss": 0.5567, + "step": 381 + }, + { + "epoch": 0.553623188405797, + "grad_norm": 0.3986083042401516, + "learning_rate": 4.530327428878154e-05, + "loss": 0.6844, + "step": 382 + }, + { + "epoch": 0.5550724637681159, + "grad_norm": 0.6363551916780863, + "learning_rate": 4.527643585614601e-05, + "loss": 0.6956, + "step": 383 + }, + { + "epoch": 0.5565217391304348, + "grad_norm": 0.48150733103051285, + "learning_rate": 4.524959742351047e-05, + "loss": 0.6324, + "step": 384 + }, + { + "epoch": 0.5579710144927537, + "grad_norm": 0.49675319166001186, + "learning_rate": 4.522275899087494e-05, + "loss": 0.6708, + "step": 385 + }, + { + "epoch": 0.5594202898550724, + "grad_norm": 0.47708659349752586, + "learning_rate": 4.51959205582394e-05, + "loss": 0.685, + "step": 386 + }, + { + "epoch": 0.5608695652173913, + "grad_norm": 0.5000220688999433, + "learning_rate": 4.5169082125603865e-05, + "loss": 0.6275, + "step": 387 + }, + { + "epoch": 0.5623188405797102, + "grad_norm": 0.4368459518298929, + "learning_rate": 4.5142243692968334e-05, + "loss": 0.6844, + "step": 388 + }, + { + "epoch": 0.563768115942029, + "grad_norm": 0.4096537771062687, + "learning_rate": 4.51154052603328e-05, + "loss": 0.6328, + "step": 389 + }, + { + "epoch": 0.5652173913043478, + "grad_norm": 0.3334448687900693, + "learning_rate": 4.5088566827697266e-05, + "loss": 0.5817, + "step": 390 + }, + { + "epoch": 0.5666666666666667, + "grad_norm": 0.4819861650908369, + "learning_rate": 4.506172839506173e-05, + "loss": 0.6051, + "step": 391 + }, + { + "epoch": 0.5681159420289855, + "grad_norm": 0.43386350661453743, + "learning_rate": 4.50348899624262e-05, + "loss": 0.6212, + "step": 392 + }, + { + "epoch": 0.5695652173913044, + "grad_norm": 0.4542325204418208, + "learning_rate": 4.500805152979066e-05, + "loss": 0.598, + "step": 393 + }, + { + "epoch": 0.5710144927536231, + "grad_norm": 0.36353694572274375, + "learning_rate": 4.498121309715513e-05, + "loss": 0.6753, + "step": 394 + }, + { + "epoch": 0.572463768115942, + "grad_norm": 0.5541625024013656, + "learning_rate": 4.4954374664519594e-05, + "loss": 0.6362, + "step": 395 + }, + { + "epoch": 0.5739130434782609, + "grad_norm": 0.3919828315297418, + "learning_rate": 4.492753623188406e-05, + "loss": 0.625, + "step": 396 + }, + { + "epoch": 0.5753623188405798, + "grad_norm": 1.499546878942088, + "learning_rate": 4.4900697799248526e-05, + "loss": 0.5579, + "step": 397 + }, + { + "epoch": 0.5768115942028985, + "grad_norm": 0.556911484391141, + "learning_rate": 4.487385936661299e-05, + "loss": 0.6347, + "step": 398 + }, + { + "epoch": 0.5782608695652174, + "grad_norm": 0.45584289957280666, + "learning_rate": 4.484702093397746e-05, + "loss": 0.6767, + "step": 399 + }, + { + "epoch": 0.5797101449275363, + "grad_norm": 0.4956675178012224, + "learning_rate": 4.482018250134192e-05, + "loss": 0.5632, + "step": 400 + }, + { + "epoch": 0.5811594202898551, + "grad_norm": 0.44540912712042036, + "learning_rate": 4.479334406870639e-05, + "loss": 0.599, + "step": 401 + }, + { + "epoch": 0.5826086956521739, + "grad_norm": 0.42044316632261336, + "learning_rate": 4.476650563607085e-05, + "loss": 0.5734, + "step": 402 + }, + { + "epoch": 0.5840579710144927, + "grad_norm": 0.5107808787097907, + "learning_rate": 4.473966720343532e-05, + "loss": 0.7368, + "step": 403 + }, + { + "epoch": 0.5855072463768116, + "grad_norm": 0.4990397560807668, + "learning_rate": 4.4712828770799785e-05, + "loss": 0.6155, + "step": 404 + }, + { + "epoch": 0.5869565217391305, + "grad_norm": 0.4370235307040917, + "learning_rate": 4.4685990338164255e-05, + "loss": 0.6169, + "step": 405 + }, + { + "epoch": 0.5884057971014492, + "grad_norm": 0.37505407563553644, + "learning_rate": 4.465915190552872e-05, + "loss": 0.583, + "step": 406 + }, + { + "epoch": 0.5898550724637681, + "grad_norm": 0.4330379855434459, + "learning_rate": 4.463231347289318e-05, + "loss": 0.5817, + "step": 407 + }, + { + "epoch": 0.591304347826087, + "grad_norm": 0.4575635042170628, + "learning_rate": 4.460547504025765e-05, + "loss": 0.6586, + "step": 408 + }, + { + "epoch": 0.5927536231884057, + "grad_norm": 0.47572457302172977, + "learning_rate": 4.457863660762211e-05, + "loss": 0.6085, + "step": 409 + }, + { + "epoch": 0.5942028985507246, + "grad_norm": 0.3574119907944081, + "learning_rate": 4.455179817498658e-05, + "loss": 0.5559, + "step": 410 + }, + { + "epoch": 0.5956521739130435, + "grad_norm": 0.4178835165927953, + "learning_rate": 4.4524959742351045e-05, + "loss": 0.6125, + "step": 411 + }, + { + "epoch": 0.5971014492753624, + "grad_norm": 0.37512079629567197, + "learning_rate": 4.4498121309715514e-05, + "loss": 0.6084, + "step": 412 + }, + { + "epoch": 0.5985507246376811, + "grad_norm": 0.48633931038992023, + "learning_rate": 4.447128287707998e-05, + "loss": 0.5896, + "step": 413 + }, + { + "epoch": 0.6, + "grad_norm": 0.4796673589113237, + "learning_rate": 4.4444444444444447e-05, + "loss": 0.5768, + "step": 414 + }, + { + "epoch": 0.6014492753623188, + "grad_norm": 0.43446070994232633, + "learning_rate": 4.441760601180891e-05, + "loss": 0.5937, + "step": 415 + }, + { + "epoch": 0.6028985507246377, + "grad_norm": 0.40889150420962617, + "learning_rate": 4.439076757917338e-05, + "loss": 0.7413, + "step": 416 + }, + { + "epoch": 0.6043478260869565, + "grad_norm": 0.4408375168183745, + "learning_rate": 4.436392914653785e-05, + "loss": 0.6439, + "step": 417 + }, + { + "epoch": 0.6057971014492753, + "grad_norm": 0.4246703790309954, + "learning_rate": 4.433709071390231e-05, + "loss": 0.6989, + "step": 418 + }, + { + "epoch": 0.6072463768115942, + "grad_norm": 0.38710833762232205, + "learning_rate": 4.431025228126678e-05, + "loss": 0.5806, + "step": 419 + }, + { + "epoch": 0.6086956521739131, + "grad_norm": 0.43313968649036916, + "learning_rate": 4.428341384863124e-05, + "loss": 0.7066, + "step": 420 + }, + { + "epoch": 0.6101449275362318, + "grad_norm": 0.35913268364756945, + "learning_rate": 4.425657541599571e-05, + "loss": 0.6513, + "step": 421 + }, + { + "epoch": 0.6115942028985507, + "grad_norm": 0.47580547381182897, + "learning_rate": 4.4229736983360176e-05, + "loss": 0.6257, + "step": 422 + }, + { + "epoch": 0.6130434782608696, + "grad_norm": 0.37911697302582864, + "learning_rate": 4.4202898550724645e-05, + "loss": 0.5946, + "step": 423 + }, + { + "epoch": 0.6144927536231884, + "grad_norm": 0.44074764204295713, + "learning_rate": 4.417606011808911e-05, + "loss": 0.55, + "step": 424 + }, + { + "epoch": 0.6159420289855072, + "grad_norm": 0.3937375526845581, + "learning_rate": 4.414922168545357e-05, + "loss": 0.5371, + "step": 425 + }, + { + "epoch": 0.6173913043478261, + "grad_norm": 0.4573471158606008, + "learning_rate": 4.412238325281804e-05, + "loss": 0.6158, + "step": 426 + }, + { + "epoch": 0.618840579710145, + "grad_norm": 0.3840697276185448, + "learning_rate": 4.40955448201825e-05, + "loss": 0.6754, + "step": 427 + }, + { + "epoch": 0.6202898550724638, + "grad_norm": 0.5317382053849552, + "learning_rate": 4.406870638754697e-05, + "loss": 0.647, + "step": 428 + }, + { + "epoch": 0.6217391304347826, + "grad_norm": 0.34527497173414967, + "learning_rate": 4.4041867954911435e-05, + "loss": 0.5108, + "step": 429 + }, + { + "epoch": 0.6231884057971014, + "grad_norm": 0.44422906839362747, + "learning_rate": 4.4015029522275905e-05, + "loss": 0.6502, + "step": 430 + }, + { + "epoch": 0.6246376811594203, + "grad_norm": 0.473283146231613, + "learning_rate": 4.398819108964037e-05, + "loss": 0.63, + "step": 431 + }, + { + "epoch": 0.6260869565217392, + "grad_norm": 0.40163835413996013, + "learning_rate": 4.396135265700484e-05, + "loss": 0.6318, + "step": 432 + }, + { + "epoch": 0.6275362318840579, + "grad_norm": 0.39716935347056315, + "learning_rate": 4.39345142243693e-05, + "loss": 0.5803, + "step": 433 + }, + { + "epoch": 0.6289855072463768, + "grad_norm": 0.357753745770442, + "learning_rate": 4.390767579173376e-05, + "loss": 0.595, + "step": 434 + }, + { + "epoch": 0.6304347826086957, + "grad_norm": 0.3687517288111165, + "learning_rate": 4.388083735909823e-05, + "loss": 0.5644, + "step": 435 + }, + { + "epoch": 0.6318840579710145, + "grad_norm": 0.38181865324249625, + "learning_rate": 4.3853998926462695e-05, + "loss": 0.6464, + "step": 436 + }, + { + "epoch": 0.6333333333333333, + "grad_norm": 0.44474753275399415, + "learning_rate": 4.3827160493827164e-05, + "loss": 0.5878, + "step": 437 + }, + { + "epoch": 0.6347826086956522, + "grad_norm": 0.3352537728421649, + "learning_rate": 4.380032206119163e-05, + "loss": 0.5174, + "step": 438 + }, + { + "epoch": 0.636231884057971, + "grad_norm": 0.5132005847948536, + "learning_rate": 4.3773483628556096e-05, + "loss": 0.6436, + "step": 439 + }, + { + "epoch": 0.6376811594202898, + "grad_norm": 0.44956523639329127, + "learning_rate": 4.374664519592056e-05, + "loss": 0.6523, + "step": 440 + }, + { + "epoch": 0.6391304347826087, + "grad_norm": 0.3264095375727053, + "learning_rate": 4.371980676328503e-05, + "loss": 0.5701, + "step": 441 + }, + { + "epoch": 0.6405797101449275, + "grad_norm": 0.47943855397136076, + "learning_rate": 4.369296833064949e-05, + "loss": 0.6899, + "step": 442 + }, + { + "epoch": 0.6420289855072464, + "grad_norm": 0.37295675960299046, + "learning_rate": 4.366612989801396e-05, + "loss": 0.5943, + "step": 443 + }, + { + "epoch": 0.6434782608695652, + "grad_norm": 0.37607090500273194, + "learning_rate": 4.3639291465378424e-05, + "loss": 0.6024, + "step": 444 + }, + { + "epoch": 0.644927536231884, + "grad_norm": 0.3709434406889319, + "learning_rate": 4.3612453032742886e-05, + "loss": 0.5975, + "step": 445 + }, + { + "epoch": 0.6463768115942029, + "grad_norm": 0.4001494650314734, + "learning_rate": 4.3585614600107356e-05, + "loss": 0.6961, + "step": 446 + }, + { + "epoch": 0.6478260869565218, + "grad_norm": 0.3723471807287628, + "learning_rate": 4.355877616747182e-05, + "loss": 0.6208, + "step": 447 + }, + { + "epoch": 0.6492753623188405, + "grad_norm": 0.41592983850578813, + "learning_rate": 4.353193773483629e-05, + "loss": 0.6253, + "step": 448 + }, + { + "epoch": 0.6507246376811594, + "grad_norm": 0.39096102588060283, + "learning_rate": 4.350509930220075e-05, + "loss": 0.6357, + "step": 449 + }, + { + "epoch": 0.6521739130434783, + "grad_norm": 0.323828875516656, + "learning_rate": 4.347826086956522e-05, + "loss": 0.5617, + "step": 450 + }, + { + "epoch": 0.6536231884057971, + "grad_norm": 0.36464426117660076, + "learning_rate": 4.345142243692968e-05, + "loss": 0.6906, + "step": 451 + }, + { + "epoch": 0.6550724637681159, + "grad_norm": 0.3273186362011285, + "learning_rate": 4.342458400429415e-05, + "loss": 0.5552, + "step": 452 + }, + { + "epoch": 0.6565217391304348, + "grad_norm": 0.38576665695385565, + "learning_rate": 4.3397745571658615e-05, + "loss": 0.5915, + "step": 453 + }, + { + "epoch": 0.6579710144927536, + "grad_norm": 0.4312121223746291, + "learning_rate": 4.337090713902308e-05, + "loss": 0.622, + "step": 454 + }, + { + "epoch": 0.6594202898550725, + "grad_norm": 0.35218029913587057, + "learning_rate": 4.334406870638755e-05, + "loss": 0.6485, + "step": 455 + }, + { + "epoch": 0.6608695652173913, + "grad_norm": 0.4214000155582382, + "learning_rate": 4.331723027375201e-05, + "loss": 0.5555, + "step": 456 + }, + { + "epoch": 0.6623188405797101, + "grad_norm": 0.36980449511799784, + "learning_rate": 4.329039184111648e-05, + "loss": 0.7021, + "step": 457 + }, + { + "epoch": 0.663768115942029, + "grad_norm": 0.3833858984633924, + "learning_rate": 4.326355340848094e-05, + "loss": 0.5455, + "step": 458 + }, + { + "epoch": 0.6652173913043479, + "grad_norm": 0.3621748626108088, + "learning_rate": 4.323671497584541e-05, + "loss": 0.6451, + "step": 459 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.44501860079574185, + "learning_rate": 4.3209876543209875e-05, + "loss": 0.6063, + "step": 460 + }, + { + "epoch": 0.6681159420289855, + "grad_norm": 0.40740730053072266, + "learning_rate": 4.3183038110574344e-05, + "loss": 0.6471, + "step": 461 + }, + { + "epoch": 0.6695652173913044, + "grad_norm": 0.42454592710002687, + "learning_rate": 4.315619967793881e-05, + "loss": 0.6733, + "step": 462 + }, + { + "epoch": 0.6710144927536232, + "grad_norm": 0.4413724435680676, + "learning_rate": 4.312936124530328e-05, + "loss": 0.5883, + "step": 463 + }, + { + "epoch": 0.672463768115942, + "grad_norm": 0.38871349519437576, + "learning_rate": 4.3102522812667746e-05, + "loss": 0.7363, + "step": 464 + }, + { + "epoch": 0.6739130434782609, + "grad_norm": 0.5312564883836602, + "learning_rate": 4.307568438003221e-05, + "loss": 0.5356, + "step": 465 + }, + { + "epoch": 0.6753623188405797, + "grad_norm": 0.5250394278357895, + "learning_rate": 4.304884594739668e-05, + "loss": 0.6368, + "step": 466 + }, + { + "epoch": 0.6768115942028986, + "grad_norm": 0.5002775405148884, + "learning_rate": 4.302200751476114e-05, + "loss": 0.6285, + "step": 467 + }, + { + "epoch": 0.6782608695652174, + "grad_norm": 0.6010481334870542, + "learning_rate": 4.299516908212561e-05, + "loss": 0.5958, + "step": 468 + }, + { + "epoch": 0.6797101449275362, + "grad_norm": 0.5137320966075457, + "learning_rate": 4.296833064949007e-05, + "loss": 0.6357, + "step": 469 + }, + { + "epoch": 0.6811594202898551, + "grad_norm": 0.7634800458708868, + "learning_rate": 4.294149221685454e-05, + "loss": 0.6266, + "step": 470 + }, + { + "epoch": 0.6826086956521739, + "grad_norm": 0.5412119314366018, + "learning_rate": 4.2914653784219006e-05, + "loss": 0.6257, + "step": 471 + }, + { + "epoch": 0.6840579710144927, + "grad_norm": 0.46005094137484503, + "learning_rate": 4.288781535158347e-05, + "loss": 0.6269, + "step": 472 + }, + { + "epoch": 0.6855072463768116, + "grad_norm": 0.5821703799591006, + "learning_rate": 4.286097691894794e-05, + "loss": 0.6145, + "step": 473 + }, + { + "epoch": 0.6869565217391305, + "grad_norm": 0.4076874787060111, + "learning_rate": 4.28341384863124e-05, + "loss": 0.6048, + "step": 474 + }, + { + "epoch": 0.6884057971014492, + "grad_norm": 0.5026991240254921, + "learning_rate": 4.280730005367687e-05, + "loss": 0.6283, + "step": 475 + }, + { + "epoch": 0.6898550724637681, + "grad_norm": 0.4994831943753748, + "learning_rate": 4.278046162104133e-05, + "loss": 0.5844, + "step": 476 + }, + { + "epoch": 0.691304347826087, + "grad_norm": 0.5923231155149884, + "learning_rate": 4.27536231884058e-05, + "loss": 0.6238, + "step": 477 + }, + { + "epoch": 0.6927536231884058, + "grad_norm": 4.1329790925392755, + "learning_rate": 4.2726784755770265e-05, + "loss": 0.6822, + "step": 478 + }, + { + "epoch": 0.6942028985507246, + "grad_norm": 1.232119030556872, + "learning_rate": 4.2699946323134735e-05, + "loss": 0.6798, + "step": 479 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 0.75778686644872, + "learning_rate": 4.26731078904992e-05, + "loss": 0.5747, + "step": 480 + }, + { + "epoch": 0.6971014492753623, + "grad_norm": 0.5303268834658762, + "learning_rate": 4.264626945786366e-05, + "loss": 0.4669, + "step": 481 + }, + { + "epoch": 0.6985507246376812, + "grad_norm": 0.6978339100881147, + "learning_rate": 4.261943102522813e-05, + "loss": 0.6481, + "step": 482 + }, + { + "epoch": 0.7, + "grad_norm": 0.5390133158454951, + "learning_rate": 4.259259259259259e-05, + "loss": 0.5633, + "step": 483 + }, + { + "epoch": 0.7014492753623188, + "grad_norm": 0.6534947509239906, + "learning_rate": 4.256575415995706e-05, + "loss": 0.5855, + "step": 484 + }, + { + "epoch": 0.7028985507246377, + "grad_norm": 0.6128904403486618, + "learning_rate": 4.2538915727321525e-05, + "loss": 0.65, + "step": 485 + }, + { + "epoch": 0.7043478260869566, + "grad_norm": 0.6224056264214514, + "learning_rate": 4.2512077294685994e-05, + "loss": 0.6994, + "step": 486 + }, + { + "epoch": 0.7057971014492753, + "grad_norm": 2.921316276290605, + "learning_rate": 4.248523886205046e-05, + "loss": 0.6553, + "step": 487 + }, + { + "epoch": 0.7072463768115942, + "grad_norm": 1.094977952559785, + "learning_rate": 4.2458400429414926e-05, + "loss": 0.6761, + "step": 488 + }, + { + "epoch": 0.7086956521739131, + "grad_norm": 0.8497515292463685, + "learning_rate": 4.243156199677939e-05, + "loss": 0.5919, + "step": 489 + }, + { + "epoch": 0.7101449275362319, + "grad_norm": 0.7893564837809413, + "learning_rate": 4.240472356414386e-05, + "loss": 0.6577, + "step": 490 + }, + { + "epoch": 0.7115942028985507, + "grad_norm": 1.1274908966110668, + "learning_rate": 4.237788513150832e-05, + "loss": 0.7219, + "step": 491 + }, + { + "epoch": 0.7130434782608696, + "grad_norm": 0.5753539741053254, + "learning_rate": 4.2351046698872784e-05, + "loss": 0.5748, + "step": 492 + }, + { + "epoch": 0.7144927536231884, + "grad_norm": 0.976724089496432, + "learning_rate": 4.2324208266237254e-05, + "loss": 0.5696, + "step": 493 + }, + { + "epoch": 0.7159420289855073, + "grad_norm": 0.6633654676572153, + "learning_rate": 4.2297369833601716e-05, + "loss": 0.6226, + "step": 494 + }, + { + "epoch": 0.717391304347826, + "grad_norm": 0.9829938269432279, + "learning_rate": 4.2270531400966186e-05, + "loss": 0.6031, + "step": 495 + }, + { + "epoch": 0.7188405797101449, + "grad_norm": 0.5633879991151081, + "learning_rate": 4.224369296833065e-05, + "loss": 0.6086, + "step": 496 + }, + { + "epoch": 0.7202898550724638, + "grad_norm": 0.9430497432801044, + "learning_rate": 4.221685453569512e-05, + "loss": 0.5859, + "step": 497 + }, + { + "epoch": 0.7217391304347827, + "grad_norm": 0.46009916937328854, + "learning_rate": 4.219001610305958e-05, + "loss": 0.6064, + "step": 498 + }, + { + "epoch": 0.7231884057971014, + "grad_norm": 0.7216715180796771, + "learning_rate": 4.216317767042405e-05, + "loss": 0.5473, + "step": 499 + }, + { + "epoch": 0.7246376811594203, + "grad_norm": 0.4509522203864577, + "learning_rate": 4.213633923778851e-05, + "loss": 0.5709, + "step": 500 + }, + { + "epoch": 0.7260869565217392, + "grad_norm": 0.523588042133622, + "learning_rate": 4.210950080515298e-05, + "loss": 0.5996, + "step": 501 + }, + { + "epoch": 0.7275362318840579, + "grad_norm": 0.4624176684479565, + "learning_rate": 4.2082662372517445e-05, + "loss": 0.5835, + "step": 502 + }, + { + "epoch": 0.7289855072463768, + "grad_norm": 0.43623626036112917, + "learning_rate": 4.205582393988191e-05, + "loss": 0.6786, + "step": 503 + }, + { + "epoch": 0.7304347826086957, + "grad_norm": 0.4542767125120218, + "learning_rate": 4.202898550724638e-05, + "loss": 0.6512, + "step": 504 + }, + { + "epoch": 0.7318840579710145, + "grad_norm": 14.137605164626205, + "learning_rate": 4.200214707461084e-05, + "loss": 0.7138, + "step": 505 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.5978196809747577, + "learning_rate": 4.197530864197531e-05, + "loss": 0.6954, + "step": 506 + }, + { + "epoch": 0.7347826086956522, + "grad_norm": 0.47590719779152063, + "learning_rate": 4.194847020933977e-05, + "loss": 0.5772, + "step": 507 + }, + { + "epoch": 0.736231884057971, + "grad_norm": 0.48674586940054604, + "learning_rate": 4.192163177670424e-05, + "loss": 0.6152, + "step": 508 + }, + { + "epoch": 0.7376811594202899, + "grad_norm": 0.5344336837088249, + "learning_rate": 4.1894793344068705e-05, + "loss": 0.5918, + "step": 509 + }, + { + "epoch": 0.7391304347826086, + "grad_norm": 0.4642887974191174, + "learning_rate": 4.1867954911433174e-05, + "loss": 0.6326, + "step": 510 + }, + { + "epoch": 0.7405797101449275, + "grad_norm": 0.5347402718091168, + "learning_rate": 4.1841116478797644e-05, + "loss": 0.5764, + "step": 511 + }, + { + "epoch": 0.7420289855072464, + "grad_norm": 0.5438038315928879, + "learning_rate": 4.181427804616211e-05, + "loss": 0.6002, + "step": 512 + }, + { + "epoch": 0.7434782608695653, + "grad_norm": 0.4825629049018045, + "learning_rate": 4.1787439613526576e-05, + "loss": 0.5937, + "step": 513 + }, + { + "epoch": 0.744927536231884, + "grad_norm": 0.48979088485780425, + "learning_rate": 4.176060118089104e-05, + "loss": 0.6404, + "step": 514 + }, + { + "epoch": 0.7463768115942029, + "grad_norm": 0.4620698383348423, + "learning_rate": 4.173376274825551e-05, + "loss": 0.5397, + "step": 515 + }, + { + "epoch": 0.7478260869565218, + "grad_norm": 0.44176580357824313, + "learning_rate": 4.170692431561997e-05, + "loss": 0.6207, + "step": 516 + }, + { + "epoch": 0.7492753623188406, + "grad_norm": 0.41871753355296315, + "learning_rate": 4.168008588298444e-05, + "loss": 0.6503, + "step": 517 + }, + { + "epoch": 0.7507246376811594, + "grad_norm": 0.47948444268186163, + "learning_rate": 4.1653247450348903e-05, + "loss": 0.563, + "step": 518 + }, + { + "epoch": 0.7521739130434782, + "grad_norm": 0.3822045549344485, + "learning_rate": 4.1626409017713366e-05, + "loss": 0.5774, + "step": 519 + }, + { + "epoch": 0.7536231884057971, + "grad_norm": 0.538431027647131, + "learning_rate": 4.1599570585077836e-05, + "loss": 0.6085, + "step": 520 + }, + { + "epoch": 0.755072463768116, + "grad_norm": 0.35528969092125745, + "learning_rate": 4.15727321524423e-05, + "loss": 0.6009, + "step": 521 + }, + { + "epoch": 0.7565217391304347, + "grad_norm": 0.4515347734731531, + "learning_rate": 4.154589371980677e-05, + "loss": 0.5579, + "step": 522 + }, + { + "epoch": 0.7579710144927536, + "grad_norm": 0.40842988456854906, + "learning_rate": 4.151905528717123e-05, + "loss": 0.6111, + "step": 523 + }, + { + "epoch": 0.7594202898550725, + "grad_norm": 0.4091827511039516, + "learning_rate": 4.14922168545357e-05, + "loss": 0.6287, + "step": 524 + }, + { + "epoch": 0.7608695652173914, + "grad_norm": 0.49858433261609175, + "learning_rate": 4.146537842190016e-05, + "loss": 0.6857, + "step": 525 + }, + { + "epoch": 0.7623188405797101, + "grad_norm": 0.4983638734374692, + "learning_rate": 4.143853998926463e-05, + "loss": 0.6053, + "step": 526 + }, + { + "epoch": 0.763768115942029, + "grad_norm": 0.46242329165053936, + "learning_rate": 4.1411701556629095e-05, + "loss": 0.5745, + "step": 527 + }, + { + "epoch": 0.7652173913043478, + "grad_norm": 0.42992145313121877, + "learning_rate": 4.1384863123993565e-05, + "loss": 0.6056, + "step": 528 + }, + { + "epoch": 0.7666666666666667, + "grad_norm": 0.5733079560388592, + "learning_rate": 4.135802469135803e-05, + "loss": 0.6339, + "step": 529 + }, + { + "epoch": 0.7681159420289855, + "grad_norm": 0.3467646266827582, + "learning_rate": 4.133118625872249e-05, + "loss": 0.6057, + "step": 530 + }, + { + "epoch": 0.7695652173913043, + "grad_norm": 0.4876355404655841, + "learning_rate": 4.130434782608696e-05, + "loss": 0.5334, + "step": 531 + }, + { + "epoch": 0.7710144927536232, + "grad_norm": 0.3993002820075125, + "learning_rate": 4.127750939345142e-05, + "loss": 0.6431, + "step": 532 + }, + { + "epoch": 0.7724637681159421, + "grad_norm": 0.5531641534870261, + "learning_rate": 4.125067096081589e-05, + "loss": 0.5591, + "step": 533 + }, + { + "epoch": 0.7739130434782608, + "grad_norm": 0.44858435855948825, + "learning_rate": 4.1223832528180355e-05, + "loss": 0.5929, + "step": 534 + }, + { + "epoch": 0.7753623188405797, + "grad_norm": 0.6282196147018511, + "learning_rate": 4.1196994095544824e-05, + "loss": 0.5181, + "step": 535 + }, + { + "epoch": 0.7768115942028986, + "grad_norm": 0.3967674108479093, + "learning_rate": 4.117015566290929e-05, + "loss": 0.59, + "step": 536 + }, + { + "epoch": 0.7782608695652173, + "grad_norm": 0.5204142812329497, + "learning_rate": 4.1143317230273756e-05, + "loss": 0.6374, + "step": 537 + }, + { + "epoch": 0.7797101449275362, + "grad_norm": 0.494954976619306, + "learning_rate": 4.111647879763822e-05, + "loss": 0.6296, + "step": 538 + }, + { + "epoch": 0.7811594202898551, + "grad_norm": 0.4706706689193786, + "learning_rate": 4.108964036500268e-05, + "loss": 0.5833, + "step": 539 + }, + { + "epoch": 0.782608695652174, + "grad_norm": 1.512496589807772, + "learning_rate": 4.106280193236715e-05, + "loss": 0.5858, + "step": 540 + }, + { + "epoch": 0.7840579710144927, + "grad_norm": 0.6986263549595149, + "learning_rate": 4.1035963499731614e-05, + "loss": 0.5599, + "step": 541 + }, + { + "epoch": 0.7855072463768116, + "grad_norm": 0.8573296822090767, + "learning_rate": 4.1009125067096084e-05, + "loss": 0.6397, + "step": 542 + }, + { + "epoch": 0.7869565217391304, + "grad_norm": 0.5033595984865323, + "learning_rate": 4.0982286634460546e-05, + "loss": 0.5046, + "step": 543 + }, + { + "epoch": 0.7884057971014493, + "grad_norm": 0.713429803498217, + "learning_rate": 4.0955448201825016e-05, + "loss": 0.491, + "step": 544 + }, + { + "epoch": 0.7898550724637681, + "grad_norm": 0.7848459230690726, + "learning_rate": 4.092860976918948e-05, + "loss": 0.618, + "step": 545 + }, + { + "epoch": 0.7913043478260869, + "grad_norm": 0.844801020655998, + "learning_rate": 4.090177133655395e-05, + "loss": 0.6478, + "step": 546 + }, + { + "epoch": 0.7927536231884058, + "grad_norm": 0.6611299388643612, + "learning_rate": 4.087493290391841e-05, + "loss": 0.5311, + "step": 547 + }, + { + "epoch": 0.7942028985507247, + "grad_norm": 0.6499654815412405, + "learning_rate": 4.084809447128288e-05, + "loss": 0.5961, + "step": 548 + }, + { + "epoch": 0.7956521739130434, + "grad_norm": 0.5252072917572203, + "learning_rate": 4.082125603864734e-05, + "loss": 0.5862, + "step": 549 + }, + { + "epoch": 0.7971014492753623, + "grad_norm": 0.5207296431736637, + "learning_rate": 4.0794417606011806e-05, + "loss": 0.5172, + "step": 550 + }, + { + "epoch": 0.7985507246376812, + "grad_norm": 0.4689572540878984, + "learning_rate": 4.0767579173376275e-05, + "loss": 0.5726, + "step": 551 + }, + { + "epoch": 0.8, + "grad_norm": 0.5458681731232032, + "learning_rate": 4.074074074074074e-05, + "loss": 0.599, + "step": 552 + }, + { + "epoch": 0.8014492753623188, + "grad_norm": 0.38372406832517136, + "learning_rate": 4.071390230810521e-05, + "loss": 0.5514, + "step": 553 + }, + { + "epoch": 0.8028985507246377, + "grad_norm": 0.5094489821646551, + "learning_rate": 4.068706387546967e-05, + "loss": 0.5899, + "step": 554 + }, + { + "epoch": 0.8043478260869565, + "grad_norm": 2.1680852632462835, + "learning_rate": 4.066022544283414e-05, + "loss": 0.6013, + "step": 555 + }, + { + "epoch": 0.8057971014492754, + "grad_norm": 0.5692937335137704, + "learning_rate": 4.06333870101986e-05, + "loss": 0.6152, + "step": 556 + }, + { + "epoch": 0.8072463768115942, + "grad_norm": 0.4703136901533402, + "learning_rate": 4.060654857756307e-05, + "loss": 0.6275, + "step": 557 + }, + { + "epoch": 0.808695652173913, + "grad_norm": 0.5971159611839674, + "learning_rate": 4.057971014492754e-05, + "loss": 0.5077, + "step": 558 + }, + { + "epoch": 0.8101449275362319, + "grad_norm": 1.8068906932840791, + "learning_rate": 4.0552871712292004e-05, + "loss": 0.6552, + "step": 559 + }, + { + "epoch": 0.8115942028985508, + "grad_norm": 0.5449109715967542, + "learning_rate": 4.0526033279656474e-05, + "loss": 0.6109, + "step": 560 + }, + { + "epoch": 0.8130434782608695, + "grad_norm": 0.43388002270079523, + "learning_rate": 4.049919484702094e-05, + "loss": 0.6131, + "step": 561 + }, + { + "epoch": 0.8144927536231884, + "grad_norm": 0.4211682095005935, + "learning_rate": 4.0472356414385406e-05, + "loss": 0.5582, + "step": 562 + }, + { + "epoch": 0.8159420289855073, + "grad_norm": 0.49058073662614093, + "learning_rate": 4.044551798174987e-05, + "loss": 0.5524, + "step": 563 + }, + { + "epoch": 0.8173913043478261, + "grad_norm": 0.4473245845954027, + "learning_rate": 4.041867954911434e-05, + "loss": 0.5604, + "step": 564 + }, + { + "epoch": 0.8188405797101449, + "grad_norm": 0.4356196966368169, + "learning_rate": 4.03918411164788e-05, + "loss": 0.5579, + "step": 565 + }, + { + "epoch": 0.8202898550724638, + "grad_norm": 0.3670371817158889, + "learning_rate": 4.0365002683843264e-05, + "loss": 0.6055, + "step": 566 + }, + { + "epoch": 0.8217391304347826, + "grad_norm": 0.5140778620459774, + "learning_rate": 4.0338164251207733e-05, + "loss": 0.6115, + "step": 567 + }, + { + "epoch": 0.8231884057971014, + "grad_norm": 1.346065214744829, + "learning_rate": 4.0311325818572196e-05, + "loss": 0.6003, + "step": 568 + }, + { + "epoch": 0.8246376811594203, + "grad_norm": 0.8248383138671131, + "learning_rate": 4.0284487385936666e-05, + "loss": 0.7642, + "step": 569 + }, + { + "epoch": 0.8260869565217391, + "grad_norm": 0.5120287581760506, + "learning_rate": 4.025764895330113e-05, + "loss": 0.6164, + "step": 570 + }, + { + "epoch": 0.827536231884058, + "grad_norm": 0.4743646973120087, + "learning_rate": 4.02308105206656e-05, + "loss": 0.549, + "step": 571 + }, + { + "epoch": 0.8289855072463768, + "grad_norm": 3.3210188043275375, + "learning_rate": 4.020397208803006e-05, + "loss": 0.667, + "step": 572 + }, + { + "epoch": 0.8304347826086956, + "grad_norm": 0.6837248648294643, + "learning_rate": 4.017713365539453e-05, + "loss": 0.533, + "step": 573 + }, + { + "epoch": 0.8318840579710145, + "grad_norm": 0.516129522264584, + "learning_rate": 4.015029522275899e-05, + "loss": 0.6341, + "step": 574 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 2.7438937595276647, + "learning_rate": 4.012345679012346e-05, + "loss": 0.6521, + "step": 575 + }, + { + "epoch": 0.8347826086956521, + "grad_norm": 0.8592649758656578, + "learning_rate": 4.0096618357487925e-05, + "loss": 0.5506, + "step": 576 + }, + { + "epoch": 0.836231884057971, + "grad_norm": 1.5555303566525747, + "learning_rate": 4.006977992485239e-05, + "loss": 0.5788, + "step": 577 + }, + { + "epoch": 0.8376811594202899, + "grad_norm": 0.6899087860610227, + "learning_rate": 4.004294149221686e-05, + "loss": 0.5876, + "step": 578 + }, + { + "epoch": 0.8391304347826087, + "grad_norm": 0.7046190555629226, + "learning_rate": 4.001610305958132e-05, + "loss": 0.6044, + "step": 579 + }, + { + "epoch": 0.8405797101449275, + "grad_norm": 0.5740571870652859, + "learning_rate": 3.998926462694579e-05, + "loss": 0.5523, + "step": 580 + }, + { + "epoch": 0.8420289855072464, + "grad_norm": 3.0856739801665896, + "learning_rate": 3.996242619431025e-05, + "loss": 0.6992, + "step": 581 + }, + { + "epoch": 0.8434782608695652, + "grad_norm": 0.6217031879385219, + "learning_rate": 3.993558776167472e-05, + "loss": 0.5843, + "step": 582 + }, + { + "epoch": 0.8449275362318841, + "grad_norm": 0.5855584870859855, + "learning_rate": 3.9908749329039185e-05, + "loss": 0.804, + "step": 583 + }, + { + "epoch": 0.8463768115942029, + "grad_norm": 0.5127730494036674, + "learning_rate": 3.9881910896403654e-05, + "loss": 0.5784, + "step": 584 + }, + { + "epoch": 0.8478260869565217, + "grad_norm": 0.48011376646056936, + "learning_rate": 3.985507246376812e-05, + "loss": 0.5245, + "step": 585 + }, + { + "epoch": 0.8492753623188406, + "grad_norm": 0.5894489075240219, + "learning_rate": 3.982823403113258e-05, + "loss": 0.6293, + "step": 586 + }, + { + "epoch": 0.8507246376811595, + "grad_norm": 0.5378776256572029, + "learning_rate": 3.980139559849705e-05, + "loss": 0.6049, + "step": 587 + }, + { + "epoch": 0.8521739130434782, + "grad_norm": 0.7083934176689143, + "learning_rate": 3.977455716586151e-05, + "loss": 0.6418, + "step": 588 + }, + { + "epoch": 0.8536231884057971, + "grad_norm": 0.6190476116945333, + "learning_rate": 3.974771873322598e-05, + "loss": 0.5592, + "step": 589 + }, + { + "epoch": 0.855072463768116, + "grad_norm": 0.4120642741902257, + "learning_rate": 3.9720880300590444e-05, + "loss": 0.5752, + "step": 590 + }, + { + "epoch": 0.8565217391304348, + "grad_norm": 0.6013798835291985, + "learning_rate": 3.9694041867954914e-05, + "loss": 0.6497, + "step": 591 + }, + { + "epoch": 0.8579710144927536, + "grad_norm": 0.44171429178629873, + "learning_rate": 3.9667203435319376e-05, + "loss": 0.6097, + "step": 592 + }, + { + "epoch": 0.8594202898550725, + "grad_norm": 0.53452983650377, + "learning_rate": 3.9640365002683846e-05, + "loss": 0.6558, + "step": 593 + }, + { + "epoch": 0.8608695652173913, + "grad_norm": 0.4186097201166055, + "learning_rate": 3.961352657004831e-05, + "loss": 0.7236, + "step": 594 + }, + { + "epoch": 0.8623188405797102, + "grad_norm": 0.4645596168717277, + "learning_rate": 3.958668813741278e-05, + "loss": 0.5854, + "step": 595 + }, + { + "epoch": 0.863768115942029, + "grad_norm": 0.43029743789021974, + "learning_rate": 3.955984970477724e-05, + "loss": 0.6233, + "step": 596 + }, + { + "epoch": 0.8652173913043478, + "grad_norm": 0.43292776931294014, + "learning_rate": 3.9533011272141704e-05, + "loss": 0.6439, + "step": 597 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.43423115706910775, + "learning_rate": 3.950617283950617e-05, + "loss": 0.589, + "step": 598 + }, + { + "epoch": 0.8681159420289855, + "grad_norm": 0.4755946805708221, + "learning_rate": 3.9479334406870636e-05, + "loss": 0.586, + "step": 599 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.4341006298363921, + "learning_rate": 3.9452495974235105e-05, + "loss": 0.5903, + "step": 600 + }, + { + "epoch": 0.8710144927536232, + "grad_norm": 0.4718786944262586, + "learning_rate": 3.942565754159957e-05, + "loss": 0.5135, + "step": 601 + }, + { + "epoch": 0.8724637681159421, + "grad_norm": 0.40791241360653085, + "learning_rate": 3.939881910896404e-05, + "loss": 0.6038, + "step": 602 + }, + { + "epoch": 0.8739130434782608, + "grad_norm": 0.5059495818189851, + "learning_rate": 3.937198067632851e-05, + "loss": 0.6354, + "step": 603 + }, + { + "epoch": 0.8753623188405797, + "grad_norm": 0.431048802834448, + "learning_rate": 3.934514224369297e-05, + "loss": 0.6642, + "step": 604 + }, + { + "epoch": 0.8768115942028986, + "grad_norm": 0.4965530971692437, + "learning_rate": 3.931830381105744e-05, + "loss": 0.627, + "step": 605 + }, + { + "epoch": 0.8782608695652174, + "grad_norm": 0.46139203603720497, + "learning_rate": 3.92914653784219e-05, + "loss": 0.6329, + "step": 606 + }, + { + "epoch": 0.8797101449275362, + "grad_norm": 0.42125965185438957, + "learning_rate": 3.926462694578637e-05, + "loss": 0.5932, + "step": 607 + }, + { + "epoch": 0.881159420289855, + "grad_norm": 0.367329901744191, + "learning_rate": 3.9237788513150834e-05, + "loss": 0.6201, + "step": 608 + }, + { + "epoch": 0.8826086956521739, + "grad_norm": 0.4524070054726798, + "learning_rate": 3.9210950080515304e-05, + "loss": 0.5822, + "step": 609 + }, + { + "epoch": 0.8840579710144928, + "grad_norm": 0.467261697364059, + "learning_rate": 3.918411164787977e-05, + "loss": 0.4836, + "step": 610 + }, + { + "epoch": 0.8855072463768116, + "grad_norm": 0.4463128896050495, + "learning_rate": 3.9157273215244236e-05, + "loss": 0.4819, + "step": 611 + }, + { + "epoch": 0.8869565217391304, + "grad_norm": 0.40526007809503917, + "learning_rate": 3.91304347826087e-05, + "loss": 0.551, + "step": 612 + }, + { + "epoch": 0.8884057971014493, + "grad_norm": 0.41385125651458887, + "learning_rate": 3.910359634997316e-05, + "loss": 0.6159, + "step": 613 + }, + { + "epoch": 0.8898550724637682, + "grad_norm": 0.41673986104362226, + "learning_rate": 3.907675791733763e-05, + "loss": 0.6247, + "step": 614 + }, + { + "epoch": 0.8913043478260869, + "grad_norm": 0.36308482982577767, + "learning_rate": 3.9049919484702094e-05, + "loss": 0.6022, + "step": 615 + }, + { + "epoch": 0.8927536231884058, + "grad_norm": 0.38126692153672176, + "learning_rate": 3.9023081052066564e-05, + "loss": 0.6281, + "step": 616 + }, + { + "epoch": 0.8942028985507247, + "grad_norm": 0.33298531744437676, + "learning_rate": 3.8996242619431026e-05, + "loss": 0.5867, + "step": 617 + }, + { + "epoch": 0.8956521739130435, + "grad_norm": 0.3352484634893385, + "learning_rate": 3.8969404186795496e-05, + "loss": 0.5519, + "step": 618 + }, + { + "epoch": 0.8971014492753623, + "grad_norm": 0.37639114214176045, + "learning_rate": 3.894256575415996e-05, + "loss": 0.5195, + "step": 619 + }, + { + "epoch": 0.8985507246376812, + "grad_norm": 0.34048760325095795, + "learning_rate": 3.891572732152443e-05, + "loss": 0.5787, + "step": 620 + }, + { + "epoch": 0.9, + "grad_norm": 0.3832821048268026, + "learning_rate": 3.888888888888889e-05, + "loss": 0.5247, + "step": 621 + }, + { + "epoch": 0.9014492753623189, + "grad_norm": 0.40064194244810836, + "learning_rate": 3.886205045625336e-05, + "loss": 0.6286, + "step": 622 + }, + { + "epoch": 0.9028985507246376, + "grad_norm": 0.40965106020853814, + "learning_rate": 3.883521202361782e-05, + "loss": 0.5611, + "step": 623 + }, + { + "epoch": 0.9043478260869565, + "grad_norm": 0.4952521908568183, + "learning_rate": 3.8808373590982286e-05, + "loss": 0.6228, + "step": 624 + }, + { + "epoch": 0.9057971014492754, + "grad_norm": 0.3165074112635924, + "learning_rate": 3.8781535158346755e-05, + "loss": 0.5923, + "step": 625 + }, + { + "epoch": 0.9072463768115943, + "grad_norm": 0.3593150540525319, + "learning_rate": 3.875469672571122e-05, + "loss": 0.5492, + "step": 626 + }, + { + "epoch": 0.908695652173913, + "grad_norm": 0.30592808729109405, + "learning_rate": 3.872785829307569e-05, + "loss": 0.5315, + "step": 627 + }, + { + "epoch": 0.9101449275362319, + "grad_norm": 0.34550819407901817, + "learning_rate": 3.870101986044015e-05, + "loss": 0.5522, + "step": 628 + }, + { + "epoch": 0.9115942028985508, + "grad_norm": 0.45826140995448045, + "learning_rate": 3.867418142780462e-05, + "loss": 0.5927, + "step": 629 + }, + { + "epoch": 0.9130434782608695, + "grad_norm": 0.4563119771926975, + "learning_rate": 3.864734299516908e-05, + "loss": 0.6512, + "step": 630 + }, + { + "epoch": 0.9144927536231884, + "grad_norm": 0.38702857798816853, + "learning_rate": 3.862050456253355e-05, + "loss": 0.6182, + "step": 631 + }, + { + "epoch": 0.9159420289855073, + "grad_norm": 0.3900901657498249, + "learning_rate": 3.8593666129898015e-05, + "loss": 0.5362, + "step": 632 + }, + { + "epoch": 0.9173913043478261, + "grad_norm": 0.44337248837753757, + "learning_rate": 3.8566827697262484e-05, + "loss": 0.6322, + "step": 633 + }, + { + "epoch": 0.9188405797101449, + "grad_norm": 0.35254524944898624, + "learning_rate": 3.853998926462695e-05, + "loss": 0.5645, + "step": 634 + }, + { + "epoch": 0.9202898550724637, + "grad_norm": 0.42395525642297494, + "learning_rate": 3.851315083199141e-05, + "loss": 0.6362, + "step": 635 + }, + { + "epoch": 0.9217391304347826, + "grad_norm": 0.4361979901795581, + "learning_rate": 3.848631239935588e-05, + "loss": 0.6492, + "step": 636 + }, + { + "epoch": 0.9231884057971015, + "grad_norm": 0.46478955582935616, + "learning_rate": 3.845947396672034e-05, + "loss": 0.5464, + "step": 637 + }, + { + "epoch": 0.9246376811594202, + "grad_norm": 0.4138224106120635, + "learning_rate": 3.843263553408481e-05, + "loss": 0.6005, + "step": 638 + }, + { + "epoch": 0.9260869565217391, + "grad_norm": 0.455808647015972, + "learning_rate": 3.8405797101449274e-05, + "loss": 0.6613, + "step": 639 + }, + { + "epoch": 0.927536231884058, + "grad_norm": 0.427802788147173, + "learning_rate": 3.8378958668813744e-05, + "loss": 0.6763, + "step": 640 + }, + { + "epoch": 0.9289855072463769, + "grad_norm": 0.3811292517396575, + "learning_rate": 3.8352120236178207e-05, + "loss": 0.5262, + "step": 641 + }, + { + "epoch": 0.9304347826086956, + "grad_norm": 0.43191325483841736, + "learning_rate": 3.8325281803542676e-05, + "loss": 0.6173, + "step": 642 + }, + { + "epoch": 0.9318840579710145, + "grad_norm": 0.42403510011211526, + "learning_rate": 3.829844337090714e-05, + "loss": 0.6037, + "step": 643 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.41707900019612226, + "learning_rate": 3.82716049382716e-05, + "loss": 0.6085, + "step": 644 + }, + { + "epoch": 0.9347826086956522, + "grad_norm": 0.3505630178456613, + "learning_rate": 3.824476650563607e-05, + "loss": 0.5158, + "step": 645 + }, + { + "epoch": 0.936231884057971, + "grad_norm": 0.3960236953328549, + "learning_rate": 3.8217928073000534e-05, + "loss": 0.5711, + "step": 646 + }, + { + "epoch": 0.9376811594202898, + "grad_norm": 0.4105260806742507, + "learning_rate": 3.8191089640365e-05, + "loss": 0.5858, + "step": 647 + }, + { + "epoch": 0.9391304347826087, + "grad_norm": 0.33329816188212635, + "learning_rate": 3.8164251207729466e-05, + "loss": 0.6197, + "step": 648 + }, + { + "epoch": 0.9405797101449276, + "grad_norm": 0.44958808906007697, + "learning_rate": 3.8137412775093936e-05, + "loss": 0.6251, + "step": 649 + }, + { + "epoch": 0.9420289855072463, + "grad_norm": 0.3560872934225823, + "learning_rate": 3.8110574342458405e-05, + "loss": 0.5913, + "step": 650 + }, + { + "epoch": 0.9434782608695652, + "grad_norm": 0.3269136684995382, + "learning_rate": 3.808373590982287e-05, + "loss": 0.5154, + "step": 651 + }, + { + "epoch": 0.9449275362318841, + "grad_norm": 0.447727731046666, + "learning_rate": 3.805689747718734e-05, + "loss": 0.6371, + "step": 652 + }, + { + "epoch": 0.946376811594203, + "grad_norm": 0.37864338247741586, + "learning_rate": 3.80300590445518e-05, + "loss": 0.5815, + "step": 653 + }, + { + "epoch": 0.9478260869565217, + "grad_norm": 0.37617318149727297, + "learning_rate": 3.800322061191627e-05, + "loss": 0.5994, + "step": 654 + }, + { + "epoch": 0.9492753623188406, + "grad_norm": 0.37249479290176424, + "learning_rate": 3.797638217928073e-05, + "loss": 0.5855, + "step": 655 + }, + { + "epoch": 0.9507246376811594, + "grad_norm": 0.37349529397279085, + "learning_rate": 3.79495437466452e-05, + "loss": 0.6366, + "step": 656 + }, + { + "epoch": 0.9521739130434783, + "grad_norm": 0.4064309078465553, + "learning_rate": 3.7922705314009665e-05, + "loss": 0.564, + "step": 657 + }, + { + "epoch": 0.9536231884057971, + "grad_norm": 3.2564378472711915, + "learning_rate": 3.7895866881374134e-05, + "loss": 0.6199, + "step": 658 + }, + { + "epoch": 0.9550724637681159, + "grad_norm": 0.4257622394894502, + "learning_rate": 3.78690284487386e-05, + "loss": 0.6179, + "step": 659 + }, + { + "epoch": 0.9565217391304348, + "grad_norm": 0.36079814194969073, + "learning_rate": 3.784219001610306e-05, + "loss": 0.6637, + "step": 660 + }, + { + "epoch": 0.9579710144927536, + "grad_norm": 0.3546407933331195, + "learning_rate": 3.781535158346753e-05, + "loss": 0.562, + "step": 661 + }, + { + "epoch": 0.9594202898550724, + "grad_norm": 0.3380010456467773, + "learning_rate": 3.778851315083199e-05, + "loss": 0.6106, + "step": 662 + }, + { + "epoch": 0.9608695652173913, + "grad_norm": 0.4450413707456908, + "learning_rate": 3.776167471819646e-05, + "loss": 0.5588, + "step": 663 + }, + { + "epoch": 0.9623188405797102, + "grad_norm": 0.3503137981645195, + "learning_rate": 3.7734836285560924e-05, + "loss": 0.5943, + "step": 664 + }, + { + "epoch": 0.9637681159420289, + "grad_norm": 0.4620932728470443, + "learning_rate": 3.7707997852925394e-05, + "loss": 0.665, + "step": 665 + }, + { + "epoch": 0.9652173913043478, + "grad_norm": 0.35511456322514035, + "learning_rate": 3.7681159420289856e-05, + "loss": 0.5885, + "step": 666 + }, + { + "epoch": 0.9666666666666667, + "grad_norm": 0.3381162980326822, + "learning_rate": 3.7654320987654326e-05, + "loss": 0.5543, + "step": 667 + }, + { + "epoch": 0.9681159420289855, + "grad_norm": 0.40527670866170007, + "learning_rate": 3.762748255501879e-05, + "loss": 0.5315, + "step": 668 + }, + { + "epoch": 0.9695652173913043, + "grad_norm": 0.3436291560239095, + "learning_rate": 3.760064412238326e-05, + "loss": 0.5304, + "step": 669 + }, + { + "epoch": 0.9710144927536232, + "grad_norm": 0.44490914407508536, + "learning_rate": 3.757380568974772e-05, + "loss": 0.6451, + "step": 670 + }, + { + "epoch": 0.972463768115942, + "grad_norm": 0.34071363921885245, + "learning_rate": 3.7546967257112184e-05, + "loss": 0.6656, + "step": 671 + }, + { + "epoch": 0.9739130434782609, + "grad_norm": 0.4274060634917464, + "learning_rate": 3.752012882447665e-05, + "loss": 0.5736, + "step": 672 + }, + { + "epoch": 0.9753623188405797, + "grad_norm": 0.44530097858565637, + "learning_rate": 3.7493290391841116e-05, + "loss": 0.6671, + "step": 673 + }, + { + "epoch": 0.9768115942028985, + "grad_norm": 0.4800429391051743, + "learning_rate": 3.7466451959205585e-05, + "loss": 0.665, + "step": 674 + }, + { + "epoch": 0.9782608695652174, + "grad_norm": 0.5048795469607198, + "learning_rate": 3.743961352657005e-05, + "loss": 0.664, + "step": 675 + }, + { + "epoch": 0.9797101449275363, + "grad_norm": 0.41025966609715236, + "learning_rate": 3.741277509393452e-05, + "loss": 0.5653, + "step": 676 + }, + { + "epoch": 0.981159420289855, + "grad_norm": 0.3704821518151503, + "learning_rate": 3.738593666129898e-05, + "loss": 0.5432, + "step": 677 + }, + { + "epoch": 0.9826086956521739, + "grad_norm": 0.33328023269603463, + "learning_rate": 3.735909822866345e-05, + "loss": 0.5077, + "step": 678 + }, + { + "epoch": 0.9840579710144928, + "grad_norm": 0.36267745822386244, + "learning_rate": 3.733225979602791e-05, + "loss": 0.5957, + "step": 679 + }, + { + "epoch": 0.9855072463768116, + "grad_norm": 0.34543971218031955, + "learning_rate": 3.730542136339238e-05, + "loss": 0.5911, + "step": 680 + }, + { + "epoch": 0.9869565217391304, + "grad_norm": 2.5749761718864033, + "learning_rate": 3.7278582930756845e-05, + "loss": 0.5624, + "step": 681 + }, + { + "epoch": 0.9884057971014493, + "grad_norm": 0.4213680538219344, + "learning_rate": 3.725174449812131e-05, + "loss": 0.6305, + "step": 682 + }, + { + "epoch": 0.9898550724637681, + "grad_norm": 0.3503305518340307, + "learning_rate": 3.722490606548578e-05, + "loss": 0.5726, + "step": 683 + }, + { + "epoch": 0.991304347826087, + "grad_norm": 0.47858361426280377, + "learning_rate": 3.719806763285024e-05, + "loss": 0.6518, + "step": 684 + }, + { + "epoch": 0.9927536231884058, + "grad_norm": 0.40841795810020365, + "learning_rate": 3.717122920021471e-05, + "loss": 0.6048, + "step": 685 + }, + { + "epoch": 0.9942028985507246, + "grad_norm": 0.4159168319432099, + "learning_rate": 3.714439076757917e-05, + "loss": 0.6226, + "step": 686 + }, + { + "epoch": 0.9956521739130435, + "grad_norm": 0.353982673397565, + "learning_rate": 3.711755233494364e-05, + "loss": 0.5372, + "step": 687 + }, + { + "epoch": 0.9971014492753624, + "grad_norm": 0.4708236125419223, + "learning_rate": 3.7090713902308104e-05, + "loss": 0.5306, + "step": 688 + }, + { + "epoch": 0.9985507246376811, + "grad_norm": 0.5050034586864125, + "learning_rate": 3.7063875469672574e-05, + "loss": 0.5568, + "step": 689 + }, + { + "epoch": 1.0, + "grad_norm": 0.42561847995684515, + "learning_rate": 3.7037037037037037e-05, + "loss": 0.7395, + "step": 690 + }, + { + "epoch": 1.0014492753623188, + "grad_norm": 0.4141984981902665, + "learning_rate": 3.70101986044015e-05, + "loss": 0.5274, + "step": 691 + }, + { + "epoch": 1.0028985507246377, + "grad_norm": 0.40244657538127293, + "learning_rate": 3.698336017176597e-05, + "loss": 0.5858, + "step": 692 + }, + { + "epoch": 1.0043478260869565, + "grad_norm": 0.37737332184052724, + "learning_rate": 3.695652173913043e-05, + "loss": 0.5323, + "step": 693 + }, + { + "epoch": 1.0057971014492753, + "grad_norm": 0.4104832641924855, + "learning_rate": 3.69296833064949e-05, + "loss": 0.511, + "step": 694 + }, + { + "epoch": 1.0072463768115942, + "grad_norm": 0.43539872914512057, + "learning_rate": 3.6902844873859364e-05, + "loss": 0.5521, + "step": 695 + }, + { + "epoch": 1.008695652173913, + "grad_norm": 0.5216741728865025, + "learning_rate": 3.687600644122383e-05, + "loss": 0.4813, + "step": 696 + }, + { + "epoch": 1.010144927536232, + "grad_norm": 0.4751854447776757, + "learning_rate": 3.68491680085883e-05, + "loss": 0.4749, + "step": 697 + }, + { + "epoch": 1.0115942028985507, + "grad_norm": 0.34316199861048846, + "learning_rate": 3.6822329575952766e-05, + "loss": 0.5377, + "step": 698 + }, + { + "epoch": 1.0130434782608695, + "grad_norm": 0.4058139437725442, + "learning_rate": 3.6795491143317235e-05, + "loss": 0.5336, + "step": 699 + }, + { + "epoch": 1.0144927536231885, + "grad_norm": 0.3171721049552687, + "learning_rate": 3.67686527106817e-05, + "loss": 0.4144, + "step": 700 + }, + { + "epoch": 1.0159420289855072, + "grad_norm": 0.3869454431410363, + "learning_rate": 3.674181427804617e-05, + "loss": 0.5649, + "step": 701 + }, + { + "epoch": 1.017391304347826, + "grad_norm": 0.32055242749315693, + "learning_rate": 3.671497584541063e-05, + "loss": 0.4494, + "step": 702 + }, + { + "epoch": 1.018840579710145, + "grad_norm": 0.3525840569410084, + "learning_rate": 3.66881374127751e-05, + "loss": 0.5125, + "step": 703 + }, + { + "epoch": 1.0202898550724637, + "grad_norm": 0.5341701745257826, + "learning_rate": 3.666129898013956e-05, + "loss": 0.5073, + "step": 704 + }, + { + "epoch": 1.0217391304347827, + "grad_norm": 0.29988900486330294, + "learning_rate": 3.663446054750403e-05, + "loss": 0.4828, + "step": 705 + }, + { + "epoch": 1.0231884057971015, + "grad_norm": 0.33163899918073203, + "learning_rate": 3.6607622114868495e-05, + "loss": 0.5149, + "step": 706 + }, + { + "epoch": 1.0246376811594202, + "grad_norm": 0.2749607995982067, + "learning_rate": 3.6580783682232964e-05, + "loss": 0.4467, + "step": 707 + }, + { + "epoch": 1.0260869565217392, + "grad_norm": 0.335134524186387, + "learning_rate": 3.655394524959743e-05, + "loss": 0.5448, + "step": 708 + }, + { + "epoch": 1.027536231884058, + "grad_norm": 0.336245663515532, + "learning_rate": 3.652710681696189e-05, + "loss": 0.5312, + "step": 709 + }, + { + "epoch": 1.0289855072463767, + "grad_norm": 0.3041401875794213, + "learning_rate": 3.650026838432636e-05, + "loss": 0.456, + "step": 710 + }, + { + "epoch": 1.0304347826086957, + "grad_norm": 0.33112756836747553, + "learning_rate": 3.647342995169082e-05, + "loss": 0.5265, + "step": 711 + }, + { + "epoch": 1.0318840579710145, + "grad_norm": 0.32377857452632586, + "learning_rate": 3.644659151905529e-05, + "loss": 0.5049, + "step": 712 + }, + { + "epoch": 1.0333333333333334, + "grad_norm": 0.35237156346129317, + "learning_rate": 3.6419753086419754e-05, + "loss": 0.509, + "step": 713 + }, + { + "epoch": 1.0347826086956522, + "grad_norm": 0.30061366226048825, + "learning_rate": 3.6392914653784224e-05, + "loss": 0.469, + "step": 714 + }, + { + "epoch": 1.036231884057971, + "grad_norm": 0.35291453847506943, + "learning_rate": 3.6366076221148686e-05, + "loss": 0.5389, + "step": 715 + }, + { + "epoch": 1.03768115942029, + "grad_norm": 0.31133681533501706, + "learning_rate": 3.6339237788513156e-05, + "loss": 0.4586, + "step": 716 + }, + { + "epoch": 1.0391304347826087, + "grad_norm": 0.38873796715326586, + "learning_rate": 3.631239935587762e-05, + "loss": 0.4938, + "step": 717 + }, + { + "epoch": 1.0405797101449274, + "grad_norm": 0.3347900167399755, + "learning_rate": 3.628556092324208e-05, + "loss": 0.4765, + "step": 718 + }, + { + "epoch": 1.0420289855072464, + "grad_norm": 16.79798030704283, + "learning_rate": 3.625872249060655e-05, + "loss": 0.6782, + "step": 719 + }, + { + "epoch": 1.0434782608695652, + "grad_norm": 0.3588584614784706, + "learning_rate": 3.6231884057971014e-05, + "loss": 0.42, + "step": 720 + }, + { + "epoch": 1.0449275362318842, + "grad_norm": 0.3683402353340812, + "learning_rate": 3.620504562533548e-05, + "loss": 0.4768, + "step": 721 + }, + { + "epoch": 1.046376811594203, + "grad_norm": 0.4050960641583972, + "learning_rate": 3.6178207192699946e-05, + "loss": 0.4417, + "step": 722 + }, + { + "epoch": 1.0478260869565217, + "grad_norm": 0.3745285639337781, + "learning_rate": 3.6151368760064415e-05, + "loss": 0.4808, + "step": 723 + }, + { + "epoch": 1.0492753623188407, + "grad_norm": 0.440811714792068, + "learning_rate": 3.612453032742888e-05, + "loss": 0.4576, + "step": 724 + }, + { + "epoch": 1.0507246376811594, + "grad_norm": 0.3644702486058993, + "learning_rate": 3.609769189479335e-05, + "loss": 0.4801, + "step": 725 + }, + { + "epoch": 1.0521739130434782, + "grad_norm": 0.32619631976812075, + "learning_rate": 3.607085346215781e-05, + "loss": 0.4738, + "step": 726 + }, + { + "epoch": 1.0536231884057972, + "grad_norm": 0.4032452494515482, + "learning_rate": 3.604401502952228e-05, + "loss": 0.4632, + "step": 727 + }, + { + "epoch": 1.055072463768116, + "grad_norm": 0.33186477576484597, + "learning_rate": 3.601717659688674e-05, + "loss": 0.473, + "step": 728 + }, + { + "epoch": 1.0565217391304347, + "grad_norm": 0.41265306224613973, + "learning_rate": 3.5990338164251205e-05, + "loss": 0.5297, + "step": 729 + }, + { + "epoch": 1.0579710144927537, + "grad_norm": 0.357811054298638, + "learning_rate": 3.5963499731615675e-05, + "loss": 0.4358, + "step": 730 + }, + { + "epoch": 1.0594202898550724, + "grad_norm": 1.6014513679266071, + "learning_rate": 3.593666129898014e-05, + "loss": 0.4755, + "step": 731 + }, + { + "epoch": 1.0608695652173914, + "grad_norm": 0.386928608403775, + "learning_rate": 3.590982286634461e-05, + "loss": 0.431, + "step": 732 + }, + { + "epoch": 1.0623188405797102, + "grad_norm": 0.3333018164985276, + "learning_rate": 3.588298443370907e-05, + "loss": 0.4761, + "step": 733 + }, + { + "epoch": 1.063768115942029, + "grad_norm": 0.3828708305599461, + "learning_rate": 3.585614600107354e-05, + "loss": 0.5168, + "step": 734 + }, + { + "epoch": 1.065217391304348, + "grad_norm": 0.3330827819992235, + "learning_rate": 3.5829307568438e-05, + "loss": 0.4949, + "step": 735 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 0.3413451946582709, + "learning_rate": 3.580246913580247e-05, + "loss": 0.4477, + "step": 736 + }, + { + "epoch": 1.0681159420289854, + "grad_norm": 0.32611495494902515, + "learning_rate": 3.5775630703166934e-05, + "loss": 0.5093, + "step": 737 + }, + { + "epoch": 1.0695652173913044, + "grad_norm": 0.35968381975521796, + "learning_rate": 3.57487922705314e-05, + "loss": 0.4419, + "step": 738 + }, + { + "epoch": 1.0710144927536231, + "grad_norm": 0.331806860557936, + "learning_rate": 3.5721953837895867e-05, + "loss": 0.447, + "step": 739 + }, + { + "epoch": 1.0724637681159421, + "grad_norm": 0.3202770251387416, + "learning_rate": 3.569511540526033e-05, + "loss": 0.4719, + "step": 740 + }, + { + "epoch": 1.0739130434782609, + "grad_norm": 0.37406348075552714, + "learning_rate": 3.56682769726248e-05, + "loss": 0.524, + "step": 741 + }, + { + "epoch": 1.0753623188405796, + "grad_norm": 0.3333246479182366, + "learning_rate": 3.564143853998926e-05, + "loss": 0.446, + "step": 742 + }, + { + "epoch": 1.0768115942028986, + "grad_norm": 0.34231725200893803, + "learning_rate": 3.561460010735374e-05, + "loss": 0.4321, + "step": 743 + }, + { + "epoch": 1.0782608695652174, + "grad_norm": 0.3914086039213534, + "learning_rate": 3.55877616747182e-05, + "loss": 0.5154, + "step": 744 + }, + { + "epoch": 1.0797101449275361, + "grad_norm": 0.33722219668147707, + "learning_rate": 3.556092324208266e-05, + "loss": 0.5284, + "step": 745 + }, + { + "epoch": 1.0811594202898551, + "grad_norm": 0.31575080298428554, + "learning_rate": 3.553408480944713e-05, + "loss": 0.4803, + "step": 746 + }, + { + "epoch": 1.0826086956521739, + "grad_norm": 0.31987366418089547, + "learning_rate": 3.5507246376811596e-05, + "loss": 0.4612, + "step": 747 + }, + { + "epoch": 1.0840579710144929, + "grad_norm": 0.379863200048738, + "learning_rate": 3.5480407944176065e-05, + "loss": 0.489, + "step": 748 + }, + { + "epoch": 1.0855072463768116, + "grad_norm": 0.2766306075403479, + "learning_rate": 3.545356951154053e-05, + "loss": 0.4586, + "step": 749 + }, + { + "epoch": 1.0869565217391304, + "grad_norm": 0.34487833074900065, + "learning_rate": 3.5426731078905e-05, + "loss": 0.496, + "step": 750 + }, + { + "epoch": 1.0884057971014494, + "grad_norm": 4.906831253016445, + "learning_rate": 3.539989264626946e-05, + "loss": 0.5816, + "step": 751 + }, + { + "epoch": 1.0898550724637681, + "grad_norm": 0.353885455422411, + "learning_rate": 3.537305421363393e-05, + "loss": 0.5382, + "step": 752 + }, + { + "epoch": 1.0913043478260869, + "grad_norm": 0.29502103994366735, + "learning_rate": 3.534621578099839e-05, + "loss": 0.4972, + "step": 753 + }, + { + "epoch": 1.0927536231884059, + "grad_norm": 0.35290823324954934, + "learning_rate": 3.531937734836286e-05, + "loss": 0.5024, + "step": 754 + }, + { + "epoch": 1.0942028985507246, + "grad_norm": 0.2860262112670786, + "learning_rate": 3.5292538915727325e-05, + "loss": 0.4789, + "step": 755 + }, + { + "epoch": 1.0956521739130434, + "grad_norm": 0.3877390248809208, + "learning_rate": 3.526570048309179e-05, + "loss": 0.4886, + "step": 756 + }, + { + "epoch": 1.0971014492753624, + "grad_norm": 0.28709320063852295, + "learning_rate": 3.523886205045626e-05, + "loss": 0.3936, + "step": 757 + }, + { + "epoch": 1.098550724637681, + "grad_norm": 0.34519804599228754, + "learning_rate": 3.521202361782072e-05, + "loss": 0.4594, + "step": 758 + }, + { + "epoch": 1.1, + "grad_norm": 0.30444354880038155, + "learning_rate": 3.518518518518519e-05, + "loss": 0.4198, + "step": 759 + }, + { + "epoch": 1.1014492753623188, + "grad_norm": 0.29409450262132447, + "learning_rate": 3.515834675254965e-05, + "loss": 0.5075, + "step": 760 + }, + { + "epoch": 1.1028985507246376, + "grad_norm": 0.30054572231790977, + "learning_rate": 3.513150831991412e-05, + "loss": 0.4649, + "step": 761 + }, + { + "epoch": 1.1043478260869566, + "grad_norm": 0.2989613936621628, + "learning_rate": 3.5104669887278584e-05, + "loss": 0.4449, + "step": 762 + }, + { + "epoch": 1.1057971014492753, + "grad_norm": 0.3037114090980125, + "learning_rate": 3.5077831454643054e-05, + "loss": 0.4839, + "step": 763 + }, + { + "epoch": 1.107246376811594, + "grad_norm": 0.3031693362845878, + "learning_rate": 3.5050993022007516e-05, + "loss": 0.4157, + "step": 764 + }, + { + "epoch": 1.108695652173913, + "grad_norm": 0.3137122464324766, + "learning_rate": 3.502415458937198e-05, + "loss": 0.4999, + "step": 765 + }, + { + "epoch": 1.1101449275362318, + "grad_norm": 0.35389682558480495, + "learning_rate": 3.499731615673645e-05, + "loss": 0.5107, + "step": 766 + }, + { + "epoch": 1.1115942028985508, + "grad_norm": 0.2895422259138419, + "learning_rate": 3.497047772410091e-05, + "loss": 0.4807, + "step": 767 + }, + { + "epoch": 1.1130434782608696, + "grad_norm": 0.35009638294139317, + "learning_rate": 3.494363929146538e-05, + "loss": 0.4727, + "step": 768 + }, + { + "epoch": 1.1144927536231883, + "grad_norm": 0.3708206656399047, + "learning_rate": 3.4916800858829844e-05, + "loss": 0.5179, + "step": 769 + }, + { + "epoch": 1.1159420289855073, + "grad_norm": 0.28903378564992177, + "learning_rate": 3.488996242619431e-05, + "loss": 0.529, + "step": 770 + }, + { + "epoch": 1.117391304347826, + "grad_norm": 0.39287961390615433, + "learning_rate": 3.4863123993558776e-05, + "loss": 0.468, + "step": 771 + }, + { + "epoch": 1.1188405797101448, + "grad_norm": 0.3796057467781331, + "learning_rate": 3.4836285560923245e-05, + "loss": 0.5136, + "step": 772 + }, + { + "epoch": 1.1202898550724638, + "grad_norm": 0.34554797079886895, + "learning_rate": 3.480944712828771e-05, + "loss": 0.5179, + "step": 773 + }, + { + "epoch": 1.1217391304347826, + "grad_norm": 0.3964474111821127, + "learning_rate": 3.478260869565218e-05, + "loss": 0.421, + "step": 774 + }, + { + "epoch": 1.1231884057971016, + "grad_norm": 0.42365345484348044, + "learning_rate": 3.475577026301664e-05, + "loss": 0.4967, + "step": 775 + }, + { + "epoch": 1.1246376811594203, + "grad_norm": 0.31444964103858697, + "learning_rate": 3.47289318303811e-05, + "loss": 0.5367, + "step": 776 + }, + { + "epoch": 1.126086956521739, + "grad_norm": 0.40977846149532443, + "learning_rate": 3.470209339774557e-05, + "loss": 0.5085, + "step": 777 + }, + { + "epoch": 1.127536231884058, + "grad_norm": 0.4040084864158796, + "learning_rate": 3.4675254965110035e-05, + "loss": 0.5166, + "step": 778 + }, + { + "epoch": 1.1289855072463768, + "grad_norm": 0.33605865897707826, + "learning_rate": 3.4648416532474505e-05, + "loss": 0.5246, + "step": 779 + }, + { + "epoch": 1.1304347826086956, + "grad_norm": 0.3220052982912894, + "learning_rate": 3.462157809983897e-05, + "loss": 0.4491, + "step": 780 + }, + { + "epoch": 1.1318840579710145, + "grad_norm": 0.4366726878890496, + "learning_rate": 3.459473966720344e-05, + "loss": 0.523, + "step": 781 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 0.26621474023466923, + "learning_rate": 3.45679012345679e-05, + "loss": 0.4173, + "step": 782 + }, + { + "epoch": 1.134782608695652, + "grad_norm": 0.39572440477831416, + "learning_rate": 3.454106280193237e-05, + "loss": 0.5299, + "step": 783 + }, + { + "epoch": 1.136231884057971, + "grad_norm": 0.4040603491168308, + "learning_rate": 3.451422436929683e-05, + "loss": 0.4567, + "step": 784 + }, + { + "epoch": 1.1376811594202898, + "grad_norm": 0.32904119897841605, + "learning_rate": 3.44873859366613e-05, + "loss": 0.5497, + "step": 785 + }, + { + "epoch": 1.1391304347826088, + "grad_norm": 0.46278497219882125, + "learning_rate": 3.4460547504025764e-05, + "loss": 0.5304, + "step": 786 + }, + { + "epoch": 1.1405797101449275, + "grad_norm": 3.964868351721275, + "learning_rate": 3.443370907139023e-05, + "loss": 0.6874, + "step": 787 + }, + { + "epoch": 1.1420289855072463, + "grad_norm": 0.4333381760274589, + "learning_rate": 3.44068706387547e-05, + "loss": 0.5262, + "step": 788 + }, + { + "epoch": 1.1434782608695653, + "grad_norm": 0.40342770956293456, + "learning_rate": 3.438003220611916e-05, + "loss": 0.476, + "step": 789 + }, + { + "epoch": 1.144927536231884, + "grad_norm": 0.450118522092737, + "learning_rate": 3.4353193773483636e-05, + "loss": 0.5051, + "step": 790 + }, + { + "epoch": 1.146376811594203, + "grad_norm": 0.38593792951653244, + "learning_rate": 3.43263553408481e-05, + "loss": 0.5204, + "step": 791 + }, + { + "epoch": 1.1478260869565218, + "grad_norm": 0.3198244248233121, + "learning_rate": 3.429951690821256e-05, + "loss": 0.4774, + "step": 792 + }, + { + "epoch": 1.1492753623188405, + "grad_norm": 0.37117111558693916, + "learning_rate": 3.427267847557703e-05, + "loss": 0.4587, + "step": 793 + }, + { + "epoch": 1.1507246376811595, + "grad_norm": 0.3304999838282135, + "learning_rate": 3.4245840042941493e-05, + "loss": 0.5017, + "step": 794 + }, + { + "epoch": 1.1521739130434783, + "grad_norm": 0.32790729982023115, + "learning_rate": 3.421900161030596e-05, + "loss": 0.4553, + "step": 795 + }, + { + "epoch": 1.153623188405797, + "grad_norm": 0.3058444493138007, + "learning_rate": 3.4192163177670426e-05, + "loss": 0.4992, + "step": 796 + }, + { + "epoch": 1.155072463768116, + "grad_norm": 0.3137149776147036, + "learning_rate": 3.4165324745034895e-05, + "loss": 0.4673, + "step": 797 + }, + { + "epoch": 1.1565217391304348, + "grad_norm": 0.3055969447342787, + "learning_rate": 3.413848631239936e-05, + "loss": 0.4489, + "step": 798 + }, + { + "epoch": 1.1579710144927535, + "grad_norm": 0.6552379281280202, + "learning_rate": 3.411164787976383e-05, + "loss": 0.4605, + "step": 799 + }, + { + "epoch": 1.1594202898550725, + "grad_norm": 0.35827545936038846, + "learning_rate": 3.408480944712829e-05, + "loss": 0.5115, + "step": 800 + }, + { + "epoch": 1.1608695652173913, + "grad_norm": 0.3096119862536391, + "learning_rate": 3.405797101449276e-05, + "loss": 0.5188, + "step": 801 + }, + { + "epoch": 1.1623188405797102, + "grad_norm": 0.3746648851546505, + "learning_rate": 3.403113258185722e-05, + "loss": 0.4364, + "step": 802 + }, + { + "epoch": 1.163768115942029, + "grad_norm": 0.37513208247393715, + "learning_rate": 3.4004294149221685e-05, + "loss": 0.4751, + "step": 803 + }, + { + "epoch": 1.1652173913043478, + "grad_norm": 0.4093812220396632, + "learning_rate": 3.3977455716586155e-05, + "loss": 0.4712, + "step": 804 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.2968811522800294, + "learning_rate": 3.395061728395062e-05, + "loss": 0.4673, + "step": 805 + }, + { + "epoch": 1.1681159420289855, + "grad_norm": 0.37760080915918015, + "learning_rate": 3.392377885131509e-05, + "loss": 0.4461, + "step": 806 + }, + { + "epoch": 1.1695652173913043, + "grad_norm": 2.3847172796970564, + "learning_rate": 3.389694041867955e-05, + "loss": 0.531, + "step": 807 + }, + { + "epoch": 1.1710144927536232, + "grad_norm": 0.36180593225677865, + "learning_rate": 3.387010198604402e-05, + "loss": 0.4838, + "step": 808 + }, + { + "epoch": 1.172463768115942, + "grad_norm": 1.5785551257268962, + "learning_rate": 3.384326355340848e-05, + "loss": 0.51, + "step": 809 + }, + { + "epoch": 1.1739130434782608, + "grad_norm": 0.48512277259322567, + "learning_rate": 3.381642512077295e-05, + "loss": 0.5021, + "step": 810 + }, + { + "epoch": 1.1753623188405797, + "grad_norm": 0.3689744132091387, + "learning_rate": 3.3789586688137414e-05, + "loss": 0.496, + "step": 811 + }, + { + "epoch": 1.1768115942028985, + "grad_norm": 0.5468114360082217, + "learning_rate": 3.3762748255501884e-05, + "loss": 0.5209, + "step": 812 + }, + { + "epoch": 1.1782608695652175, + "grad_norm": 0.3789941620658084, + "learning_rate": 3.3735909822866346e-05, + "loss": 0.4426, + "step": 813 + }, + { + "epoch": 1.1797101449275362, + "grad_norm": 0.4720650796187916, + "learning_rate": 3.370907139023081e-05, + "loss": 0.4939, + "step": 814 + }, + { + "epoch": 1.181159420289855, + "grad_norm": 0.4519968405566694, + "learning_rate": 3.368223295759528e-05, + "loss": 0.4975, + "step": 815 + }, + { + "epoch": 1.182608695652174, + "grad_norm": 0.5104332648013641, + "learning_rate": 3.365539452495974e-05, + "loss": 0.491, + "step": 816 + }, + { + "epoch": 1.1840579710144927, + "grad_norm": 0.49738354731794426, + "learning_rate": 3.362855609232421e-05, + "loss": 0.4647, + "step": 817 + }, + { + "epoch": 1.1855072463768117, + "grad_norm": 0.3769921876268249, + "learning_rate": 3.3601717659688674e-05, + "loss": 0.4579, + "step": 818 + }, + { + "epoch": 1.1869565217391305, + "grad_norm": 0.4663245979257391, + "learning_rate": 3.357487922705314e-05, + "loss": 0.5632, + "step": 819 + }, + { + "epoch": 1.1884057971014492, + "grad_norm": 0.37499049303519677, + "learning_rate": 3.3548040794417606e-05, + "loss": 0.549, + "step": 820 + }, + { + "epoch": 1.1898550724637682, + "grad_norm": 0.4473411807255098, + "learning_rate": 3.3521202361782075e-05, + "loss": 0.4996, + "step": 821 + }, + { + "epoch": 1.191304347826087, + "grad_norm": 0.3830440398346099, + "learning_rate": 3.349436392914654e-05, + "loss": 0.4959, + "step": 822 + }, + { + "epoch": 1.1927536231884057, + "grad_norm": 0.4104640920005607, + "learning_rate": 3.3467525496511e-05, + "loss": 0.4994, + "step": 823 + }, + { + "epoch": 1.1942028985507247, + "grad_norm": 0.40985809678787355, + "learning_rate": 3.344068706387547e-05, + "loss": 0.5009, + "step": 824 + }, + { + "epoch": 1.1956521739130435, + "grad_norm": 0.3289639690915926, + "learning_rate": 3.341384863123993e-05, + "loss": 0.4977, + "step": 825 + }, + { + "epoch": 1.1971014492753622, + "grad_norm": 0.3527697395053973, + "learning_rate": 3.33870101986044e-05, + "loss": 0.5708, + "step": 826 + }, + { + "epoch": 1.1985507246376812, + "grad_norm": 0.43443492772244724, + "learning_rate": 3.3360171765968865e-05, + "loss": 0.4473, + "step": 827 + }, + { + "epoch": 1.2, + "grad_norm": 0.337378493207393, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.5505, + "step": 828 + }, + { + "epoch": 1.201449275362319, + "grad_norm": 0.3865226415547385, + "learning_rate": 3.33064949006978e-05, + "loss": 0.5119, + "step": 829 + }, + { + "epoch": 1.2028985507246377, + "grad_norm": 0.34911981266613895, + "learning_rate": 3.327965646806227e-05, + "loss": 0.4573, + "step": 830 + }, + { + "epoch": 1.2043478260869565, + "grad_norm": 0.34586488602301124, + "learning_rate": 3.325281803542673e-05, + "loss": 0.51, + "step": 831 + }, + { + "epoch": 1.2057971014492754, + "grad_norm": 0.42107082760580405, + "learning_rate": 3.32259796027912e-05, + "loss": 0.4662, + "step": 832 + }, + { + "epoch": 1.2072463768115942, + "grad_norm": 0.43549995035939054, + "learning_rate": 3.319914117015566e-05, + "loss": 0.5384, + "step": 833 + }, + { + "epoch": 1.208695652173913, + "grad_norm": 0.3862371394781864, + "learning_rate": 3.3172302737520125e-05, + "loss": 0.5404, + "step": 834 + }, + { + "epoch": 1.210144927536232, + "grad_norm": 0.34958162952048627, + "learning_rate": 3.3145464304884594e-05, + "loss": 0.4774, + "step": 835 + }, + { + "epoch": 1.2115942028985507, + "grad_norm": 0.3821812720253012, + "learning_rate": 3.3118625872249064e-05, + "loss": 0.441, + "step": 836 + }, + { + "epoch": 1.2130434782608694, + "grad_norm": 0.4000142422819388, + "learning_rate": 3.3091787439613533e-05, + "loss": 0.4964, + "step": 837 + }, + { + "epoch": 1.2144927536231884, + "grad_norm": 0.406579430351433, + "learning_rate": 3.3064949006977996e-05, + "loss": 0.4323, + "step": 838 + }, + { + "epoch": 1.2159420289855072, + "grad_norm": 0.3732779725444165, + "learning_rate": 3.3038110574342466e-05, + "loss": 0.4795, + "step": 839 + }, + { + "epoch": 1.2173913043478262, + "grad_norm": 0.3856736552896055, + "learning_rate": 3.301127214170693e-05, + "loss": 0.4587, + "step": 840 + }, + { + "epoch": 1.218840579710145, + "grad_norm": 0.9264139639240327, + "learning_rate": 3.298443370907139e-05, + "loss": 0.5313, + "step": 841 + }, + { + "epoch": 1.2202898550724637, + "grad_norm": 0.41170275194820466, + "learning_rate": 3.295759527643586e-05, + "loss": 0.4814, + "step": 842 + }, + { + "epoch": 1.2217391304347827, + "grad_norm": 0.3640744642554216, + "learning_rate": 3.2930756843800323e-05, + "loss": 0.5029, + "step": 843 + }, + { + "epoch": 1.2231884057971014, + "grad_norm": 0.38012885622626436, + "learning_rate": 3.290391841116479e-05, + "loss": 0.4661, + "step": 844 + }, + { + "epoch": 1.2246376811594204, + "grad_norm": 0.3239294935147369, + "learning_rate": 3.2877079978529256e-05, + "loss": 0.435, + "step": 845 + }, + { + "epoch": 1.2260869565217392, + "grad_norm": 0.34770674570393884, + "learning_rate": 3.2850241545893725e-05, + "loss": 0.4938, + "step": 846 + }, + { + "epoch": 1.227536231884058, + "grad_norm": 0.3313586231488532, + "learning_rate": 3.282340311325819e-05, + "loss": 0.4983, + "step": 847 + }, + { + "epoch": 1.228985507246377, + "grad_norm": 0.3752653873420586, + "learning_rate": 3.279656468062266e-05, + "loss": 0.4987, + "step": 848 + }, + { + "epoch": 1.2304347826086957, + "grad_norm": 0.3241261208367448, + "learning_rate": 3.276972624798712e-05, + "loss": 0.4745, + "step": 849 + }, + { + "epoch": 1.2318840579710144, + "grad_norm": 0.3393488135230325, + "learning_rate": 3.274288781535158e-05, + "loss": 0.433, + "step": 850 + }, + { + "epoch": 1.2333333333333334, + "grad_norm": 0.3503274304144233, + "learning_rate": 3.271604938271605e-05, + "loss": 0.4736, + "step": 851 + }, + { + "epoch": 1.2347826086956522, + "grad_norm": 0.3418505543344525, + "learning_rate": 3.2689210950080515e-05, + "loss": 0.471, + "step": 852 + }, + { + "epoch": 1.236231884057971, + "grad_norm": 0.34202854605765615, + "learning_rate": 3.2662372517444985e-05, + "loss": 0.5153, + "step": 853 + }, + { + "epoch": 1.23768115942029, + "grad_norm": 0.3096582468616831, + "learning_rate": 3.263553408480945e-05, + "loss": 0.4471, + "step": 854 + }, + { + "epoch": 1.2391304347826086, + "grad_norm": 0.36906290878465053, + "learning_rate": 3.260869565217392e-05, + "loss": 0.5084, + "step": 855 + }, + { + "epoch": 1.2405797101449276, + "grad_norm": 0.3236250876491843, + "learning_rate": 3.258185721953838e-05, + "loss": 0.4589, + "step": 856 + }, + { + "epoch": 1.2420289855072464, + "grad_norm": 0.3834374189338398, + "learning_rate": 3.255501878690285e-05, + "loss": 0.5156, + "step": 857 + }, + { + "epoch": 1.2434782608695651, + "grad_norm": 0.3019495775859321, + "learning_rate": 3.252818035426731e-05, + "loss": 0.4715, + "step": 858 + }, + { + "epoch": 1.2449275362318841, + "grad_norm": 0.37003140617652347, + "learning_rate": 3.250134192163178e-05, + "loss": 0.4597, + "step": 859 + }, + { + "epoch": 1.2463768115942029, + "grad_norm": 0.32403513297994646, + "learning_rate": 3.2474503488996244e-05, + "loss": 0.5453, + "step": 860 + }, + { + "epoch": 1.2478260869565219, + "grad_norm": 0.31644292072769686, + "learning_rate": 3.244766505636071e-05, + "loss": 0.485, + "step": 861 + }, + { + "epoch": 1.2492753623188406, + "grad_norm": 0.3193997232817535, + "learning_rate": 3.2420826623725176e-05, + "loss": 0.4498, + "step": 862 + }, + { + "epoch": 1.2507246376811594, + "grad_norm": 0.3288694911299711, + "learning_rate": 3.239398819108964e-05, + "loss": 0.4515, + "step": 863 + }, + { + "epoch": 1.2521739130434781, + "grad_norm": 0.3722851249796339, + "learning_rate": 3.236714975845411e-05, + "loss": 0.4434, + "step": 864 + }, + { + "epoch": 1.2536231884057971, + "grad_norm": 0.35640927389422644, + "learning_rate": 3.234031132581857e-05, + "loss": 0.4788, + "step": 865 + }, + { + "epoch": 1.2550724637681159, + "grad_norm": 0.34416013778138244, + "learning_rate": 3.231347289318304e-05, + "loss": 0.4682, + "step": 866 + }, + { + "epoch": 1.2565217391304349, + "grad_norm": 0.35307468327505803, + "learning_rate": 3.2286634460547504e-05, + "loss": 0.5182, + "step": 867 + }, + { + "epoch": 1.2579710144927536, + "grad_norm": 0.3427936110711446, + "learning_rate": 3.225979602791197e-05, + "loss": 0.4552, + "step": 868 + }, + { + "epoch": 1.2594202898550724, + "grad_norm": 0.2763448846412855, + "learning_rate": 3.2232957595276436e-05, + "loss": 0.4304, + "step": 869 + }, + { + "epoch": 1.2608695652173914, + "grad_norm": 0.379596405325515, + "learning_rate": 3.22061191626409e-05, + "loss": 0.4684, + "step": 870 + }, + { + "epoch": 1.26231884057971, + "grad_norm": 0.3940790425475433, + "learning_rate": 3.217928073000537e-05, + "loss": 0.5502, + "step": 871 + }, + { + "epoch": 1.263768115942029, + "grad_norm": 0.36709229703125856, + "learning_rate": 3.215244229736983e-05, + "loss": 0.4895, + "step": 872 + }, + { + "epoch": 1.2652173913043478, + "grad_norm": 0.3765475900776119, + "learning_rate": 3.21256038647343e-05, + "loss": 0.5219, + "step": 873 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 0.3015095446447829, + "learning_rate": 3.209876543209876e-05, + "loss": 0.4674, + "step": 874 + }, + { + "epoch": 1.2681159420289856, + "grad_norm": 0.3754372901666103, + "learning_rate": 3.207192699946323e-05, + "loss": 0.4681, + "step": 875 + }, + { + "epoch": 1.2695652173913043, + "grad_norm": 0.28275837312472013, + "learning_rate": 3.2045088566827695e-05, + "loss": 0.4487, + "step": 876 + }, + { + "epoch": 1.2710144927536233, + "grad_norm": 0.3935736907949941, + "learning_rate": 3.2018250134192165e-05, + "loss": 0.5029, + "step": 877 + }, + { + "epoch": 1.272463768115942, + "grad_norm": 0.39538277413686385, + "learning_rate": 3.199141170155663e-05, + "loss": 0.4619, + "step": 878 + }, + { + "epoch": 1.2739130434782608, + "grad_norm": 0.3270500133449961, + "learning_rate": 3.19645732689211e-05, + "loss": 0.521, + "step": 879 + }, + { + "epoch": 1.2753623188405796, + "grad_norm": 0.35142103803291763, + "learning_rate": 3.193773483628556e-05, + "loss": 0.4879, + "step": 880 + }, + { + "epoch": 1.2768115942028986, + "grad_norm": 0.3393109748778095, + "learning_rate": 3.191089640365002e-05, + "loss": 0.491, + "step": 881 + }, + { + "epoch": 1.2782608695652173, + "grad_norm": 0.4071844260390824, + "learning_rate": 3.188405797101449e-05, + "loss": 0.4283, + "step": 882 + }, + { + "epoch": 1.2797101449275363, + "grad_norm": 0.3601811827305163, + "learning_rate": 3.185721953837896e-05, + "loss": 0.4632, + "step": 883 + }, + { + "epoch": 1.281159420289855, + "grad_norm": 0.34393463950589037, + "learning_rate": 3.183038110574343e-05, + "loss": 0.5411, + "step": 884 + }, + { + "epoch": 1.2826086956521738, + "grad_norm": 0.3019143069007389, + "learning_rate": 3.1803542673107894e-05, + "loss": 0.4914, + "step": 885 + }, + { + "epoch": 1.2840579710144928, + "grad_norm": 0.31666199554410895, + "learning_rate": 3.1776704240472364e-05, + "loss": 0.407, + "step": 886 + }, + { + "epoch": 1.2855072463768116, + "grad_norm": 0.32452996393481603, + "learning_rate": 3.1749865807836826e-05, + "loss": 0.5102, + "step": 887 + }, + { + "epoch": 1.2869565217391306, + "grad_norm": 0.3820308585272605, + "learning_rate": 3.172302737520129e-05, + "loss": 0.4429, + "step": 888 + }, + { + "epoch": 1.2884057971014493, + "grad_norm": 0.5195137701326371, + "learning_rate": 3.169618894256576e-05, + "loss": 0.4732, + "step": 889 + }, + { + "epoch": 1.289855072463768, + "grad_norm": 0.2936225835669792, + "learning_rate": 3.166935050993022e-05, + "loss": 0.4657, + "step": 890 + }, + { + "epoch": 1.2913043478260868, + "grad_norm": 0.42350866865971254, + "learning_rate": 3.164251207729469e-05, + "loss": 0.486, + "step": 891 + }, + { + "epoch": 1.2927536231884058, + "grad_norm": 0.3370830587110153, + "learning_rate": 3.1615673644659153e-05, + "loss": 0.4732, + "step": 892 + }, + { + "epoch": 1.2942028985507246, + "grad_norm": 0.34306276198036184, + "learning_rate": 3.158883521202362e-05, + "loss": 0.5006, + "step": 893 + }, + { + "epoch": 1.2956521739130435, + "grad_norm": 0.4307823243834926, + "learning_rate": 3.1561996779388086e-05, + "loss": 0.5748, + "step": 894 + }, + { + "epoch": 1.2971014492753623, + "grad_norm": 0.3421827711913848, + "learning_rate": 3.1535158346752555e-05, + "loss": 0.5452, + "step": 895 + }, + { + "epoch": 1.298550724637681, + "grad_norm": 0.35986972590664257, + "learning_rate": 3.150831991411702e-05, + "loss": 0.4928, + "step": 896 + }, + { + "epoch": 1.3, + "grad_norm": 0.4028315305194365, + "learning_rate": 3.148148148148148e-05, + "loss": 0.4893, + "step": 897 + }, + { + "epoch": 1.3014492753623188, + "grad_norm": 0.3198518521418353, + "learning_rate": 3.145464304884595e-05, + "loss": 0.4719, + "step": 898 + }, + { + "epoch": 1.3028985507246378, + "grad_norm": 0.3048101730507823, + "learning_rate": 3.142780461621041e-05, + "loss": 0.497, + "step": 899 + }, + { + "epoch": 1.3043478260869565, + "grad_norm": 0.4497587869956628, + "learning_rate": 3.140096618357488e-05, + "loss": 0.5056, + "step": 900 + }, + { + "epoch": 1.3057971014492753, + "grad_norm": 0.3584956584198498, + "learning_rate": 3.1374127750939345e-05, + "loss": 0.5536, + "step": 901 + }, + { + "epoch": 1.3072463768115943, + "grad_norm": 0.3461500766700808, + "learning_rate": 3.1347289318303815e-05, + "loss": 0.4703, + "step": 902 + }, + { + "epoch": 1.308695652173913, + "grad_norm": 0.8694177600711719, + "learning_rate": 3.132045088566828e-05, + "loss": 0.4411, + "step": 903 + }, + { + "epoch": 1.310144927536232, + "grad_norm": 0.42333283493724055, + "learning_rate": 3.129361245303275e-05, + "loss": 0.4677, + "step": 904 + }, + { + "epoch": 1.3115942028985508, + "grad_norm": 0.41077456941737023, + "learning_rate": 3.126677402039721e-05, + "loss": 0.4789, + "step": 905 + }, + { + "epoch": 1.3130434782608695, + "grad_norm": 0.49701901911869834, + "learning_rate": 3.123993558776168e-05, + "loss": 0.4627, + "step": 906 + }, + { + "epoch": 1.3144927536231883, + "grad_norm": 0.3737725318170062, + "learning_rate": 3.121309715512614e-05, + "loss": 0.5316, + "step": 907 + }, + { + "epoch": 1.3159420289855073, + "grad_norm": 0.37226289942626994, + "learning_rate": 3.1186258722490605e-05, + "loss": 0.5297, + "step": 908 + }, + { + "epoch": 1.317391304347826, + "grad_norm": 0.4429119113697492, + "learning_rate": 3.1159420289855074e-05, + "loss": 0.5067, + "step": 909 + }, + { + "epoch": 1.318840579710145, + "grad_norm": 0.39281951947534194, + "learning_rate": 3.113258185721954e-05, + "loss": 0.5163, + "step": 910 + }, + { + "epoch": 1.3202898550724638, + "grad_norm": 0.36347806131070287, + "learning_rate": 3.1105743424584007e-05, + "loss": 0.5138, + "step": 911 + }, + { + "epoch": 1.3217391304347825, + "grad_norm": 0.37090038432978484, + "learning_rate": 3.107890499194847e-05, + "loss": 0.5067, + "step": 912 + }, + { + "epoch": 1.3231884057971015, + "grad_norm": 0.3399730639972717, + "learning_rate": 3.105206655931294e-05, + "loss": 0.4666, + "step": 913 + }, + { + "epoch": 1.3246376811594203, + "grad_norm": 0.3615568168196394, + "learning_rate": 3.10252281266774e-05, + "loss": 0.5118, + "step": 914 + }, + { + "epoch": 1.3260869565217392, + "grad_norm": 0.3614274613790376, + "learning_rate": 3.099838969404187e-05, + "loss": 0.5104, + "step": 915 + }, + { + "epoch": 1.327536231884058, + "grad_norm": 0.3147554804974595, + "learning_rate": 3.0971551261406334e-05, + "loss": 0.4469, + "step": 916 + }, + { + "epoch": 1.3289855072463768, + "grad_norm": 0.4522346661612457, + "learning_rate": 3.09447128287708e-05, + "loss": 0.4577, + "step": 917 + }, + { + "epoch": 1.3304347826086955, + "grad_norm": 0.32263332874891165, + "learning_rate": 3.0917874396135266e-05, + "loss": 0.4621, + "step": 918 + }, + { + "epoch": 1.3318840579710145, + "grad_norm": 0.2883562120924791, + "learning_rate": 3.089103596349973e-05, + "loss": 0.5105, + "step": 919 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.3179678130564038, + "learning_rate": 3.08641975308642e-05, + "loss": 0.481, + "step": 920 + }, + { + "epoch": 1.3347826086956522, + "grad_norm": 0.28861961579910617, + "learning_rate": 3.083735909822866e-05, + "loss": 0.4943, + "step": 921 + }, + { + "epoch": 1.336231884057971, + "grad_norm": 0.3363186614291933, + "learning_rate": 3.081052066559313e-05, + "loss": 0.4577, + "step": 922 + }, + { + "epoch": 1.3376811594202898, + "grad_norm": 0.3053941389822725, + "learning_rate": 3.078368223295759e-05, + "loss": 0.4812, + "step": 923 + }, + { + "epoch": 1.3391304347826087, + "grad_norm": 0.34395797623076607, + "learning_rate": 3.075684380032206e-05, + "loss": 0.4901, + "step": 924 + }, + { + "epoch": 1.3405797101449275, + "grad_norm": 0.3574862058583271, + "learning_rate": 3.0730005367686526e-05, + "loss": 0.5004, + "step": 925 + }, + { + "epoch": 1.3420289855072465, + "grad_norm": 0.36766498861515995, + "learning_rate": 3.0703166935050995e-05, + "loss": 0.4846, + "step": 926 + }, + { + "epoch": 1.3434782608695652, + "grad_norm": 0.3064419453723753, + "learning_rate": 3.067632850241546e-05, + "loss": 0.4842, + "step": 927 + }, + { + "epoch": 1.344927536231884, + "grad_norm": 0.3364090622983266, + "learning_rate": 3.064949006977992e-05, + "loss": 0.5311, + "step": 928 + }, + { + "epoch": 1.346376811594203, + "grad_norm": 0.3274279374759603, + "learning_rate": 3.06226516371444e-05, + "loss": 0.4851, + "step": 929 + }, + { + "epoch": 1.3478260869565217, + "grad_norm": 0.2826344391252545, + "learning_rate": 3.059581320450886e-05, + "loss": 0.4574, + "step": 930 + }, + { + "epoch": 1.3492753623188407, + "grad_norm": 0.29856410659478516, + "learning_rate": 3.056897477187333e-05, + "loss": 0.4674, + "step": 931 + }, + { + "epoch": 1.3507246376811595, + "grad_norm": 0.3031177440918603, + "learning_rate": 3.054213633923779e-05, + "loss": 0.5107, + "step": 932 + }, + { + "epoch": 1.3521739130434782, + "grad_norm": 0.3190321958011399, + "learning_rate": 3.0515297906602258e-05, + "loss": 0.5476, + "step": 933 + }, + { + "epoch": 1.353623188405797, + "grad_norm": 0.28726674696872556, + "learning_rate": 3.0488459473966724e-05, + "loss": 0.4484, + "step": 934 + }, + { + "epoch": 1.355072463768116, + "grad_norm": 0.34401661423155566, + "learning_rate": 3.046162104133119e-05, + "loss": 0.512, + "step": 935 + }, + { + "epoch": 1.3565217391304347, + "grad_norm": 0.3264713371237452, + "learning_rate": 3.0434782608695656e-05, + "loss": 0.4517, + "step": 936 + }, + { + "epoch": 1.3579710144927537, + "grad_norm": 0.30036423040371935, + "learning_rate": 3.0407944176060122e-05, + "loss": 0.4729, + "step": 937 + }, + { + "epoch": 1.3594202898550725, + "grad_norm": 0.2836159289751039, + "learning_rate": 3.038110574342459e-05, + "loss": 0.3927, + "step": 938 + }, + { + "epoch": 1.3608695652173912, + "grad_norm": 0.35656364715244465, + "learning_rate": 3.035426731078905e-05, + "loss": 0.4858, + "step": 939 + }, + { + "epoch": 1.3623188405797102, + "grad_norm": 0.2611982183438624, + "learning_rate": 3.0327428878153517e-05, + "loss": 0.4202, + "step": 940 + }, + { + "epoch": 1.363768115942029, + "grad_norm": 0.297442808481956, + "learning_rate": 3.0300590445517984e-05, + "loss": 0.5323, + "step": 941 + }, + { + "epoch": 1.365217391304348, + "grad_norm": 0.30599446236692146, + "learning_rate": 3.027375201288245e-05, + "loss": 0.4312, + "step": 942 + }, + { + "epoch": 1.3666666666666667, + "grad_norm": 0.37120178748535404, + "learning_rate": 3.0246913580246916e-05, + "loss": 0.5141, + "step": 943 + }, + { + "epoch": 1.3681159420289855, + "grad_norm": 0.3109213272274342, + "learning_rate": 3.0220075147611382e-05, + "loss": 0.4831, + "step": 944 + }, + { + "epoch": 1.3695652173913042, + "grad_norm": 0.7134990744412109, + "learning_rate": 3.0193236714975848e-05, + "loss": 0.4434, + "step": 945 + }, + { + "epoch": 1.3710144927536232, + "grad_norm": 0.3556483056405202, + "learning_rate": 3.0166398282340314e-05, + "loss": 0.4893, + "step": 946 + }, + { + "epoch": 1.372463768115942, + "grad_norm": 0.3827510724428325, + "learning_rate": 3.013955984970478e-05, + "loss": 0.4244, + "step": 947 + }, + { + "epoch": 1.373913043478261, + "grad_norm": 0.33618272241328867, + "learning_rate": 3.0112721417069246e-05, + "loss": 0.4893, + "step": 948 + }, + { + "epoch": 1.3753623188405797, + "grad_norm": 0.3440259382494713, + "learning_rate": 3.0085882984433713e-05, + "loss": 0.5287, + "step": 949 + }, + { + "epoch": 1.3768115942028984, + "grad_norm": 0.3714305425319723, + "learning_rate": 3.0059044551798175e-05, + "loss": 0.4722, + "step": 950 + }, + { + "epoch": 1.3782608695652174, + "grad_norm": 0.3365930331575947, + "learning_rate": 3.003220611916264e-05, + "loss": 0.4906, + "step": 951 + }, + { + "epoch": 1.3797101449275362, + "grad_norm": 0.3448515268305264, + "learning_rate": 3.0005367686527108e-05, + "loss": 0.4747, + "step": 952 + }, + { + "epoch": 1.3811594202898552, + "grad_norm": 0.3191880755633343, + "learning_rate": 2.9978529253891574e-05, + "loss": 0.4512, + "step": 953 + }, + { + "epoch": 1.382608695652174, + "grad_norm": 0.30747956275576543, + "learning_rate": 2.995169082125604e-05, + "loss": 0.4136, + "step": 954 + }, + { + "epoch": 1.3840579710144927, + "grad_norm": 0.31307014206977685, + "learning_rate": 2.9924852388620506e-05, + "loss": 0.519, + "step": 955 + }, + { + "epoch": 1.3855072463768117, + "grad_norm": 0.3223807954231026, + "learning_rate": 2.9898013955984972e-05, + "loss": 0.4719, + "step": 956 + }, + { + "epoch": 1.3869565217391304, + "grad_norm": 0.30410535043204734, + "learning_rate": 2.9871175523349438e-05, + "loss": 0.607, + "step": 957 + }, + { + "epoch": 1.3884057971014494, + "grad_norm": 0.29856137689060125, + "learning_rate": 2.9844337090713904e-05, + "loss": 0.505, + "step": 958 + }, + { + "epoch": 1.3898550724637682, + "grad_norm": 0.34325249374708355, + "learning_rate": 2.981749865807837e-05, + "loss": 0.5084, + "step": 959 + }, + { + "epoch": 1.391304347826087, + "grad_norm": 0.2846508750212212, + "learning_rate": 2.9790660225442833e-05, + "loss": 0.4798, + "step": 960 + }, + { + "epoch": 1.3927536231884057, + "grad_norm": 0.29384666806977755, + "learning_rate": 2.97638217928073e-05, + "loss": 0.4357, + "step": 961 + }, + { + "epoch": 1.3942028985507247, + "grad_norm": 0.3098793105792094, + "learning_rate": 2.9736983360171765e-05, + "loss": 0.4663, + "step": 962 + }, + { + "epoch": 1.3956521739130434, + "grad_norm": 0.280703970437938, + "learning_rate": 2.971014492753623e-05, + "loss": 0.4517, + "step": 963 + }, + { + "epoch": 1.3971014492753624, + "grad_norm": 0.295897243223529, + "learning_rate": 2.9683306494900698e-05, + "loss": 0.4953, + "step": 964 + }, + { + "epoch": 1.3985507246376812, + "grad_norm": 0.2990637911720898, + "learning_rate": 2.9656468062265164e-05, + "loss": 0.4454, + "step": 965 + }, + { + "epoch": 1.4, + "grad_norm": 0.2841004825626051, + "learning_rate": 2.962962962962963e-05, + "loss": 0.5554, + "step": 966 + }, + { + "epoch": 1.401449275362319, + "grad_norm": 0.3382291317076574, + "learning_rate": 2.9602791196994096e-05, + "loss": 0.5134, + "step": 967 + }, + { + "epoch": 1.4028985507246376, + "grad_norm": 0.2848473660753329, + "learning_rate": 2.9575952764358562e-05, + "loss": 0.4276, + "step": 968 + }, + { + "epoch": 1.4043478260869566, + "grad_norm": 0.30949147729278975, + "learning_rate": 2.9549114331723028e-05, + "loss": 0.4868, + "step": 969 + }, + { + "epoch": 1.4057971014492754, + "grad_norm": 1.375106211694375, + "learning_rate": 2.952227589908749e-05, + "loss": 0.4457, + "step": 970 + }, + { + "epoch": 1.4072463768115941, + "grad_norm": 0.3161686918576888, + "learning_rate": 2.9495437466451957e-05, + "loss": 0.5123, + "step": 971 + }, + { + "epoch": 1.4086956521739131, + "grad_norm": 0.3289921703244623, + "learning_rate": 2.9468599033816423e-05, + "loss": 0.4721, + "step": 972 + }, + { + "epoch": 1.4101449275362319, + "grad_norm": 0.32444685871548057, + "learning_rate": 2.944176060118089e-05, + "loss": 0.5155, + "step": 973 + }, + { + "epoch": 1.4115942028985506, + "grad_norm": 0.34878175924902355, + "learning_rate": 2.9414922168545356e-05, + "loss": 0.4148, + "step": 974 + }, + { + "epoch": 1.4130434782608696, + "grad_norm": 0.6529055435476424, + "learning_rate": 2.938808373590982e-05, + "loss": 0.5577, + "step": 975 + }, + { + "epoch": 1.4144927536231884, + "grad_norm": 0.29726742021900276, + "learning_rate": 2.9361245303274295e-05, + "loss": 0.4579, + "step": 976 + }, + { + "epoch": 1.4159420289855071, + "grad_norm": 0.370721498030071, + "learning_rate": 2.9334406870638757e-05, + "loss": 0.5317, + "step": 977 + }, + { + "epoch": 1.4173913043478261, + "grad_norm": 0.356650197544395, + "learning_rate": 2.9307568438003223e-05, + "loss": 0.446, + "step": 978 + }, + { + "epoch": 1.4188405797101449, + "grad_norm": 0.3052312311447589, + "learning_rate": 2.928073000536769e-05, + "loss": 0.5167, + "step": 979 + }, + { + "epoch": 1.4202898550724639, + "grad_norm": 0.31821019277596285, + "learning_rate": 2.9253891572732156e-05, + "loss": 0.4607, + "step": 980 + }, + { + "epoch": 1.4217391304347826, + "grad_norm": 0.31432739981673774, + "learning_rate": 2.9227053140096622e-05, + "loss": 0.456, + "step": 981 + }, + { + "epoch": 1.4231884057971014, + "grad_norm": 0.3031707155403672, + "learning_rate": 2.9200214707461088e-05, + "loss": 0.5148, + "step": 982 + }, + { + "epoch": 1.4246376811594204, + "grad_norm": 0.2855523267853004, + "learning_rate": 2.9173376274825554e-05, + "loss": 0.5317, + "step": 983 + }, + { + "epoch": 1.4260869565217391, + "grad_norm": 0.2901230981280946, + "learning_rate": 2.914653784219002e-05, + "loss": 0.4524, + "step": 984 + }, + { + "epoch": 1.427536231884058, + "grad_norm": 0.3247998883320533, + "learning_rate": 2.9119699409554486e-05, + "loss": 0.5147, + "step": 985 + }, + { + "epoch": 1.4289855072463769, + "grad_norm": 0.26818811417448163, + "learning_rate": 2.9092860976918952e-05, + "loss": 0.4362, + "step": 986 + }, + { + "epoch": 1.4304347826086956, + "grad_norm": 0.2940552568752322, + "learning_rate": 2.9066022544283415e-05, + "loss": 0.491, + "step": 987 + }, + { + "epoch": 1.4318840579710144, + "grad_norm": 0.3457522998826719, + "learning_rate": 2.903918411164788e-05, + "loss": 0.489, + "step": 988 + }, + { + "epoch": 1.4333333333333333, + "grad_norm": 0.33644944078569905, + "learning_rate": 2.9012345679012347e-05, + "loss": 0.5334, + "step": 989 + }, + { + "epoch": 1.434782608695652, + "grad_norm": 0.34951348063221277, + "learning_rate": 2.8985507246376814e-05, + "loss": 0.5584, + "step": 990 + }, + { + "epoch": 1.436231884057971, + "grad_norm": 0.3619439249364829, + "learning_rate": 2.895866881374128e-05, + "loss": 0.5196, + "step": 991 + }, + { + "epoch": 1.4376811594202898, + "grad_norm": 0.2909575341845204, + "learning_rate": 2.8931830381105746e-05, + "loss": 0.4389, + "step": 992 + }, + { + "epoch": 1.4391304347826086, + "grad_norm": 0.38572710386421416, + "learning_rate": 2.8904991948470212e-05, + "loss": 0.5097, + "step": 993 + }, + { + "epoch": 1.4405797101449276, + "grad_norm": 0.30304027869204375, + "learning_rate": 2.8878153515834678e-05, + "loss": 0.4838, + "step": 994 + }, + { + "epoch": 1.4420289855072463, + "grad_norm": 0.30382150828149235, + "learning_rate": 2.8851315083199144e-05, + "loss": 0.4964, + "step": 995 + }, + { + "epoch": 1.4434782608695653, + "grad_norm": 0.32588305845948534, + "learning_rate": 2.882447665056361e-05, + "loss": 0.5379, + "step": 996 + }, + { + "epoch": 1.444927536231884, + "grad_norm": 0.3469605041772776, + "learning_rate": 2.8797638217928073e-05, + "loss": 0.4791, + "step": 997 + }, + { + "epoch": 1.4463768115942028, + "grad_norm": 0.27111724716871016, + "learning_rate": 2.877079978529254e-05, + "loss": 0.3857, + "step": 998 + }, + { + "epoch": 1.4478260869565218, + "grad_norm": 0.3233826007150532, + "learning_rate": 2.8743961352657005e-05, + "loss": 0.4538, + "step": 999 + }, + { + "epoch": 1.4492753623188406, + "grad_norm": 0.28402937137023476, + "learning_rate": 2.871712292002147e-05, + "loss": 0.3899, + "step": 1000 + }, + { + "epoch": 1.4507246376811596, + "grad_norm": 0.31413076614809343, + "learning_rate": 2.8690284487385938e-05, + "loss": 0.5691, + "step": 1001 + }, + { + "epoch": 1.4521739130434783, + "grad_norm": 0.2943139035596239, + "learning_rate": 2.8663446054750404e-05, + "loss": 0.4505, + "step": 1002 + }, + { + "epoch": 1.453623188405797, + "grad_norm": 0.35202564226634103, + "learning_rate": 2.863660762211487e-05, + "loss": 0.4805, + "step": 1003 + }, + { + "epoch": 1.4550724637681158, + "grad_norm": 0.3473503644422686, + "learning_rate": 2.8609769189479336e-05, + "loss": 0.5192, + "step": 1004 + }, + { + "epoch": 1.4565217391304348, + "grad_norm": 0.3321358690145911, + "learning_rate": 2.8582930756843802e-05, + "loss": 0.498, + "step": 1005 + }, + { + "epoch": 1.4579710144927536, + "grad_norm": 0.3963590731326883, + "learning_rate": 2.8556092324208268e-05, + "loss": 0.4544, + "step": 1006 + }, + { + "epoch": 1.4594202898550726, + "grad_norm": 0.3160377865145725, + "learning_rate": 2.852925389157273e-05, + "loss": 0.48, + "step": 1007 + }, + { + "epoch": 1.4608695652173913, + "grad_norm": 0.3023702273452484, + "learning_rate": 2.8502415458937197e-05, + "loss": 0.4614, + "step": 1008 + }, + { + "epoch": 1.46231884057971, + "grad_norm": 0.32055644945325834, + "learning_rate": 2.8475577026301663e-05, + "loss": 0.4739, + "step": 1009 + }, + { + "epoch": 1.463768115942029, + "grad_norm": 0.363135383170579, + "learning_rate": 2.844873859366613e-05, + "loss": 0.4962, + "step": 1010 + }, + { + "epoch": 1.4652173913043478, + "grad_norm": 0.2860447211182234, + "learning_rate": 2.8421900161030595e-05, + "loss": 0.4877, + "step": 1011 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 0.3349245845217755, + "learning_rate": 2.839506172839506e-05, + "loss": 0.4809, + "step": 1012 + }, + { + "epoch": 1.4681159420289855, + "grad_norm": 0.34913542414472193, + "learning_rate": 2.8368223295759528e-05, + "loss": 0.4457, + "step": 1013 + }, + { + "epoch": 1.4695652173913043, + "grad_norm": 0.2910127806792282, + "learning_rate": 2.8341384863123994e-05, + "loss": 0.5023, + "step": 1014 + }, + { + "epoch": 1.471014492753623, + "grad_norm": 0.2720985633608247, + "learning_rate": 2.831454643048846e-05, + "loss": 0.4375, + "step": 1015 + }, + { + "epoch": 1.472463768115942, + "grad_norm": 0.30354616941545454, + "learning_rate": 2.8287707997852926e-05, + "loss": 0.4965, + "step": 1016 + }, + { + "epoch": 1.4739130434782608, + "grad_norm": 0.3694880111899378, + "learning_rate": 2.826086956521739e-05, + "loss": 0.4917, + "step": 1017 + }, + { + "epoch": 1.4753623188405798, + "grad_norm": 0.3274882343659536, + "learning_rate": 2.8234031132581855e-05, + "loss": 0.4815, + "step": 1018 + }, + { + "epoch": 1.4768115942028985, + "grad_norm": 0.3224830151935816, + "learning_rate": 2.820719269994632e-05, + "loss": 0.5586, + "step": 1019 + }, + { + "epoch": 1.4782608695652173, + "grad_norm": 0.2980296932580383, + "learning_rate": 2.8180354267310787e-05, + "loss": 0.5271, + "step": 1020 + }, + { + "epoch": 1.4797101449275363, + "grad_norm": 0.34507202615871363, + "learning_rate": 2.8153515834675253e-05, + "loss": 0.4796, + "step": 1021 + }, + { + "epoch": 1.481159420289855, + "grad_norm": 0.3101443752712901, + "learning_rate": 2.812667740203972e-05, + "loss": 0.4396, + "step": 1022 + }, + { + "epoch": 1.482608695652174, + "grad_norm": 0.2685688911780887, + "learning_rate": 2.8099838969404192e-05, + "loss": 0.4331, + "step": 1023 + }, + { + "epoch": 1.4840579710144928, + "grad_norm": 0.2898255728895406, + "learning_rate": 2.8073000536768655e-05, + "loss": 0.4414, + "step": 1024 + }, + { + "epoch": 1.4855072463768115, + "grad_norm": 0.3005447183802071, + "learning_rate": 2.804616210413312e-05, + "loss": 0.4675, + "step": 1025 + }, + { + "epoch": 1.4869565217391305, + "grad_norm": 0.2951594502573726, + "learning_rate": 2.8019323671497587e-05, + "loss": 0.5167, + "step": 1026 + }, + { + "epoch": 1.4884057971014493, + "grad_norm": 0.34141270846717114, + "learning_rate": 2.7992485238862053e-05, + "loss": 0.5213, + "step": 1027 + }, + { + "epoch": 1.4898550724637682, + "grad_norm": 0.2961121782295929, + "learning_rate": 2.796564680622652e-05, + "loss": 0.5084, + "step": 1028 + }, + { + "epoch": 1.491304347826087, + "grad_norm": 0.30238719507345607, + "learning_rate": 2.7938808373590986e-05, + "loss": 0.4607, + "step": 1029 + }, + { + "epoch": 1.4927536231884058, + "grad_norm": 0.4133514663264453, + "learning_rate": 2.7911969940955452e-05, + "loss": 0.4958, + "step": 1030 + }, + { + "epoch": 1.4942028985507245, + "grad_norm": 0.2776853903753614, + "learning_rate": 2.7885131508319918e-05, + "loss": 0.4545, + "step": 1031 + }, + { + "epoch": 1.4956521739130435, + "grad_norm": 0.3355782381599797, + "learning_rate": 2.7858293075684384e-05, + "loss": 0.5098, + "step": 1032 + }, + { + "epoch": 1.4971014492753623, + "grad_norm": 0.3375973164794734, + "learning_rate": 2.783145464304885e-05, + "loss": 0.454, + "step": 1033 + }, + { + "epoch": 1.4985507246376812, + "grad_norm": 0.33212025767660713, + "learning_rate": 2.7804616210413313e-05, + "loss": 0.474, + "step": 1034 + }, + { + "epoch": 1.5, + "grad_norm": 0.2933684635973571, + "learning_rate": 2.777777777777778e-05, + "loss": 0.5098, + "step": 1035 + }, + { + "epoch": 1.5014492753623188, + "grad_norm": 0.3001437074236019, + "learning_rate": 2.7750939345142245e-05, + "loss": 0.5155, + "step": 1036 + }, + { + "epoch": 1.5028985507246375, + "grad_norm": 0.31042500910696075, + "learning_rate": 2.772410091250671e-05, + "loss": 0.4794, + "step": 1037 + }, + { + "epoch": 1.5043478260869565, + "grad_norm": 0.29702687455774746, + "learning_rate": 2.7697262479871177e-05, + "loss": 0.4966, + "step": 1038 + }, + { + "epoch": 1.5057971014492755, + "grad_norm": 0.27626007286792775, + "learning_rate": 2.7670424047235644e-05, + "loss": 0.4312, + "step": 1039 + }, + { + "epoch": 1.5072463768115942, + "grad_norm": 0.30248674822166244, + "learning_rate": 2.764358561460011e-05, + "loss": 0.5571, + "step": 1040 + }, + { + "epoch": 1.508695652173913, + "grad_norm": 0.30886810124630376, + "learning_rate": 2.7616747181964576e-05, + "loss": 0.5157, + "step": 1041 + }, + { + "epoch": 1.5101449275362318, + "grad_norm": 0.2766998203983483, + "learning_rate": 2.7589908749329042e-05, + "loss": 0.4802, + "step": 1042 + }, + { + "epoch": 1.5115942028985507, + "grad_norm": 0.2930564064361767, + "learning_rate": 2.7563070316693508e-05, + "loss": 0.4346, + "step": 1043 + }, + { + "epoch": 1.5130434782608697, + "grad_norm": 0.3021299734706425, + "learning_rate": 2.753623188405797e-05, + "loss": 0.424, + "step": 1044 + }, + { + "epoch": 1.5144927536231885, + "grad_norm": 0.300297304057552, + "learning_rate": 2.7509393451422437e-05, + "loss": 0.5042, + "step": 1045 + }, + { + "epoch": 1.5159420289855072, + "grad_norm": 0.32251092016584043, + "learning_rate": 2.7482555018786903e-05, + "loss": 0.5228, + "step": 1046 + }, + { + "epoch": 1.517391304347826, + "grad_norm": 0.28027744804448423, + "learning_rate": 2.745571658615137e-05, + "loss": 0.4005, + "step": 1047 + }, + { + "epoch": 1.518840579710145, + "grad_norm": 0.35068102203668, + "learning_rate": 2.7428878153515835e-05, + "loss": 0.4543, + "step": 1048 + }, + { + "epoch": 1.5202898550724637, + "grad_norm": 0.3519909596862909, + "learning_rate": 2.74020397208803e-05, + "loss": 0.4308, + "step": 1049 + }, + { + "epoch": 1.5217391304347827, + "grad_norm": 0.3279603569621184, + "learning_rate": 2.7375201288244768e-05, + "loss": 0.4563, + "step": 1050 + }, + { + "epoch": 1.5231884057971015, + "grad_norm": 0.3123333826206131, + "learning_rate": 2.7348362855609234e-05, + "loss": 0.4796, + "step": 1051 + }, + { + "epoch": 1.5246376811594202, + "grad_norm": 0.3786082929605651, + "learning_rate": 2.73215244229737e-05, + "loss": 0.4699, + "step": 1052 + }, + { + "epoch": 1.526086956521739, + "grad_norm": 0.2987854198643322, + "learning_rate": 2.7294685990338166e-05, + "loss": 0.5033, + "step": 1053 + }, + { + "epoch": 1.527536231884058, + "grad_norm": 0.2856444661663017, + "learning_rate": 2.7267847557702632e-05, + "loss": 0.5121, + "step": 1054 + }, + { + "epoch": 1.528985507246377, + "grad_norm": 0.34233136806927067, + "learning_rate": 2.7241009125067095e-05, + "loss": 0.5266, + "step": 1055 + }, + { + "epoch": 1.5304347826086957, + "grad_norm": 0.2925490260180701, + "learning_rate": 2.721417069243156e-05, + "loss": 0.5282, + "step": 1056 + }, + { + "epoch": 1.5318840579710145, + "grad_norm": 0.28624250953418484, + "learning_rate": 2.7187332259796027e-05, + "loss": 0.4228, + "step": 1057 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.36892271205960986, + "learning_rate": 2.7160493827160493e-05, + "loss": 0.5322, + "step": 1058 + }, + { + "epoch": 1.5347826086956522, + "grad_norm": 0.31099299091219945, + "learning_rate": 2.713365539452496e-05, + "loss": 0.5231, + "step": 1059 + }, + { + "epoch": 1.5362318840579712, + "grad_norm": 0.3384470762863493, + "learning_rate": 2.7106816961889425e-05, + "loss": 0.5012, + "step": 1060 + }, + { + "epoch": 1.53768115942029, + "grad_norm": 0.30210082438884567, + "learning_rate": 2.707997852925389e-05, + "loss": 0.4853, + "step": 1061 + }, + { + "epoch": 1.5391304347826087, + "grad_norm": 0.3110417076499207, + "learning_rate": 2.7053140096618358e-05, + "loss": 0.5261, + "step": 1062 + }, + { + "epoch": 1.5405797101449274, + "grad_norm": 1.9310222543387976, + "learning_rate": 2.7026301663982824e-05, + "loss": 0.5916, + "step": 1063 + }, + { + "epoch": 1.5420289855072464, + "grad_norm": 0.3433386114056728, + "learning_rate": 2.699946323134729e-05, + "loss": 0.4571, + "step": 1064 + }, + { + "epoch": 1.5434782608695652, + "grad_norm": 0.3309906667592126, + "learning_rate": 2.6972624798711753e-05, + "loss": 0.4688, + "step": 1065 + }, + { + "epoch": 1.5449275362318842, + "grad_norm": 0.29072863860011616, + "learning_rate": 2.694578636607622e-05, + "loss": 0.4636, + "step": 1066 + }, + { + "epoch": 1.546376811594203, + "grad_norm": 0.303646521753606, + "learning_rate": 2.6918947933440685e-05, + "loss": 0.5115, + "step": 1067 + }, + { + "epoch": 1.5478260869565217, + "grad_norm": 0.34580275744019917, + "learning_rate": 2.689210950080515e-05, + "loss": 0.5092, + "step": 1068 + }, + { + "epoch": 1.5492753623188404, + "grad_norm": 0.3385141331273027, + "learning_rate": 2.6865271068169624e-05, + "loss": 0.4684, + "step": 1069 + }, + { + "epoch": 1.5507246376811594, + "grad_norm": 0.35919253427363557, + "learning_rate": 2.683843263553409e-05, + "loss": 0.525, + "step": 1070 + }, + { + "epoch": 1.5521739130434784, + "grad_norm": 0.3464418349850528, + "learning_rate": 2.6811594202898553e-05, + "loss": 0.5036, + "step": 1071 + }, + { + "epoch": 1.5536231884057972, + "grad_norm": 0.3482073902469133, + "learning_rate": 2.678475577026302e-05, + "loss": 0.4991, + "step": 1072 + }, + { + "epoch": 1.555072463768116, + "grad_norm": 0.3343119764801353, + "learning_rate": 2.6757917337627485e-05, + "loss": 0.5615, + "step": 1073 + }, + { + "epoch": 1.5565217391304347, + "grad_norm": 0.29222849822164526, + "learning_rate": 2.673107890499195e-05, + "loss": 0.4951, + "step": 1074 + }, + { + "epoch": 1.5579710144927537, + "grad_norm": 0.31368101687605016, + "learning_rate": 2.6704240472356417e-05, + "loss": 0.4332, + "step": 1075 + }, + { + "epoch": 1.5594202898550724, + "grad_norm": 0.31563321476585815, + "learning_rate": 2.6677402039720884e-05, + "loss": 0.4807, + "step": 1076 + }, + { + "epoch": 1.5608695652173914, + "grad_norm": 0.2771632028980185, + "learning_rate": 2.665056360708535e-05, + "loss": 0.4293, + "step": 1077 + }, + { + "epoch": 1.5623188405797102, + "grad_norm": 0.35885070515840845, + "learning_rate": 2.6623725174449816e-05, + "loss": 0.491, + "step": 1078 + }, + { + "epoch": 1.563768115942029, + "grad_norm": 0.45855129836544456, + "learning_rate": 2.6596886741814282e-05, + "loss": 0.506, + "step": 1079 + }, + { + "epoch": 1.5652173913043477, + "grad_norm": 0.31151851560502486, + "learning_rate": 2.6570048309178748e-05, + "loss": 0.4129, + "step": 1080 + }, + { + "epoch": 1.5666666666666667, + "grad_norm": 0.3940707203914161, + "learning_rate": 2.654320987654321e-05, + "loss": 0.4713, + "step": 1081 + }, + { + "epoch": 1.5681159420289856, + "grad_norm": 0.29630838485277544, + "learning_rate": 2.6516371443907677e-05, + "loss": 0.4597, + "step": 1082 + }, + { + "epoch": 1.5695652173913044, + "grad_norm": 1.3737628784009406, + "learning_rate": 2.6489533011272143e-05, + "loss": 0.5068, + "step": 1083 + }, + { + "epoch": 1.5710144927536231, + "grad_norm": 0.46680947803437023, + "learning_rate": 2.646269457863661e-05, + "loss": 0.4453, + "step": 1084 + }, + { + "epoch": 1.572463768115942, + "grad_norm": 0.32836072314477216, + "learning_rate": 2.6435856146001075e-05, + "loss": 0.5094, + "step": 1085 + }, + { + "epoch": 1.5739130434782609, + "grad_norm": 0.30537238692197544, + "learning_rate": 2.640901771336554e-05, + "loss": 0.4709, + "step": 1086 + }, + { + "epoch": 1.5753623188405799, + "grad_norm": 0.4136853222401632, + "learning_rate": 2.6382179280730008e-05, + "loss": 0.4336, + "step": 1087 + }, + { + "epoch": 1.5768115942028986, + "grad_norm": 0.3865605698775386, + "learning_rate": 2.6355340848094474e-05, + "loss": 0.4848, + "step": 1088 + }, + { + "epoch": 1.5782608695652174, + "grad_norm": 0.36430998727945507, + "learning_rate": 2.632850241545894e-05, + "loss": 0.5249, + "step": 1089 + }, + { + "epoch": 1.5797101449275361, + "grad_norm": 0.47223614909730766, + "learning_rate": 2.6301663982823406e-05, + "loss": 0.5398, + "step": 1090 + }, + { + "epoch": 1.5811594202898551, + "grad_norm": 1.7291963265409254, + "learning_rate": 2.6274825550187872e-05, + "loss": 0.5247, + "step": 1091 + }, + { + "epoch": 1.5826086956521739, + "grad_norm": 0.35052147672518846, + "learning_rate": 2.6247987117552335e-05, + "loss": 0.47, + "step": 1092 + }, + { + "epoch": 1.5840579710144929, + "grad_norm": 0.37445221619061947, + "learning_rate": 2.62211486849168e-05, + "loss": 0.4582, + "step": 1093 + }, + { + "epoch": 1.5855072463768116, + "grad_norm": 0.40176853170877347, + "learning_rate": 2.6194310252281267e-05, + "loss": 0.5314, + "step": 1094 + }, + { + "epoch": 1.5869565217391304, + "grad_norm": 0.3129420729765643, + "learning_rate": 2.6167471819645733e-05, + "loss": 0.5057, + "step": 1095 + }, + { + "epoch": 1.5884057971014491, + "grad_norm": 0.3668781780261487, + "learning_rate": 2.61406333870102e-05, + "loss": 0.4761, + "step": 1096 + }, + { + "epoch": 1.5898550724637681, + "grad_norm": 0.3328390730485344, + "learning_rate": 2.6113794954374665e-05, + "loss": 0.5399, + "step": 1097 + }, + { + "epoch": 1.591304347826087, + "grad_norm": 0.34513824383552366, + "learning_rate": 2.608695652173913e-05, + "loss": 0.4, + "step": 1098 + }, + { + "epoch": 1.5927536231884059, + "grad_norm": 0.32500164182044994, + "learning_rate": 2.6060118089103598e-05, + "loss": 0.4751, + "step": 1099 + }, + { + "epoch": 1.5942028985507246, + "grad_norm": 0.2762870180657687, + "learning_rate": 2.6033279656468064e-05, + "loss": 0.5027, + "step": 1100 + }, + { + "epoch": 1.5956521739130434, + "grad_norm": 0.31612062911505995, + "learning_rate": 2.600644122383253e-05, + "loss": 0.4754, + "step": 1101 + }, + { + "epoch": 1.5971014492753624, + "grad_norm": 0.2938548583813651, + "learning_rate": 2.5979602791196993e-05, + "loss": 0.4709, + "step": 1102 + }, + { + "epoch": 1.598550724637681, + "grad_norm": 0.26715822202118394, + "learning_rate": 2.595276435856146e-05, + "loss": 0.4444, + "step": 1103 + }, + { + "epoch": 1.6, + "grad_norm": 0.3416266894284156, + "learning_rate": 2.5925925925925925e-05, + "loss": 0.5503, + "step": 1104 + }, + { + "epoch": 1.6014492753623188, + "grad_norm": 0.3216038263370538, + "learning_rate": 2.589908749329039e-05, + "loss": 0.5112, + "step": 1105 + }, + { + "epoch": 1.6028985507246376, + "grad_norm": 0.27503545942301794, + "learning_rate": 2.5872249060654857e-05, + "loss": 0.462, + "step": 1106 + }, + { + "epoch": 1.6043478260869564, + "grad_norm": 0.36952921169438074, + "learning_rate": 2.5845410628019323e-05, + "loss": 0.5123, + "step": 1107 + }, + { + "epoch": 1.6057971014492753, + "grad_norm": 0.3117300813713928, + "learning_rate": 2.581857219538379e-05, + "loss": 0.5011, + "step": 1108 + }, + { + "epoch": 1.6072463768115943, + "grad_norm": 0.2992474583613941, + "learning_rate": 2.5791733762748256e-05, + "loss": 0.5008, + "step": 1109 + }, + { + "epoch": 1.608695652173913, + "grad_norm": 0.3014995756227707, + "learning_rate": 2.576489533011272e-05, + "loss": 0.4762, + "step": 1110 + }, + { + "epoch": 1.6101449275362318, + "grad_norm": 0.35343798929006137, + "learning_rate": 2.5738056897477188e-05, + "loss": 0.5262, + "step": 1111 + }, + { + "epoch": 1.6115942028985506, + "grad_norm": 0.2577183024889789, + "learning_rate": 2.571121846484165e-05, + "loss": 0.4279, + "step": 1112 + }, + { + "epoch": 1.6130434782608696, + "grad_norm": 0.3034987372845954, + "learning_rate": 2.5684380032206117e-05, + "loss": 0.5026, + "step": 1113 + }, + { + "epoch": 1.6144927536231886, + "grad_norm": 0.33673464929484864, + "learning_rate": 2.5657541599570583e-05, + "loss": 0.4862, + "step": 1114 + }, + { + "epoch": 1.6159420289855073, + "grad_norm": 0.3384160427246048, + "learning_rate": 2.563070316693505e-05, + "loss": 0.5445, + "step": 1115 + }, + { + "epoch": 1.617391304347826, + "grad_norm": 0.32413449758544177, + "learning_rate": 2.5603864734299522e-05, + "loss": 0.4661, + "step": 1116 + }, + { + "epoch": 1.6188405797101448, + "grad_norm": 0.3144244703969793, + "learning_rate": 2.5577026301663988e-05, + "loss": 0.4783, + "step": 1117 + }, + { + "epoch": 1.6202898550724638, + "grad_norm": 0.35066050252787756, + "learning_rate": 2.5550187869028454e-05, + "loss": 0.4951, + "step": 1118 + }, + { + "epoch": 1.6217391304347826, + "grad_norm": 0.27519048555942127, + "learning_rate": 2.5523349436392917e-05, + "loss": 0.4505, + "step": 1119 + }, + { + "epoch": 1.6231884057971016, + "grad_norm": 0.3545251856066128, + "learning_rate": 2.5496511003757383e-05, + "loss": 0.5553, + "step": 1120 + }, + { + "epoch": 1.6246376811594203, + "grad_norm": 0.32213559162233424, + "learning_rate": 2.546967257112185e-05, + "loss": 0.5033, + "step": 1121 + }, + { + "epoch": 1.626086956521739, + "grad_norm": 0.32983910484047146, + "learning_rate": 2.5442834138486315e-05, + "loss": 0.4894, + "step": 1122 + }, + { + "epoch": 1.6275362318840578, + "grad_norm": 0.3202291330759991, + "learning_rate": 2.541599570585078e-05, + "loss": 0.5531, + "step": 1123 + }, + { + "epoch": 1.6289855072463768, + "grad_norm": 0.32166747117547634, + "learning_rate": 2.5389157273215247e-05, + "loss": 0.4613, + "step": 1124 + }, + { + "epoch": 1.6304347826086958, + "grad_norm": 0.2886548591988718, + "learning_rate": 2.5362318840579714e-05, + "loss": 0.4716, + "step": 1125 + }, + { + "epoch": 1.6318840579710145, + "grad_norm": 0.40006321648929954, + "learning_rate": 2.533548040794418e-05, + "loss": 0.5574, + "step": 1126 + }, + { + "epoch": 1.6333333333333333, + "grad_norm": 0.3145464195782174, + "learning_rate": 2.5308641975308646e-05, + "loss": 0.4951, + "step": 1127 + }, + { + "epoch": 1.634782608695652, + "grad_norm": 0.3053615646973681, + "learning_rate": 2.5281803542673112e-05, + "loss": 0.4936, + "step": 1128 + }, + { + "epoch": 1.636231884057971, + "grad_norm": 0.333201209211236, + "learning_rate": 2.5254965110037575e-05, + "loss": 0.4611, + "step": 1129 + }, + { + "epoch": 1.6376811594202898, + "grad_norm": 0.2851998465250763, + "learning_rate": 2.522812667740204e-05, + "loss": 0.4738, + "step": 1130 + }, + { + "epoch": 1.6391304347826088, + "grad_norm": 0.32565555228536963, + "learning_rate": 2.5201288244766507e-05, + "loss": 0.5086, + "step": 1131 + }, + { + "epoch": 1.6405797101449275, + "grad_norm": 0.3218307726528686, + "learning_rate": 2.5174449812130973e-05, + "loss": 0.4572, + "step": 1132 + }, + { + "epoch": 1.6420289855072463, + "grad_norm": 0.33430033216292593, + "learning_rate": 2.514761137949544e-05, + "loss": 0.449, + "step": 1133 + }, + { + "epoch": 1.643478260869565, + "grad_norm": 0.27199374077107724, + "learning_rate": 2.5120772946859905e-05, + "loss": 0.4553, + "step": 1134 + }, + { + "epoch": 1.644927536231884, + "grad_norm": 0.3023340056172451, + "learning_rate": 2.509393451422437e-05, + "loss": 0.544, + "step": 1135 + }, + { + "epoch": 1.646376811594203, + "grad_norm": 0.2613098320681714, + "learning_rate": 2.5067096081588838e-05, + "loss": 0.4587, + "step": 1136 + }, + { + "epoch": 1.6478260869565218, + "grad_norm": 0.2974510036323834, + "learning_rate": 2.5040257648953304e-05, + "loss": 0.5263, + "step": 1137 + }, + { + "epoch": 1.6492753623188405, + "grad_norm": 0.2940780318124218, + "learning_rate": 2.501341921631777e-05, + "loss": 0.5851, + "step": 1138 + }, + { + "epoch": 1.6507246376811593, + "grad_norm": 0.27179550390114365, + "learning_rate": 2.4986580783682233e-05, + "loss": 0.5108, + "step": 1139 + }, + { + "epoch": 1.6521739130434783, + "grad_norm": 0.2810015927705504, + "learning_rate": 2.49597423510467e-05, + "loss": 0.4957, + "step": 1140 + }, + { + "epoch": 1.6536231884057973, + "grad_norm": 0.45762102953727496, + "learning_rate": 2.4932903918411165e-05, + "loss": 0.4951, + "step": 1141 + }, + { + "epoch": 1.655072463768116, + "grad_norm": 0.2904087653620613, + "learning_rate": 2.490606548577563e-05, + "loss": 0.4773, + "step": 1142 + }, + { + "epoch": 1.6565217391304348, + "grad_norm": 0.3015137691314487, + "learning_rate": 2.4879227053140097e-05, + "loss": 0.4079, + "step": 1143 + }, + { + "epoch": 1.6579710144927535, + "grad_norm": 0.28317996532363227, + "learning_rate": 2.4852388620504563e-05, + "loss": 0.5032, + "step": 1144 + }, + { + "epoch": 1.6594202898550725, + "grad_norm": 0.2672615381516704, + "learning_rate": 2.482555018786903e-05, + "loss": 0.4898, + "step": 1145 + }, + { + "epoch": 1.6608695652173913, + "grad_norm": 0.2966522019850434, + "learning_rate": 2.4798711755233495e-05, + "loss": 0.4761, + "step": 1146 + }, + { + "epoch": 1.6623188405797102, + "grad_norm": 0.3239107334984002, + "learning_rate": 2.477187332259796e-05, + "loss": 0.4442, + "step": 1147 + }, + { + "epoch": 1.663768115942029, + "grad_norm": 0.31042620564122636, + "learning_rate": 2.4745034889962428e-05, + "loss": 0.4839, + "step": 1148 + }, + { + "epoch": 1.6652173913043478, + "grad_norm": 0.35048458578602115, + "learning_rate": 2.471819645732689e-05, + "loss": 0.528, + "step": 1149 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.2980529297686646, + "learning_rate": 2.4691358024691357e-05, + "loss": 0.4833, + "step": 1150 + }, + { + "epoch": 1.6681159420289855, + "grad_norm": 0.3672928132817581, + "learning_rate": 2.4664519592055826e-05, + "loss": 0.4872, + "step": 1151 + }, + { + "epoch": 1.6695652173913045, + "grad_norm": 0.2820395944630137, + "learning_rate": 2.4637681159420292e-05, + "loss": 0.4809, + "step": 1152 + }, + { + "epoch": 1.6710144927536232, + "grad_norm": 0.31450043773062825, + "learning_rate": 2.461084272678476e-05, + "loss": 0.4956, + "step": 1153 + }, + { + "epoch": 1.672463768115942, + "grad_norm": 0.30192432417594417, + "learning_rate": 2.4584004294149224e-05, + "loss": 0.4635, + "step": 1154 + }, + { + "epoch": 1.6739130434782608, + "grad_norm": 0.27676854093728326, + "learning_rate": 2.455716586151369e-05, + "loss": 0.4223, + "step": 1155 + }, + { + "epoch": 1.6753623188405797, + "grad_norm": 0.30919443229083027, + "learning_rate": 2.4530327428878157e-05, + "loss": 0.5351, + "step": 1156 + }, + { + "epoch": 1.6768115942028987, + "grad_norm": 0.29361155971628206, + "learning_rate": 2.4503488996242623e-05, + "loss": 0.5295, + "step": 1157 + }, + { + "epoch": 1.6782608695652175, + "grad_norm": 0.3047886425873606, + "learning_rate": 2.4476650563607086e-05, + "loss": 0.4427, + "step": 1158 + }, + { + "epoch": 1.6797101449275362, + "grad_norm": 0.28166533462952836, + "learning_rate": 2.4449812130971552e-05, + "loss": 0.4584, + "step": 1159 + }, + { + "epoch": 1.681159420289855, + "grad_norm": 0.3019997528285114, + "learning_rate": 2.4422973698336018e-05, + "loss": 0.4257, + "step": 1160 + }, + { + "epoch": 1.6826086956521737, + "grad_norm": 0.3171044179739617, + "learning_rate": 2.4396135265700484e-05, + "loss": 0.5039, + "step": 1161 + }, + { + "epoch": 1.6840579710144927, + "grad_norm": 0.34308975241506323, + "learning_rate": 2.436929683306495e-05, + "loss": 0.5566, + "step": 1162 + }, + { + "epoch": 1.6855072463768117, + "grad_norm": 3.039656774093585, + "learning_rate": 2.4342458400429416e-05, + "loss": 0.5861, + "step": 1163 + }, + { + "epoch": 1.6869565217391305, + "grad_norm": 0.35515967239526663, + "learning_rate": 2.4315619967793882e-05, + "loss": 0.477, + "step": 1164 + }, + { + "epoch": 1.6884057971014492, + "grad_norm": 0.37102830820163163, + "learning_rate": 2.428878153515835e-05, + "loss": 0.4585, + "step": 1165 + }, + { + "epoch": 1.689855072463768, + "grad_norm": 0.2824586787793108, + "learning_rate": 2.4261943102522815e-05, + "loss": 0.4651, + "step": 1166 + }, + { + "epoch": 1.691304347826087, + "grad_norm": 0.26716569849813837, + "learning_rate": 2.423510466988728e-05, + "loss": 0.4307, + "step": 1167 + }, + { + "epoch": 1.692753623188406, + "grad_norm": 0.30556825406142546, + "learning_rate": 2.4208266237251743e-05, + "loss": 0.4892, + "step": 1168 + }, + { + "epoch": 1.6942028985507247, + "grad_norm": 0.3604146757235219, + "learning_rate": 2.418142780461621e-05, + "loss": 0.5213, + "step": 1169 + }, + { + "epoch": 1.6956521739130435, + "grad_norm": 0.32770296185197473, + "learning_rate": 2.4154589371980676e-05, + "loss": 0.544, + "step": 1170 + }, + { + "epoch": 1.6971014492753622, + "grad_norm": 0.36987261868006993, + "learning_rate": 2.4127750939345142e-05, + "loss": 0.4964, + "step": 1171 + }, + { + "epoch": 1.6985507246376812, + "grad_norm": 0.30640544794163954, + "learning_rate": 2.4100912506709608e-05, + "loss": 0.4344, + "step": 1172 + }, + { + "epoch": 1.7, + "grad_norm": 0.32343856057785547, + "learning_rate": 2.4074074074074074e-05, + "loss": 0.4784, + "step": 1173 + }, + { + "epoch": 1.701449275362319, + "grad_norm": 0.3305229567420513, + "learning_rate": 2.4047235641438544e-05, + "loss": 0.4911, + "step": 1174 + }, + { + "epoch": 1.7028985507246377, + "grad_norm": 0.2782883099862748, + "learning_rate": 2.402039720880301e-05, + "loss": 0.4965, + "step": 1175 + }, + { + "epoch": 1.7043478260869565, + "grad_norm": 0.3142883185393042, + "learning_rate": 2.3993558776167472e-05, + "loss": 0.4349, + "step": 1176 + }, + { + "epoch": 1.7057971014492752, + "grad_norm": 0.6268301065499224, + "learning_rate": 2.396672034353194e-05, + "loss": 0.5338, + "step": 1177 + }, + { + "epoch": 1.7072463768115942, + "grad_norm": 0.3213649355828965, + "learning_rate": 2.3939881910896405e-05, + "loss": 0.5142, + "step": 1178 + }, + { + "epoch": 1.7086956521739132, + "grad_norm": 0.3190793133605725, + "learning_rate": 2.391304347826087e-05, + "loss": 0.438, + "step": 1179 + }, + { + "epoch": 1.710144927536232, + "grad_norm": 0.2750310402906081, + "learning_rate": 2.3886205045625337e-05, + "loss": 0.426, + "step": 1180 + }, + { + "epoch": 1.7115942028985507, + "grad_norm": 0.2757694770103066, + "learning_rate": 2.3859366612989803e-05, + "loss": 0.4465, + "step": 1181 + }, + { + "epoch": 1.7130434782608694, + "grad_norm": 0.30493489587722383, + "learning_rate": 2.383252818035427e-05, + "loss": 0.5027, + "step": 1182 + }, + { + "epoch": 1.7144927536231884, + "grad_norm": 0.32344723622363536, + "learning_rate": 2.3805689747718735e-05, + "loss": 0.4986, + "step": 1183 + }, + { + "epoch": 1.7159420289855074, + "grad_norm": 0.26856516264127966, + "learning_rate": 2.37788513150832e-05, + "loss": 0.4792, + "step": 1184 + }, + { + "epoch": 1.7173913043478262, + "grad_norm": 0.3250264038161418, + "learning_rate": 2.3752012882447668e-05, + "loss": 0.5504, + "step": 1185 + }, + { + "epoch": 1.718840579710145, + "grad_norm": 0.34292399150690334, + "learning_rate": 2.372517444981213e-05, + "loss": 0.4606, + "step": 1186 + }, + { + "epoch": 1.7202898550724637, + "grad_norm": 0.2721807363721305, + "learning_rate": 2.3698336017176596e-05, + "loss": 0.4407, + "step": 1187 + }, + { + "epoch": 1.7217391304347827, + "grad_norm": 0.3653483444914663, + "learning_rate": 2.3671497584541063e-05, + "loss": 0.5293, + "step": 1188 + }, + { + "epoch": 1.7231884057971014, + "grad_norm": 0.3310831455581937, + "learning_rate": 2.364465915190553e-05, + "loss": 0.5508, + "step": 1189 + }, + { + "epoch": 1.7246376811594204, + "grad_norm": 0.365199402173693, + "learning_rate": 2.3617820719269995e-05, + "loss": 0.5591, + "step": 1190 + }, + { + "epoch": 1.7260869565217392, + "grad_norm": 0.3178140233613037, + "learning_rate": 2.359098228663446e-05, + "loss": 0.4831, + "step": 1191 + }, + { + "epoch": 1.727536231884058, + "grad_norm": 0.3563520913233769, + "learning_rate": 2.3564143853998927e-05, + "loss": 0.4858, + "step": 1192 + }, + { + "epoch": 1.7289855072463767, + "grad_norm": 0.2976845205796779, + "learning_rate": 2.3537305421363393e-05, + "loss": 0.476, + "step": 1193 + }, + { + "epoch": 1.7304347826086957, + "grad_norm": 0.34241650845225546, + "learning_rate": 2.351046698872786e-05, + "loss": 0.4882, + "step": 1194 + }, + { + "epoch": 1.7318840579710146, + "grad_norm": 0.3005603672201895, + "learning_rate": 2.3483628556092325e-05, + "loss": 0.4768, + "step": 1195 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 0.3854426600408823, + "learning_rate": 2.345679012345679e-05, + "loss": 0.5329, + "step": 1196 + }, + { + "epoch": 1.7347826086956522, + "grad_norm": 0.3723887263936342, + "learning_rate": 2.3429951690821258e-05, + "loss": 0.482, + "step": 1197 + }, + { + "epoch": 1.736231884057971, + "grad_norm": 0.3529567447157191, + "learning_rate": 2.3403113258185724e-05, + "loss": 0.5135, + "step": 1198 + }, + { + "epoch": 1.73768115942029, + "grad_norm": 0.40448836607297356, + "learning_rate": 2.337627482555019e-05, + "loss": 0.4882, + "step": 1199 + }, + { + "epoch": 1.7391304347826086, + "grad_norm": 0.32133621387232536, + "learning_rate": 2.3349436392914656e-05, + "loss": 0.473, + "step": 1200 + }, + { + "epoch": 1.7405797101449276, + "grad_norm": 1.0852661570450723, + "learning_rate": 2.3322597960279122e-05, + "loss": 0.4425, + "step": 1201 + }, + { + "epoch": 1.7420289855072464, + "grad_norm": 0.3711355431298706, + "learning_rate": 2.329575952764359e-05, + "loss": 0.4347, + "step": 1202 + }, + { + "epoch": 1.7434782608695651, + "grad_norm": 0.3379298215669769, + "learning_rate": 2.3268921095008055e-05, + "loss": 0.4532, + "step": 1203 + }, + { + "epoch": 1.744927536231884, + "grad_norm": 0.3569245007981008, + "learning_rate": 2.324208266237252e-05, + "loss": 0.5119, + "step": 1204 + }, + { + "epoch": 1.7463768115942029, + "grad_norm": 0.2830441058655675, + "learning_rate": 2.3215244229736983e-05, + "loss": 0.4829, + "step": 1205 + }, + { + "epoch": 1.7478260869565219, + "grad_norm": 0.29739221481915823, + "learning_rate": 2.318840579710145e-05, + "loss": 0.4562, + "step": 1206 + }, + { + "epoch": 1.7492753623188406, + "grad_norm": 0.325030040824517, + "learning_rate": 2.3161567364465916e-05, + "loss": 0.4666, + "step": 1207 + }, + { + "epoch": 1.7507246376811594, + "grad_norm": 0.305603115485528, + "learning_rate": 2.3134728931830382e-05, + "loss": 0.5258, + "step": 1208 + }, + { + "epoch": 1.7521739130434781, + "grad_norm": 0.35825304542957837, + "learning_rate": 2.3107890499194848e-05, + "loss": 0.5211, + "step": 1209 + }, + { + "epoch": 1.7536231884057971, + "grad_norm": 0.29913595053269965, + "learning_rate": 2.3081052066559314e-05, + "loss": 0.5621, + "step": 1210 + }, + { + "epoch": 1.755072463768116, + "grad_norm": 0.3233729807843389, + "learning_rate": 2.305421363392378e-05, + "loss": 0.4016, + "step": 1211 + }, + { + "epoch": 1.7565217391304349, + "grad_norm": 0.41568565650293215, + "learning_rate": 2.3027375201288246e-05, + "loss": 0.5101, + "step": 1212 + }, + { + "epoch": 1.7579710144927536, + "grad_norm": 0.2868122902243216, + "learning_rate": 2.3000536768652712e-05, + "loss": 0.4703, + "step": 1213 + }, + { + "epoch": 1.7594202898550724, + "grad_norm": 0.3058263429464893, + "learning_rate": 2.297369833601718e-05, + "loss": 0.4488, + "step": 1214 + }, + { + "epoch": 1.7608695652173914, + "grad_norm": 0.3019552092758227, + "learning_rate": 2.294685990338164e-05, + "loss": 0.4847, + "step": 1215 + }, + { + "epoch": 1.76231884057971, + "grad_norm": 0.26413528924068774, + "learning_rate": 2.2920021470746107e-05, + "loss": 0.4644, + "step": 1216 + }, + { + "epoch": 1.763768115942029, + "grad_norm": 0.36682267539610747, + "learning_rate": 2.2893183038110574e-05, + "loss": 0.4519, + "step": 1217 + }, + { + "epoch": 1.7652173913043478, + "grad_norm": 0.28753522716414265, + "learning_rate": 2.286634460547504e-05, + "loss": 0.473, + "step": 1218 + }, + { + "epoch": 1.7666666666666666, + "grad_norm": 0.3026541886137627, + "learning_rate": 2.2839506172839506e-05, + "loss": 0.5092, + "step": 1219 + }, + { + "epoch": 1.7681159420289854, + "grad_norm": 0.30391904667726166, + "learning_rate": 2.2812667740203972e-05, + "loss": 0.5232, + "step": 1220 + }, + { + "epoch": 1.7695652173913043, + "grad_norm": 0.2852925782044705, + "learning_rate": 2.278582930756844e-05, + "loss": 0.4252, + "step": 1221 + }, + { + "epoch": 1.7710144927536233, + "grad_norm": 0.3000533868403904, + "learning_rate": 2.2758990874932908e-05, + "loss": 0.5337, + "step": 1222 + }, + { + "epoch": 1.772463768115942, + "grad_norm": 0.30059390751594706, + "learning_rate": 2.273215244229737e-05, + "loss": 0.4557, + "step": 1223 + }, + { + "epoch": 1.7739130434782608, + "grad_norm": 0.26736425462212277, + "learning_rate": 2.2705314009661836e-05, + "loss": 0.5, + "step": 1224 + }, + { + "epoch": 1.7753623188405796, + "grad_norm": 0.3070528336704938, + "learning_rate": 2.2678475577026303e-05, + "loss": 0.5455, + "step": 1225 + }, + { + "epoch": 1.7768115942028986, + "grad_norm": 0.3420183725834646, + "learning_rate": 2.265163714439077e-05, + "loss": 0.4505, + "step": 1226 + }, + { + "epoch": 1.7782608695652173, + "grad_norm": 0.32331303715104653, + "learning_rate": 2.2624798711755235e-05, + "loss": 0.4736, + "step": 1227 + }, + { + "epoch": 1.7797101449275363, + "grad_norm": 0.25845807346373795, + "learning_rate": 2.25979602791197e-05, + "loss": 0.436, + "step": 1228 + }, + { + "epoch": 1.781159420289855, + "grad_norm": 0.3116846376099379, + "learning_rate": 2.2571121846484167e-05, + "loss": 0.4876, + "step": 1229 + }, + { + "epoch": 1.7826086956521738, + "grad_norm": 0.2824150019214866, + "learning_rate": 2.2544283413848633e-05, + "loss": 0.4713, + "step": 1230 + }, + { + "epoch": 1.7840579710144926, + "grad_norm": 0.31803201506817763, + "learning_rate": 2.25174449812131e-05, + "loss": 0.4723, + "step": 1231 + }, + { + "epoch": 1.7855072463768116, + "grad_norm": 0.3104614414725165, + "learning_rate": 2.2490606548577565e-05, + "loss": 0.4612, + "step": 1232 + }, + { + "epoch": 1.7869565217391306, + "grad_norm": 0.3292079708354603, + "learning_rate": 2.246376811594203e-05, + "loss": 0.4648, + "step": 1233 + }, + { + "epoch": 1.7884057971014493, + "grad_norm": 0.3072965120142354, + "learning_rate": 2.2436929683306494e-05, + "loss": 0.5005, + "step": 1234 + }, + { + "epoch": 1.789855072463768, + "grad_norm": 0.32424907723585317, + "learning_rate": 2.241009125067096e-05, + "loss": 0.4711, + "step": 1235 + }, + { + "epoch": 1.7913043478260868, + "grad_norm": 0.31647288097845033, + "learning_rate": 2.2383252818035427e-05, + "loss": 0.4186, + "step": 1236 + }, + { + "epoch": 1.7927536231884058, + "grad_norm": 0.3370391739929275, + "learning_rate": 2.2356414385399893e-05, + "loss": 0.4478, + "step": 1237 + }, + { + "epoch": 1.7942028985507248, + "grad_norm": 0.28465711257853077, + "learning_rate": 2.232957595276436e-05, + "loss": 0.4547, + "step": 1238 + }, + { + "epoch": 1.7956521739130435, + "grad_norm": 0.3500145854808722, + "learning_rate": 2.2302737520128825e-05, + "loss": 0.4533, + "step": 1239 + }, + { + "epoch": 1.7971014492753623, + "grad_norm": 0.35577089369408404, + "learning_rate": 2.227589908749329e-05, + "loss": 0.468, + "step": 1240 + }, + { + "epoch": 1.798550724637681, + "grad_norm": 0.27215161526036075, + "learning_rate": 2.2249060654857757e-05, + "loss": 0.4879, + "step": 1241 + }, + { + "epoch": 1.8, + "grad_norm": 0.31464918485266263, + "learning_rate": 2.2222222222222223e-05, + "loss": 0.5211, + "step": 1242 + }, + { + "epoch": 1.8014492753623188, + "grad_norm": 0.28959716892415166, + "learning_rate": 2.219538378958669e-05, + "loss": 0.4034, + "step": 1243 + }, + { + "epoch": 1.8028985507246378, + "grad_norm": 0.3133095055561112, + "learning_rate": 2.2168545356951156e-05, + "loss": 0.4886, + "step": 1244 + }, + { + "epoch": 1.8043478260869565, + "grad_norm": 0.3035834128041399, + "learning_rate": 2.214170692431562e-05, + "loss": 0.5295, + "step": 1245 + }, + { + "epoch": 1.8057971014492753, + "grad_norm": 0.3041539027975744, + "learning_rate": 2.2114868491680088e-05, + "loss": 0.5061, + "step": 1246 + }, + { + "epoch": 1.807246376811594, + "grad_norm": 0.2966047899734678, + "learning_rate": 2.2088030059044554e-05, + "loss": 0.4658, + "step": 1247 + }, + { + "epoch": 1.808695652173913, + "grad_norm": 0.29187310468867916, + "learning_rate": 2.206119162640902e-05, + "loss": 0.4448, + "step": 1248 + }, + { + "epoch": 1.810144927536232, + "grad_norm": 0.26644541230528856, + "learning_rate": 2.2034353193773486e-05, + "loss": 0.4782, + "step": 1249 + }, + { + "epoch": 1.8115942028985508, + "grad_norm": 0.29148279845891767, + "learning_rate": 2.2007514761137952e-05, + "loss": 0.4607, + "step": 1250 + }, + { + "epoch": 1.8130434782608695, + "grad_norm": 0.27695780648996593, + "learning_rate": 2.198067632850242e-05, + "loss": 0.431, + "step": 1251 + }, + { + "epoch": 1.8144927536231883, + "grad_norm": 0.27383782215227476, + "learning_rate": 2.195383789586688e-05, + "loss": 0.4782, + "step": 1252 + }, + { + "epoch": 1.8159420289855073, + "grad_norm": 0.256922040034402, + "learning_rate": 2.1926999463231347e-05, + "loss": 0.4699, + "step": 1253 + }, + { + "epoch": 1.8173913043478263, + "grad_norm": 0.28676204052163945, + "learning_rate": 2.1900161030595813e-05, + "loss": 0.4642, + "step": 1254 + }, + { + "epoch": 1.818840579710145, + "grad_norm": 0.27090218727496, + "learning_rate": 2.187332259796028e-05, + "loss": 0.5015, + "step": 1255 + }, + { + "epoch": 1.8202898550724638, + "grad_norm": 0.29832305479155297, + "learning_rate": 2.1846484165324746e-05, + "loss": 0.5476, + "step": 1256 + }, + { + "epoch": 1.8217391304347825, + "grad_norm": 0.263630806640635, + "learning_rate": 2.1819645732689212e-05, + "loss": 0.4291, + "step": 1257 + }, + { + "epoch": 1.8231884057971013, + "grad_norm": 0.2851339779626431, + "learning_rate": 2.1792807300053678e-05, + "loss": 0.5007, + "step": 1258 + }, + { + "epoch": 1.8246376811594203, + "grad_norm": 0.28813505570565895, + "learning_rate": 2.1765968867418144e-05, + "loss": 0.5072, + "step": 1259 + }, + { + "epoch": 1.8260869565217392, + "grad_norm": 0.2855298396602445, + "learning_rate": 2.173913043478261e-05, + "loss": 0.5819, + "step": 1260 + }, + { + "epoch": 1.827536231884058, + "grad_norm": 0.29281428576959717, + "learning_rate": 2.1712292002147076e-05, + "loss": 0.5055, + "step": 1261 + }, + { + "epoch": 1.8289855072463768, + "grad_norm": 0.3124420934472098, + "learning_rate": 2.168545356951154e-05, + "loss": 0.4743, + "step": 1262 + }, + { + "epoch": 1.8304347826086955, + "grad_norm": 0.28710424131737194, + "learning_rate": 2.1658615136876005e-05, + "loss": 0.4775, + "step": 1263 + }, + { + "epoch": 1.8318840579710145, + "grad_norm": 0.2827625651958568, + "learning_rate": 2.163177670424047e-05, + "loss": 0.4919, + "step": 1264 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.2853579763416708, + "learning_rate": 2.1604938271604937e-05, + "loss": 0.5057, + "step": 1265 + }, + { + "epoch": 1.8347826086956522, + "grad_norm": 0.3012843262516302, + "learning_rate": 2.1578099838969404e-05, + "loss": 0.5333, + "step": 1266 + }, + { + "epoch": 1.836231884057971, + "grad_norm": 0.3065514320478756, + "learning_rate": 2.1551261406333873e-05, + "loss": 0.5173, + "step": 1267 + }, + { + "epoch": 1.8376811594202898, + "grad_norm": 0.26727540883895357, + "learning_rate": 2.152442297369834e-05, + "loss": 0.4248, + "step": 1268 + }, + { + "epoch": 1.8391304347826087, + "grad_norm": 0.29562518431891094, + "learning_rate": 2.1497584541062805e-05, + "loss": 0.469, + "step": 1269 + }, + { + "epoch": 1.8405797101449275, + "grad_norm": 0.298575851524304, + "learning_rate": 2.147074610842727e-05, + "loss": 0.4467, + "step": 1270 + }, + { + "epoch": 1.8420289855072465, + "grad_norm": 1.0364992953210546, + "learning_rate": 2.1443907675791734e-05, + "loss": 0.5137, + "step": 1271 + }, + { + "epoch": 1.8434782608695652, + "grad_norm": 0.31898501485381864, + "learning_rate": 2.14170692431562e-05, + "loss": 0.4855, + "step": 1272 + }, + { + "epoch": 1.844927536231884, + "grad_norm": 0.3045897927125765, + "learning_rate": 2.1390230810520666e-05, + "loss": 0.4238, + "step": 1273 + }, + { + "epoch": 1.8463768115942027, + "grad_norm": 0.3166686935868539, + "learning_rate": 2.1363392377885133e-05, + "loss": 0.5035, + "step": 1274 + }, + { + "epoch": 1.8478260869565217, + "grad_norm": 0.2688013508113102, + "learning_rate": 2.13365539452496e-05, + "loss": 0.4263, + "step": 1275 + }, + { + "epoch": 1.8492753623188407, + "grad_norm": 0.304899379478993, + "learning_rate": 2.1309715512614065e-05, + "loss": 0.4904, + "step": 1276 + }, + { + "epoch": 1.8507246376811595, + "grad_norm": 0.35844167840160257, + "learning_rate": 2.128287707997853e-05, + "loss": 0.4878, + "step": 1277 + }, + { + "epoch": 1.8521739130434782, + "grad_norm": 0.27706401361248995, + "learning_rate": 2.1256038647342997e-05, + "loss": 0.4274, + "step": 1278 + }, + { + "epoch": 1.853623188405797, + "grad_norm": 0.6396040828561423, + "learning_rate": 2.1229200214707463e-05, + "loss": 0.4881, + "step": 1279 + }, + { + "epoch": 1.855072463768116, + "grad_norm": 0.3127894695259788, + "learning_rate": 2.120236178207193e-05, + "loss": 0.4723, + "step": 1280 + }, + { + "epoch": 1.856521739130435, + "grad_norm": 0.29512152030072514, + "learning_rate": 2.1175523349436392e-05, + "loss": 0.4736, + "step": 1281 + }, + { + "epoch": 1.8579710144927537, + "grad_norm": 0.32603319773420636, + "learning_rate": 2.1148684916800858e-05, + "loss": 0.5158, + "step": 1282 + }, + { + "epoch": 1.8594202898550725, + "grad_norm": 0.31299453677117295, + "learning_rate": 2.1121846484165324e-05, + "loss": 0.5014, + "step": 1283 + }, + { + "epoch": 1.8608695652173912, + "grad_norm": 0.3041031362582152, + "learning_rate": 2.109500805152979e-05, + "loss": 0.4746, + "step": 1284 + }, + { + "epoch": 1.8623188405797102, + "grad_norm": 0.2754178261200115, + "learning_rate": 2.1068169618894257e-05, + "loss": 0.4237, + "step": 1285 + }, + { + "epoch": 1.863768115942029, + "grad_norm": 0.292174368122742, + "learning_rate": 2.1041331186258723e-05, + "loss": 0.5038, + "step": 1286 + }, + { + "epoch": 1.865217391304348, + "grad_norm": 0.3138925305688222, + "learning_rate": 2.101449275362319e-05, + "loss": 0.531, + "step": 1287 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 0.2920449919714151, + "learning_rate": 2.0987654320987655e-05, + "loss": 0.4655, + "step": 1288 + }, + { + "epoch": 1.8681159420289855, + "grad_norm": 0.2774442013795414, + "learning_rate": 2.096081588835212e-05, + "loss": 0.4613, + "step": 1289 + }, + { + "epoch": 1.8695652173913042, + "grad_norm": 0.29089291776029375, + "learning_rate": 2.0933977455716587e-05, + "loss": 0.4932, + "step": 1290 + }, + { + "epoch": 1.8710144927536232, + "grad_norm": 0.3568060057258649, + "learning_rate": 2.0907139023081053e-05, + "loss": 0.4316, + "step": 1291 + }, + { + "epoch": 1.8724637681159422, + "grad_norm": 0.347097941623319, + "learning_rate": 2.088030059044552e-05, + "loss": 0.4974, + "step": 1292 + }, + { + "epoch": 1.873913043478261, + "grad_norm": 0.3433506640776097, + "learning_rate": 2.0853462157809986e-05, + "loss": 0.5941, + "step": 1293 + }, + { + "epoch": 1.8753623188405797, + "grad_norm": 0.36194743289211745, + "learning_rate": 2.0826623725174452e-05, + "loss": 0.5001, + "step": 1294 + }, + { + "epoch": 1.8768115942028984, + "grad_norm": 0.3960985897096472, + "learning_rate": 2.0799785292538918e-05, + "loss": 0.4855, + "step": 1295 + }, + { + "epoch": 1.8782608695652174, + "grad_norm": 0.3526409226448084, + "learning_rate": 2.0772946859903384e-05, + "loss": 0.5259, + "step": 1296 + }, + { + "epoch": 1.8797101449275362, + "grad_norm": 0.32968502854409054, + "learning_rate": 2.074610842726785e-05, + "loss": 0.4089, + "step": 1297 + }, + { + "epoch": 1.8811594202898552, + "grad_norm": 0.31356361732084437, + "learning_rate": 2.0719269994632316e-05, + "loss": 0.5163, + "step": 1298 + }, + { + "epoch": 1.882608695652174, + "grad_norm": 0.3427992740512394, + "learning_rate": 2.0692431561996782e-05, + "loss": 0.5196, + "step": 1299 + }, + { + "epoch": 1.8840579710144927, + "grad_norm": 0.31325906176651624, + "learning_rate": 2.0665593129361245e-05, + "loss": 0.4887, + "step": 1300 + }, + { + "epoch": 1.8855072463768114, + "grad_norm": 0.2962587958372876, + "learning_rate": 2.063875469672571e-05, + "loss": 0.4972, + "step": 1301 + }, + { + "epoch": 1.8869565217391304, + "grad_norm": 0.32768267756192265, + "learning_rate": 2.0611916264090177e-05, + "loss": 0.4018, + "step": 1302 + }, + { + "epoch": 1.8884057971014494, + "grad_norm": 0.33194347131378105, + "learning_rate": 2.0585077831454643e-05, + "loss": 0.499, + "step": 1303 + }, + { + "epoch": 1.8898550724637682, + "grad_norm": 0.2932119296945969, + "learning_rate": 2.055823939881911e-05, + "loss": 0.4639, + "step": 1304 + }, + { + "epoch": 1.891304347826087, + "grad_norm": 0.3764989655928959, + "learning_rate": 2.0531400966183576e-05, + "loss": 0.516, + "step": 1305 + }, + { + "epoch": 1.8927536231884057, + "grad_norm": 0.3531420090951196, + "learning_rate": 2.0504562533548042e-05, + "loss": 0.5185, + "step": 1306 + }, + { + "epoch": 1.8942028985507247, + "grad_norm": 0.3010580871678021, + "learning_rate": 2.0477724100912508e-05, + "loss": 0.4991, + "step": 1307 + }, + { + "epoch": 1.8956521739130436, + "grad_norm": 0.29344052606590904, + "learning_rate": 2.0450885668276974e-05, + "loss": 0.4724, + "step": 1308 + }, + { + "epoch": 1.8971014492753624, + "grad_norm": 0.34527331703901987, + "learning_rate": 2.042404723564144e-05, + "loss": 0.5225, + "step": 1309 + }, + { + "epoch": 1.8985507246376812, + "grad_norm": 0.3118341745147994, + "learning_rate": 2.0397208803005903e-05, + "loss": 0.5096, + "step": 1310 + }, + { + "epoch": 1.9, + "grad_norm": 4.4212037823678205, + "learning_rate": 2.037037037037037e-05, + "loss": 0.4954, + "step": 1311 + }, + { + "epoch": 1.901449275362319, + "grad_norm": 1.186945577141863, + "learning_rate": 2.0343531937734835e-05, + "loss": 0.525, + "step": 1312 + }, + { + "epoch": 1.9028985507246376, + "grad_norm": 0.35995105362938973, + "learning_rate": 2.03166935050993e-05, + "loss": 0.4655, + "step": 1313 + }, + { + "epoch": 1.9043478260869566, + "grad_norm": 0.2803749141962748, + "learning_rate": 2.028985507246377e-05, + "loss": 0.4443, + "step": 1314 + }, + { + "epoch": 1.9057971014492754, + "grad_norm": 0.2810489649699971, + "learning_rate": 2.0263016639828237e-05, + "loss": 0.4722, + "step": 1315 + }, + { + "epoch": 1.9072463768115941, + "grad_norm": 0.30476564064352385, + "learning_rate": 2.0236178207192703e-05, + "loss": 0.4782, + "step": 1316 + }, + { + "epoch": 1.908695652173913, + "grad_norm": 0.26799442712957444, + "learning_rate": 2.020933977455717e-05, + "loss": 0.3711, + "step": 1317 + }, + { + "epoch": 1.9101449275362319, + "grad_norm": 0.30200296290988266, + "learning_rate": 2.0182501341921632e-05, + "loss": 0.4994, + "step": 1318 + }, + { + "epoch": 1.9115942028985509, + "grad_norm": 0.31939099104943003, + "learning_rate": 2.0155662909286098e-05, + "loss": 0.4946, + "step": 1319 + }, + { + "epoch": 1.9130434782608696, + "grad_norm": 0.3068851680782278, + "learning_rate": 2.0128824476650564e-05, + "loss": 0.4616, + "step": 1320 + }, + { + "epoch": 1.9144927536231884, + "grad_norm": 0.3023208269337483, + "learning_rate": 2.010198604401503e-05, + "loss": 0.4763, + "step": 1321 + }, + { + "epoch": 1.9159420289855071, + "grad_norm": 0.3024815227588282, + "learning_rate": 2.0075147611379496e-05, + "loss": 0.4892, + "step": 1322 + }, + { + "epoch": 1.9173913043478261, + "grad_norm": 0.32337045749778953, + "learning_rate": 2.0048309178743963e-05, + "loss": 0.4847, + "step": 1323 + }, + { + "epoch": 1.9188405797101449, + "grad_norm": 0.2775258100528873, + "learning_rate": 2.002147074610843e-05, + "loss": 0.447, + "step": 1324 + }, + { + "epoch": 1.9202898550724639, + "grad_norm": 0.2894689084026751, + "learning_rate": 1.9994632313472895e-05, + "loss": 0.4561, + "step": 1325 + }, + { + "epoch": 1.9217391304347826, + "grad_norm": 0.3136492116234664, + "learning_rate": 1.996779388083736e-05, + "loss": 0.513, + "step": 1326 + }, + { + "epoch": 1.9231884057971014, + "grad_norm": 0.25726562553145293, + "learning_rate": 1.9940955448201827e-05, + "loss": 0.4732, + "step": 1327 + }, + { + "epoch": 1.9246376811594201, + "grad_norm": 0.29313931534891746, + "learning_rate": 1.991411701556629e-05, + "loss": 0.4824, + "step": 1328 + }, + { + "epoch": 1.9260869565217391, + "grad_norm": 0.3245324762668572, + "learning_rate": 1.9887278582930756e-05, + "loss": 0.4719, + "step": 1329 + }, + { + "epoch": 1.927536231884058, + "grad_norm": 0.2825614921402087, + "learning_rate": 1.9860440150295222e-05, + "loss": 0.4607, + "step": 1330 + }, + { + "epoch": 1.9289855072463769, + "grad_norm": 0.26485872541861677, + "learning_rate": 1.9833601717659688e-05, + "loss": 0.4902, + "step": 1331 + }, + { + "epoch": 1.9304347826086956, + "grad_norm": 0.28708767044003036, + "learning_rate": 1.9806763285024154e-05, + "loss": 0.464, + "step": 1332 + }, + { + "epoch": 1.9318840579710144, + "grad_norm": 0.29675243481316566, + "learning_rate": 1.977992485238862e-05, + "loss": 0.5058, + "step": 1333 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 0.2835769832589822, + "learning_rate": 1.9753086419753087e-05, + "loss": 0.5023, + "step": 1334 + }, + { + "epoch": 1.9347826086956523, + "grad_norm": 0.3843873450170395, + "learning_rate": 1.9726247987117553e-05, + "loss": 0.4539, + "step": 1335 + }, + { + "epoch": 1.936231884057971, + "grad_norm": 0.29259710654659776, + "learning_rate": 1.969940955448202e-05, + "loss": 0.4876, + "step": 1336 + }, + { + "epoch": 1.9376811594202898, + "grad_norm": 0.2901300055132593, + "learning_rate": 1.9672571121846485e-05, + "loss": 0.5418, + "step": 1337 + }, + { + "epoch": 1.9391304347826086, + "grad_norm": 0.2719106341759771, + "learning_rate": 1.964573268921095e-05, + "loss": 0.4376, + "step": 1338 + }, + { + "epoch": 1.9405797101449276, + "grad_norm": 0.2840996716380229, + "learning_rate": 1.9618894256575417e-05, + "loss": 0.5078, + "step": 1339 + }, + { + "epoch": 1.9420289855072463, + "grad_norm": 5.946655975501915, + "learning_rate": 1.9592055823939883e-05, + "loss": 0.6071, + "step": 1340 + }, + { + "epoch": 1.9434782608695653, + "grad_norm": 0.3083209392903689, + "learning_rate": 1.956521739130435e-05, + "loss": 0.476, + "step": 1341 + }, + { + "epoch": 1.944927536231884, + "grad_norm": 0.3189040042298081, + "learning_rate": 1.9538378958668816e-05, + "loss": 0.4886, + "step": 1342 + }, + { + "epoch": 1.9463768115942028, + "grad_norm": 0.2920370744669305, + "learning_rate": 1.9511540526033282e-05, + "loss": 0.4871, + "step": 1343 + }, + { + "epoch": 1.9478260869565216, + "grad_norm": 0.30710542670229024, + "learning_rate": 1.9484702093397748e-05, + "loss": 0.5296, + "step": 1344 + }, + { + "epoch": 1.9492753623188406, + "grad_norm": 0.33290375374662645, + "learning_rate": 1.9457863660762214e-05, + "loss": 0.5039, + "step": 1345 + }, + { + "epoch": 1.9507246376811596, + "grad_norm": 0.3201136657518179, + "learning_rate": 1.943102522812668e-05, + "loss": 0.5562, + "step": 1346 + }, + { + "epoch": 1.9521739130434783, + "grad_norm": 0.2834790046306402, + "learning_rate": 1.9404186795491143e-05, + "loss": 0.5026, + "step": 1347 + }, + { + "epoch": 1.953623188405797, + "grad_norm": 0.3023987772650251, + "learning_rate": 1.937734836285561e-05, + "loss": 0.4607, + "step": 1348 + }, + { + "epoch": 1.9550724637681158, + "grad_norm": 0.61842500520766, + "learning_rate": 1.9350509930220075e-05, + "loss": 0.5083, + "step": 1349 + }, + { + "epoch": 1.9565217391304348, + "grad_norm": 0.28431314772247096, + "learning_rate": 1.932367149758454e-05, + "loss": 0.4381, + "step": 1350 + }, + { + "epoch": 1.9579710144927536, + "grad_norm": 0.28922134943197386, + "learning_rate": 1.9296833064949007e-05, + "loss": 0.5042, + "step": 1351 + }, + { + "epoch": 1.9594202898550726, + "grad_norm": 0.3398952053798585, + "learning_rate": 1.9269994632313474e-05, + "loss": 0.5823, + "step": 1352 + }, + { + "epoch": 1.9608695652173913, + "grad_norm": 0.2933088409635406, + "learning_rate": 1.924315619967794e-05, + "loss": 0.5117, + "step": 1353 + }, + { + "epoch": 1.96231884057971, + "grad_norm": 0.28452221937531197, + "learning_rate": 1.9216317767042406e-05, + "loss": 0.5014, + "step": 1354 + }, + { + "epoch": 1.9637681159420288, + "grad_norm": 0.3431564358613733, + "learning_rate": 1.9189479334406872e-05, + "loss": 0.5004, + "step": 1355 + }, + { + "epoch": 1.9652173913043478, + "grad_norm": 0.29866438230282527, + "learning_rate": 1.9162640901771338e-05, + "loss": 0.4668, + "step": 1356 + }, + { + "epoch": 1.9666666666666668, + "grad_norm": 0.31282041209931233, + "learning_rate": 1.91358024691358e-05, + "loss": 0.478, + "step": 1357 + }, + { + "epoch": 1.9681159420289855, + "grad_norm": 0.27585050965807306, + "learning_rate": 1.9108964036500267e-05, + "loss": 0.458, + "step": 1358 + }, + { + "epoch": 1.9695652173913043, + "grad_norm": 0.29943407731775756, + "learning_rate": 1.9082125603864733e-05, + "loss": 0.4685, + "step": 1359 + }, + { + "epoch": 1.971014492753623, + "grad_norm": 0.36370754002131483, + "learning_rate": 1.9055287171229203e-05, + "loss": 0.4864, + "step": 1360 + }, + { + "epoch": 1.972463768115942, + "grad_norm": 0.2818369497611728, + "learning_rate": 1.902844873859367e-05, + "loss": 0.3647, + "step": 1361 + }, + { + "epoch": 1.973913043478261, + "grad_norm": 0.31444328219509243, + "learning_rate": 1.9001610305958135e-05, + "loss": 0.5593, + "step": 1362 + }, + { + "epoch": 1.9753623188405798, + "grad_norm": 0.29217505932655186, + "learning_rate": 1.89747718733226e-05, + "loss": 0.4606, + "step": 1363 + }, + { + "epoch": 1.9768115942028985, + "grad_norm": 0.2887298578180074, + "learning_rate": 1.8947933440687067e-05, + "loss": 0.4693, + "step": 1364 + }, + { + "epoch": 1.9782608695652173, + "grad_norm": 0.3007350027357869, + "learning_rate": 1.892109500805153e-05, + "loss": 0.4575, + "step": 1365 + }, + { + "epoch": 1.9797101449275363, + "grad_norm": 0.28022918917301265, + "learning_rate": 1.8894256575415996e-05, + "loss": 0.4412, + "step": 1366 + }, + { + "epoch": 1.981159420289855, + "grad_norm": 0.32612237980146064, + "learning_rate": 1.8867418142780462e-05, + "loss": 0.5018, + "step": 1367 + }, + { + "epoch": 1.982608695652174, + "grad_norm": 0.26476557741702605, + "learning_rate": 1.8840579710144928e-05, + "loss": 0.4294, + "step": 1368 + }, + { + "epoch": 1.9840579710144928, + "grad_norm": 0.28174585585869416, + "learning_rate": 1.8813741277509394e-05, + "loss": 0.4914, + "step": 1369 + }, + { + "epoch": 1.9855072463768115, + "grad_norm": 0.30361583383126906, + "learning_rate": 1.878690284487386e-05, + "loss": 0.4352, + "step": 1370 + }, + { + "epoch": 1.9869565217391303, + "grad_norm": 0.2649565863402794, + "learning_rate": 1.8760064412238327e-05, + "loss": 0.4491, + "step": 1371 + }, + { + "epoch": 1.9884057971014493, + "grad_norm": 0.2630797812371338, + "learning_rate": 1.8733225979602793e-05, + "loss": 0.4775, + "step": 1372 + }, + { + "epoch": 1.9898550724637682, + "grad_norm": 0.2858928894983428, + "learning_rate": 1.870638754696726e-05, + "loss": 0.466, + "step": 1373 + }, + { + "epoch": 1.991304347826087, + "grad_norm": 0.3132053561794151, + "learning_rate": 1.8679549114331725e-05, + "loss": 0.5051, + "step": 1374 + }, + { + "epoch": 1.9927536231884058, + "grad_norm": 0.288525312176386, + "learning_rate": 1.865271068169619e-05, + "loss": 0.5381, + "step": 1375 + }, + { + "epoch": 1.9942028985507245, + "grad_norm": 0.27387962716803127, + "learning_rate": 1.8625872249060654e-05, + "loss": 0.4684, + "step": 1376 + }, + { + "epoch": 1.9956521739130435, + "grad_norm": 0.28738312103663016, + "learning_rate": 1.859903381642512e-05, + "loss": 0.4264, + "step": 1377 + }, + { + "epoch": 1.9971014492753625, + "grad_norm": 0.3556560210674581, + "learning_rate": 1.8572195383789586e-05, + "loss": 0.5116, + "step": 1378 + }, + { + "epoch": 1.9985507246376812, + "grad_norm": 0.3239364265662191, + "learning_rate": 1.8545356951154052e-05, + "loss": 0.524, + "step": 1379 + }, + { + "epoch": 2.0, + "grad_norm": 0.2831017590599986, + "learning_rate": 1.8518518518518518e-05, + "loss": 0.4528, + "step": 1380 + }, + { + "epoch": 2.0014492753623188, + "grad_norm": 0.3523786241603801, + "learning_rate": 1.8491680085882984e-05, + "loss": 0.3702, + "step": 1381 + }, + { + "epoch": 2.0028985507246375, + "grad_norm": 0.3030663523734319, + "learning_rate": 1.846484165324745e-05, + "loss": 0.3364, + "step": 1382 + }, + { + "epoch": 2.0043478260869567, + "grad_norm": 0.3140992539468059, + "learning_rate": 1.8438003220611917e-05, + "loss": 0.3604, + "step": 1383 + }, + { + "epoch": 2.0057971014492755, + "grad_norm": 0.33703359223943585, + "learning_rate": 1.8411164787976383e-05, + "loss": 0.3995, + "step": 1384 + }, + { + "epoch": 2.0072463768115942, + "grad_norm": 0.31543966911381965, + "learning_rate": 1.838432635534085e-05, + "loss": 0.398, + "step": 1385 + }, + { + "epoch": 2.008695652173913, + "grad_norm": 0.3162075337562776, + "learning_rate": 1.8357487922705315e-05, + "loss": 0.3711, + "step": 1386 + }, + { + "epoch": 2.0101449275362318, + "grad_norm": 0.3315543853764206, + "learning_rate": 1.833064949006978e-05, + "loss": 0.3733, + "step": 1387 + }, + { + "epoch": 2.0115942028985505, + "grad_norm": 0.3562957815235896, + "learning_rate": 1.8303811057434247e-05, + "loss": 0.3979, + "step": 1388 + }, + { + "epoch": 2.0130434782608697, + "grad_norm": 0.28341374991032275, + "learning_rate": 1.8276972624798713e-05, + "loss": 0.3344, + "step": 1389 + }, + { + "epoch": 2.0144927536231885, + "grad_norm": 0.2802899932244061, + "learning_rate": 1.825013419216318e-05, + "loss": 0.3477, + "step": 1390 + }, + { + "epoch": 2.0159420289855072, + "grad_norm": 0.3424385111997662, + "learning_rate": 1.8223295759527646e-05, + "loss": 0.404, + "step": 1391 + }, + { + "epoch": 2.017391304347826, + "grad_norm": 0.2984055315815846, + "learning_rate": 1.8196457326892112e-05, + "loss": 0.3632, + "step": 1392 + }, + { + "epoch": 2.0188405797101447, + "grad_norm": 0.2983097121531309, + "learning_rate": 1.8169618894256578e-05, + "loss": 0.3756, + "step": 1393 + }, + { + "epoch": 2.020289855072464, + "grad_norm": 0.27573986873525247, + "learning_rate": 1.814278046162104e-05, + "loss": 0.4272, + "step": 1394 + }, + { + "epoch": 2.0217391304347827, + "grad_norm": 0.3037230522518841, + "learning_rate": 1.8115942028985507e-05, + "loss": 0.3596, + "step": 1395 + }, + { + "epoch": 2.0231884057971015, + "grad_norm": 0.26797546209215395, + "learning_rate": 1.8089103596349973e-05, + "loss": 0.3673, + "step": 1396 + }, + { + "epoch": 2.02463768115942, + "grad_norm": 0.33238669799496534, + "learning_rate": 1.806226516371444e-05, + "loss": 0.3727, + "step": 1397 + }, + { + "epoch": 2.026086956521739, + "grad_norm": 0.31405379508560144, + "learning_rate": 1.8035426731078905e-05, + "loss": 0.4106, + "step": 1398 + }, + { + "epoch": 2.027536231884058, + "grad_norm": 0.24931890950961952, + "learning_rate": 1.800858829844337e-05, + "loss": 0.3413, + "step": 1399 + }, + { + "epoch": 2.028985507246377, + "grad_norm": 0.2757289599197962, + "learning_rate": 1.7981749865807837e-05, + "loss": 0.405, + "step": 1400 + }, + { + "epoch": 2.0304347826086957, + "grad_norm": 0.265334942485615, + "learning_rate": 1.7954911433172304e-05, + "loss": 0.3478, + "step": 1401 + }, + { + "epoch": 2.0318840579710145, + "grad_norm": 0.27009414571202384, + "learning_rate": 1.792807300053677e-05, + "loss": 0.3809, + "step": 1402 + }, + { + "epoch": 2.033333333333333, + "grad_norm": 1.055881132874562, + "learning_rate": 1.7901234567901236e-05, + "loss": 0.3555, + "step": 1403 + }, + { + "epoch": 2.034782608695652, + "grad_norm": 0.3399683985781049, + "learning_rate": 1.78743961352657e-05, + "loss": 0.3936, + "step": 1404 + }, + { + "epoch": 2.036231884057971, + "grad_norm": 0.38978587070503584, + "learning_rate": 1.7847557702630165e-05, + "loss": 0.4131, + "step": 1405 + }, + { + "epoch": 2.03768115942029, + "grad_norm": 0.3323719722356293, + "learning_rate": 1.782071926999463e-05, + "loss": 0.3929, + "step": 1406 + }, + { + "epoch": 2.0391304347826087, + "grad_norm": 0.3496319650720124, + "learning_rate": 1.77938808373591e-05, + "loss": 0.3974, + "step": 1407 + }, + { + "epoch": 2.0405797101449274, + "grad_norm": 0.29021047672453953, + "learning_rate": 1.7767042404723566e-05, + "loss": 0.4147, + "step": 1408 + }, + { + "epoch": 2.042028985507246, + "grad_norm": 0.3746794123510468, + "learning_rate": 1.7740203972088033e-05, + "loss": 0.4319, + "step": 1409 + }, + { + "epoch": 2.0434782608695654, + "grad_norm": 0.32112065217197794, + "learning_rate": 1.77133655394525e-05, + "loss": 0.3461, + "step": 1410 + }, + { + "epoch": 2.044927536231884, + "grad_norm": 0.32295608409620047, + "learning_rate": 1.7686527106816965e-05, + "loss": 0.348, + "step": 1411 + }, + { + "epoch": 2.046376811594203, + "grad_norm": 0.2877795513155748, + "learning_rate": 1.765968867418143e-05, + "loss": 0.3937, + "step": 1412 + }, + { + "epoch": 2.0478260869565217, + "grad_norm": 0.3269016501234976, + "learning_rate": 1.7632850241545894e-05, + "loss": 0.4032, + "step": 1413 + }, + { + "epoch": 2.0492753623188404, + "grad_norm": 0.3065874525483955, + "learning_rate": 1.760601180891036e-05, + "loss": 0.3697, + "step": 1414 + }, + { + "epoch": 2.050724637681159, + "grad_norm": 0.30251777186615336, + "learning_rate": 1.7579173376274826e-05, + "loss": 0.4002, + "step": 1415 + }, + { + "epoch": 2.0521739130434784, + "grad_norm": 0.2923862896542669, + "learning_rate": 1.7552334943639292e-05, + "loss": 0.4159, + "step": 1416 + }, + { + "epoch": 2.053623188405797, + "grad_norm": 0.2699902274060441, + "learning_rate": 1.7525496511003758e-05, + "loss": 0.4173, + "step": 1417 + }, + { + "epoch": 2.055072463768116, + "grad_norm": 0.2904906346194216, + "learning_rate": 1.7498658078368224e-05, + "loss": 0.3799, + "step": 1418 + }, + { + "epoch": 2.0565217391304347, + "grad_norm": 0.2648593871130252, + "learning_rate": 1.747181964573269e-05, + "loss": 0.3918, + "step": 1419 + }, + { + "epoch": 2.0579710144927534, + "grad_norm": 0.27163514907233427, + "learning_rate": 1.7444981213097157e-05, + "loss": 0.37, + "step": 1420 + }, + { + "epoch": 2.0594202898550726, + "grad_norm": 0.25859866053020986, + "learning_rate": 1.7418142780461623e-05, + "loss": 0.4068, + "step": 1421 + }, + { + "epoch": 2.0608695652173914, + "grad_norm": 0.26762775078295287, + "learning_rate": 1.739130434782609e-05, + "loss": 0.4238, + "step": 1422 + }, + { + "epoch": 2.06231884057971, + "grad_norm": 0.2647847770956608, + "learning_rate": 1.736446591519055e-05, + "loss": 0.3958, + "step": 1423 + }, + { + "epoch": 2.063768115942029, + "grad_norm": 0.24667071694024753, + "learning_rate": 1.7337627482555018e-05, + "loss": 0.3443, + "step": 1424 + }, + { + "epoch": 2.0652173913043477, + "grad_norm": 0.27158923135252777, + "learning_rate": 1.7310789049919484e-05, + "loss": 0.3871, + "step": 1425 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 0.2566345926387272, + "learning_rate": 1.728395061728395e-05, + "loss": 0.4334, + "step": 1426 + }, + { + "epoch": 2.0681159420289856, + "grad_norm": 0.24107253116295427, + "learning_rate": 1.7257112184648416e-05, + "loss": 0.4235, + "step": 1427 + }, + { + "epoch": 2.0695652173913044, + "grad_norm": 0.28099353675057254, + "learning_rate": 1.7230273752012882e-05, + "loss": 0.3238, + "step": 1428 + }, + { + "epoch": 2.071014492753623, + "grad_norm": 0.3208249477068484, + "learning_rate": 1.720343531937735e-05, + "loss": 0.4569, + "step": 1429 + }, + { + "epoch": 2.072463768115942, + "grad_norm": 0.2280711164683154, + "learning_rate": 1.7176596886741818e-05, + "loss": 0.3728, + "step": 1430 + }, + { + "epoch": 2.0739130434782607, + "grad_norm": 0.24172226721506201, + "learning_rate": 1.714975845410628e-05, + "loss": 0.4002, + "step": 1431 + }, + { + "epoch": 2.07536231884058, + "grad_norm": 0.2815390040260254, + "learning_rate": 1.7122920021470747e-05, + "loss": 0.3797, + "step": 1432 + }, + { + "epoch": 2.0768115942028986, + "grad_norm": 0.24636656338577184, + "learning_rate": 1.7096081588835213e-05, + "loss": 0.3533, + "step": 1433 + }, + { + "epoch": 2.0782608695652174, + "grad_norm": 0.2716289213218912, + "learning_rate": 1.706924315619968e-05, + "loss": 0.3981, + "step": 1434 + }, + { + "epoch": 2.079710144927536, + "grad_norm": 0.25829679326857236, + "learning_rate": 1.7042404723564145e-05, + "loss": 0.4223, + "step": 1435 + }, + { + "epoch": 2.081159420289855, + "grad_norm": 0.250177425572541, + "learning_rate": 1.701556629092861e-05, + "loss": 0.3447, + "step": 1436 + }, + { + "epoch": 2.082608695652174, + "grad_norm": 0.2502529045762175, + "learning_rate": 1.6988727858293077e-05, + "loss": 0.363, + "step": 1437 + }, + { + "epoch": 2.084057971014493, + "grad_norm": 0.2447443764539991, + "learning_rate": 1.6961889425657543e-05, + "loss": 0.3834, + "step": 1438 + }, + { + "epoch": 2.0855072463768116, + "grad_norm": 0.24219777526195496, + "learning_rate": 1.693505099302201e-05, + "loss": 0.3257, + "step": 1439 + }, + { + "epoch": 2.0869565217391304, + "grad_norm": 0.2920526142203735, + "learning_rate": 1.6908212560386476e-05, + "loss": 0.4298, + "step": 1440 + }, + { + "epoch": 2.088405797101449, + "grad_norm": 0.2516657259945037, + "learning_rate": 1.6881374127750942e-05, + "loss": 0.3853, + "step": 1441 + }, + { + "epoch": 2.0898550724637683, + "grad_norm": 0.24165452159967626, + "learning_rate": 1.6854535695115405e-05, + "loss": 0.3824, + "step": 1442 + }, + { + "epoch": 2.091304347826087, + "grad_norm": 0.2355200461177853, + "learning_rate": 1.682769726247987e-05, + "loss": 0.3333, + "step": 1443 + }, + { + "epoch": 2.092753623188406, + "grad_norm": 0.24122064994896833, + "learning_rate": 1.6800858829844337e-05, + "loss": 0.3108, + "step": 1444 + }, + { + "epoch": 2.0942028985507246, + "grad_norm": 0.27102253831694195, + "learning_rate": 1.6774020397208803e-05, + "loss": 0.3869, + "step": 1445 + }, + { + "epoch": 2.0956521739130434, + "grad_norm": 0.23602739749485968, + "learning_rate": 1.674718196457327e-05, + "loss": 0.3993, + "step": 1446 + }, + { + "epoch": 2.097101449275362, + "grad_norm": 0.2807521284756467, + "learning_rate": 1.6720343531937735e-05, + "loss": 0.4124, + "step": 1447 + }, + { + "epoch": 2.0985507246376813, + "grad_norm": 0.23997496682279448, + "learning_rate": 1.66935050993022e-05, + "loss": 0.3634, + "step": 1448 + }, + { + "epoch": 2.1, + "grad_norm": 0.23149144183564388, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.3466, + "step": 1449 + }, + { + "epoch": 2.101449275362319, + "grad_norm": 0.2451881745573423, + "learning_rate": 1.6639828234031134e-05, + "loss": 0.3046, + "step": 1450 + }, + { + "epoch": 2.1028985507246376, + "grad_norm": 0.24181484293881986, + "learning_rate": 1.66129898013956e-05, + "loss": 0.3777, + "step": 1451 + }, + { + "epoch": 2.1043478260869564, + "grad_norm": 0.26395665403756063, + "learning_rate": 1.6586151368760062e-05, + "loss": 0.3637, + "step": 1452 + }, + { + "epoch": 2.1057971014492756, + "grad_norm": 0.25149885689244855, + "learning_rate": 1.6559312936124532e-05, + "loss": 0.3668, + "step": 1453 + }, + { + "epoch": 2.1072463768115943, + "grad_norm": 0.24249920612235193, + "learning_rate": 1.6532474503488998e-05, + "loss": 0.3903, + "step": 1454 + }, + { + "epoch": 2.108695652173913, + "grad_norm": 0.26713497584114004, + "learning_rate": 1.6505636070853464e-05, + "loss": 0.3562, + "step": 1455 + }, + { + "epoch": 2.110144927536232, + "grad_norm": 0.2593799864084221, + "learning_rate": 1.647879763821793e-05, + "loss": 0.3559, + "step": 1456 + }, + { + "epoch": 2.1115942028985506, + "grad_norm": 0.2592694481669805, + "learning_rate": 1.6451959205582396e-05, + "loss": 0.4043, + "step": 1457 + }, + { + "epoch": 2.1130434782608694, + "grad_norm": 0.24758345963815817, + "learning_rate": 1.6425120772946863e-05, + "loss": 0.3778, + "step": 1458 + }, + { + "epoch": 2.1144927536231886, + "grad_norm": 0.26493518217950146, + "learning_rate": 1.639828234031133e-05, + "loss": 0.3574, + "step": 1459 + }, + { + "epoch": 2.1159420289855073, + "grad_norm": 0.25281018813456596, + "learning_rate": 1.637144390767579e-05, + "loss": 0.3922, + "step": 1460 + }, + { + "epoch": 2.117391304347826, + "grad_norm": 0.24534360749969875, + "learning_rate": 1.6344605475040258e-05, + "loss": 0.3379, + "step": 1461 + }, + { + "epoch": 2.118840579710145, + "grad_norm": 0.24638959833813115, + "learning_rate": 1.6317767042404724e-05, + "loss": 0.3538, + "step": 1462 + }, + { + "epoch": 2.1202898550724636, + "grad_norm": 0.24803895065658563, + "learning_rate": 1.629092860976919e-05, + "loss": 0.4084, + "step": 1463 + }, + { + "epoch": 2.121739130434783, + "grad_norm": 0.25908341056136924, + "learning_rate": 1.6264090177133656e-05, + "loss": 0.4083, + "step": 1464 + }, + { + "epoch": 2.1231884057971016, + "grad_norm": 0.23918352019175904, + "learning_rate": 1.6237251744498122e-05, + "loss": 0.379, + "step": 1465 + }, + { + "epoch": 2.1246376811594203, + "grad_norm": 0.2318776018543893, + "learning_rate": 1.6210413311862588e-05, + "loss": 0.3552, + "step": 1466 + }, + { + "epoch": 2.126086956521739, + "grad_norm": 0.46583670802882937, + "learning_rate": 1.6183574879227054e-05, + "loss": 0.3675, + "step": 1467 + }, + { + "epoch": 2.127536231884058, + "grad_norm": 0.2597134207234505, + "learning_rate": 1.615673644659152e-05, + "loss": 0.3349, + "step": 1468 + }, + { + "epoch": 2.1289855072463766, + "grad_norm": 0.24390171120238152, + "learning_rate": 1.6129898013955987e-05, + "loss": 0.3716, + "step": 1469 + }, + { + "epoch": 2.130434782608696, + "grad_norm": 0.24362666570186983, + "learning_rate": 1.610305958132045e-05, + "loss": 0.3634, + "step": 1470 + }, + { + "epoch": 2.1318840579710145, + "grad_norm": 0.2723467761379894, + "learning_rate": 1.6076221148684915e-05, + "loss": 0.3985, + "step": 1471 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 0.2431944553336726, + "learning_rate": 1.604938271604938e-05, + "loss": 0.3581, + "step": 1472 + }, + { + "epoch": 2.134782608695652, + "grad_norm": 0.26094641628999893, + "learning_rate": 1.6022544283413848e-05, + "loss": 0.372, + "step": 1473 + }, + { + "epoch": 2.136231884057971, + "grad_norm": 0.24313430833284463, + "learning_rate": 1.5995705850778314e-05, + "loss": 0.3398, + "step": 1474 + }, + { + "epoch": 2.13768115942029, + "grad_norm": 0.2560446345933897, + "learning_rate": 1.596886741814278e-05, + "loss": 0.3542, + "step": 1475 + }, + { + "epoch": 2.139130434782609, + "grad_norm": 0.24771966399779138, + "learning_rate": 1.5942028985507246e-05, + "loss": 0.373, + "step": 1476 + }, + { + "epoch": 2.1405797101449275, + "grad_norm": 0.6791987543987067, + "learning_rate": 1.5915190552871716e-05, + "loss": 0.3547, + "step": 1477 + }, + { + "epoch": 2.1420289855072463, + "grad_norm": 0.2649813056093659, + "learning_rate": 1.5888352120236182e-05, + "loss": 0.3889, + "step": 1478 + }, + { + "epoch": 2.143478260869565, + "grad_norm": 0.25422139002817895, + "learning_rate": 1.5861513687600644e-05, + "loss": 0.3968, + "step": 1479 + }, + { + "epoch": 2.1449275362318843, + "grad_norm": 0.2576624449637141, + "learning_rate": 1.583467525496511e-05, + "loss": 0.3593, + "step": 1480 + }, + { + "epoch": 2.146376811594203, + "grad_norm": 0.2586375234022925, + "learning_rate": 1.5807836822329577e-05, + "loss": 0.3671, + "step": 1481 + }, + { + "epoch": 2.1478260869565218, + "grad_norm": 0.2558240672741214, + "learning_rate": 1.5780998389694043e-05, + "loss": 0.4098, + "step": 1482 + }, + { + "epoch": 2.1492753623188405, + "grad_norm": 0.268688991057188, + "learning_rate": 1.575415995705851e-05, + "loss": 0.3933, + "step": 1483 + }, + { + "epoch": 2.1507246376811593, + "grad_norm": 0.2532381024335753, + "learning_rate": 1.5727321524422975e-05, + "loss": 0.3494, + "step": 1484 + }, + { + "epoch": 2.1521739130434785, + "grad_norm": 0.26098343354326276, + "learning_rate": 1.570048309178744e-05, + "loss": 0.3546, + "step": 1485 + }, + { + "epoch": 2.1536231884057973, + "grad_norm": 0.2891962630112305, + "learning_rate": 1.5673644659151907e-05, + "loss": 0.4239, + "step": 1486 + }, + { + "epoch": 2.155072463768116, + "grad_norm": 0.26439594268339023, + "learning_rate": 1.5646806226516374e-05, + "loss": 0.4137, + "step": 1487 + }, + { + "epoch": 2.1565217391304348, + "grad_norm": 0.2661719925783797, + "learning_rate": 1.561996779388084e-05, + "loss": 0.3996, + "step": 1488 + }, + { + "epoch": 2.1579710144927535, + "grad_norm": 0.2389028862134638, + "learning_rate": 1.5593129361245302e-05, + "loss": 0.3406, + "step": 1489 + }, + { + "epoch": 2.1594202898550723, + "grad_norm": 0.24546221102827662, + "learning_rate": 1.556629092860977e-05, + "loss": 0.3792, + "step": 1490 + }, + { + "epoch": 2.1608695652173915, + "grad_norm": 0.276109060092543, + "learning_rate": 1.5539452495974235e-05, + "loss": 0.4163, + "step": 1491 + }, + { + "epoch": 2.1623188405797102, + "grad_norm": 0.467588348904684, + "learning_rate": 1.55126140633387e-05, + "loss": 0.361, + "step": 1492 + }, + { + "epoch": 2.163768115942029, + "grad_norm": 0.23622254966412615, + "learning_rate": 1.5485775630703167e-05, + "loss": 0.3845, + "step": 1493 + }, + { + "epoch": 2.1652173913043478, + "grad_norm": 0.24695504994769543, + "learning_rate": 1.5458937198067633e-05, + "loss": 0.3272, + "step": 1494 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.25126118120410157, + "learning_rate": 1.54320987654321e-05, + "loss": 0.3526, + "step": 1495 + }, + { + "epoch": 2.1681159420289857, + "grad_norm": 0.26226328500893514, + "learning_rate": 1.5405260332796565e-05, + "loss": 0.4089, + "step": 1496 + }, + { + "epoch": 2.1695652173913045, + "grad_norm": 0.23389975075817726, + "learning_rate": 1.537842190016103e-05, + "loss": 0.3409, + "step": 1497 + }, + { + "epoch": 2.1710144927536232, + "grad_norm": 0.2615319879343272, + "learning_rate": 1.5351583467525498e-05, + "loss": 0.3646, + "step": 1498 + }, + { + "epoch": 2.172463768115942, + "grad_norm": 0.2708157191632523, + "learning_rate": 1.532474503488996e-05, + "loss": 0.3632, + "step": 1499 + }, + { + "epoch": 2.1739130434782608, + "grad_norm": 0.23612916073428386, + "learning_rate": 1.529790660225443e-05, + "loss": 0.3333, + "step": 1500 + }, + { + "epoch": 2.1753623188405795, + "grad_norm": 0.26253628983361604, + "learning_rate": 1.5271068169618896e-05, + "loss": 0.3783, + "step": 1501 + }, + { + "epoch": 2.1768115942028987, + "grad_norm": 0.2799278780036722, + "learning_rate": 1.5244229736983362e-05, + "loss": 0.39, + "step": 1502 + }, + { + "epoch": 2.1782608695652175, + "grad_norm": 2.923354758482103, + "learning_rate": 1.5217391304347828e-05, + "loss": 0.3859, + "step": 1503 + }, + { + "epoch": 2.1797101449275362, + "grad_norm": 0.26300827777912456, + "learning_rate": 1.5190552871712294e-05, + "loss": 0.3882, + "step": 1504 + }, + { + "epoch": 2.181159420289855, + "grad_norm": 0.24980607270012614, + "learning_rate": 1.5163714439076759e-05, + "loss": 0.4039, + "step": 1505 + }, + { + "epoch": 2.1826086956521737, + "grad_norm": 0.2650949294763254, + "learning_rate": 1.5136876006441225e-05, + "loss": 0.4225, + "step": 1506 + }, + { + "epoch": 2.184057971014493, + "grad_norm": 0.23582919001983096, + "learning_rate": 1.5110037573805691e-05, + "loss": 0.3915, + "step": 1507 + }, + { + "epoch": 2.1855072463768117, + "grad_norm": 0.26149592082357265, + "learning_rate": 1.5083199141170157e-05, + "loss": 0.4391, + "step": 1508 + }, + { + "epoch": 2.1869565217391305, + "grad_norm": 0.24956056460952147, + "learning_rate": 1.5056360708534623e-05, + "loss": 0.3973, + "step": 1509 + }, + { + "epoch": 2.1884057971014492, + "grad_norm": 0.2670527077121037, + "learning_rate": 1.5029522275899088e-05, + "loss": 0.3859, + "step": 1510 + }, + { + "epoch": 2.189855072463768, + "grad_norm": 0.2478647439410972, + "learning_rate": 1.5002683843263554e-05, + "loss": 0.3954, + "step": 1511 + }, + { + "epoch": 2.1913043478260867, + "grad_norm": 0.2683179076496102, + "learning_rate": 1.497584541062802e-05, + "loss": 0.415, + "step": 1512 + }, + { + "epoch": 2.192753623188406, + "grad_norm": 0.2594324410575354, + "learning_rate": 1.4949006977992486e-05, + "loss": 0.3551, + "step": 1513 + }, + { + "epoch": 2.1942028985507247, + "grad_norm": 0.2457851168205183, + "learning_rate": 1.4922168545356952e-05, + "loss": 0.4231, + "step": 1514 + }, + { + "epoch": 2.1956521739130435, + "grad_norm": 0.24177430563161123, + "learning_rate": 1.4895330112721417e-05, + "loss": 0.3922, + "step": 1515 + }, + { + "epoch": 2.197101449275362, + "grad_norm": 0.2572364642689413, + "learning_rate": 1.4868491680085883e-05, + "loss": 0.3742, + "step": 1516 + }, + { + "epoch": 2.198550724637681, + "grad_norm": 0.24100963020540228, + "learning_rate": 1.4841653247450349e-05, + "loss": 0.3837, + "step": 1517 + }, + { + "epoch": 2.2, + "grad_norm": 5.119712217144526, + "learning_rate": 1.4814814814814815e-05, + "loss": 0.4164, + "step": 1518 + }, + { + "epoch": 2.201449275362319, + "grad_norm": 0.2516637687665249, + "learning_rate": 1.4787976382179281e-05, + "loss": 0.3679, + "step": 1519 + }, + { + "epoch": 2.2028985507246377, + "grad_norm": 0.26976714186644757, + "learning_rate": 1.4761137949543746e-05, + "loss": 0.3729, + "step": 1520 + }, + { + "epoch": 2.2043478260869565, + "grad_norm": 0.26502840804982475, + "learning_rate": 1.4734299516908212e-05, + "loss": 0.444, + "step": 1521 + }, + { + "epoch": 2.205797101449275, + "grad_norm": 0.24414225976585754, + "learning_rate": 1.4707461084272678e-05, + "loss": 0.3969, + "step": 1522 + }, + { + "epoch": 2.207246376811594, + "grad_norm": 0.2337178526893521, + "learning_rate": 1.4680622651637147e-05, + "loss": 0.3732, + "step": 1523 + }, + { + "epoch": 2.208695652173913, + "grad_norm": 0.26253557351427015, + "learning_rate": 1.4653784219001612e-05, + "loss": 0.3613, + "step": 1524 + }, + { + "epoch": 2.210144927536232, + "grad_norm": 0.26389875956191183, + "learning_rate": 1.4626945786366078e-05, + "loss": 0.3975, + "step": 1525 + }, + { + "epoch": 2.2115942028985507, + "grad_norm": 0.28084385613675417, + "learning_rate": 1.4600107353730544e-05, + "loss": 0.3476, + "step": 1526 + }, + { + "epoch": 2.2130434782608694, + "grad_norm": 0.2646052280590228, + "learning_rate": 1.457326892109501e-05, + "loss": 0.3714, + "step": 1527 + }, + { + "epoch": 2.214492753623188, + "grad_norm": 0.2645379652903494, + "learning_rate": 1.4546430488459476e-05, + "loss": 0.3934, + "step": 1528 + }, + { + "epoch": 2.2159420289855074, + "grad_norm": 0.2666041230309001, + "learning_rate": 1.451959205582394e-05, + "loss": 0.4162, + "step": 1529 + }, + { + "epoch": 2.217391304347826, + "grad_norm": 0.25393841436995956, + "learning_rate": 1.4492753623188407e-05, + "loss": 0.3521, + "step": 1530 + }, + { + "epoch": 2.218840579710145, + "grad_norm": 2.5834333622726957, + "learning_rate": 1.4465915190552873e-05, + "loss": 0.3571, + "step": 1531 + }, + { + "epoch": 2.2202898550724637, + "grad_norm": 0.25785009466800646, + "learning_rate": 1.4439076757917339e-05, + "loss": 0.4149, + "step": 1532 + }, + { + "epoch": 2.2217391304347824, + "grad_norm": 0.24957707518993402, + "learning_rate": 1.4412238325281805e-05, + "loss": 0.374, + "step": 1533 + }, + { + "epoch": 2.2231884057971016, + "grad_norm": 0.23262952045687874, + "learning_rate": 1.438539989264627e-05, + "loss": 0.3882, + "step": 1534 + }, + { + "epoch": 2.2246376811594204, + "grad_norm": 0.25960049374696026, + "learning_rate": 1.4358561460010736e-05, + "loss": 0.3939, + "step": 1535 + }, + { + "epoch": 2.226086956521739, + "grad_norm": 0.2491894668049094, + "learning_rate": 1.4331723027375202e-05, + "loss": 0.3577, + "step": 1536 + }, + { + "epoch": 2.227536231884058, + "grad_norm": 0.25138391003811594, + "learning_rate": 1.4304884594739668e-05, + "loss": 0.3752, + "step": 1537 + }, + { + "epoch": 2.2289855072463767, + "grad_norm": 0.214850791994487, + "learning_rate": 1.4278046162104134e-05, + "loss": 0.3217, + "step": 1538 + }, + { + "epoch": 2.230434782608696, + "grad_norm": 0.26161881685821325, + "learning_rate": 1.4251207729468599e-05, + "loss": 0.3867, + "step": 1539 + }, + { + "epoch": 2.2318840579710146, + "grad_norm": 0.28080848285012555, + "learning_rate": 1.4224369296833065e-05, + "loss": 0.3626, + "step": 1540 + }, + { + "epoch": 2.2333333333333334, + "grad_norm": 0.25475653356537886, + "learning_rate": 1.419753086419753e-05, + "loss": 0.3469, + "step": 1541 + }, + { + "epoch": 2.234782608695652, + "grad_norm": 0.3712708204701495, + "learning_rate": 1.4170692431561997e-05, + "loss": 0.3743, + "step": 1542 + }, + { + "epoch": 2.236231884057971, + "grad_norm": 0.8361471019434635, + "learning_rate": 1.4143853998926463e-05, + "loss": 0.3946, + "step": 1543 + }, + { + "epoch": 2.2376811594202897, + "grad_norm": 0.31565988365410325, + "learning_rate": 1.4117015566290927e-05, + "loss": 0.3839, + "step": 1544 + }, + { + "epoch": 2.239130434782609, + "grad_norm": 0.3180964477239743, + "learning_rate": 1.4090177133655394e-05, + "loss": 0.3998, + "step": 1545 + }, + { + "epoch": 2.2405797101449276, + "grad_norm": 0.2595863556849321, + "learning_rate": 1.406333870101986e-05, + "loss": 0.3842, + "step": 1546 + }, + { + "epoch": 2.2420289855072464, + "grad_norm": 0.2300656737294017, + "learning_rate": 1.4036500268384328e-05, + "loss": 0.3791, + "step": 1547 + }, + { + "epoch": 2.243478260869565, + "grad_norm": 0.26630741953742765, + "learning_rate": 1.4009661835748794e-05, + "loss": 0.3565, + "step": 1548 + }, + { + "epoch": 2.244927536231884, + "grad_norm": 0.3449002614397516, + "learning_rate": 1.398282340311326e-05, + "loss": 0.356, + "step": 1549 + }, + { + "epoch": 2.246376811594203, + "grad_norm": 0.278493690085927, + "learning_rate": 1.3955984970477726e-05, + "loss": 0.3849, + "step": 1550 + }, + { + "epoch": 2.247826086956522, + "grad_norm": 0.30149468853130207, + "learning_rate": 1.3929146537842192e-05, + "loss": 0.4505, + "step": 1551 + }, + { + "epoch": 2.2492753623188406, + "grad_norm": 0.27048184170368406, + "learning_rate": 1.3902308105206656e-05, + "loss": 0.3985, + "step": 1552 + }, + { + "epoch": 2.2507246376811594, + "grad_norm": 0.2913064801887286, + "learning_rate": 1.3875469672571123e-05, + "loss": 0.3717, + "step": 1553 + }, + { + "epoch": 2.252173913043478, + "grad_norm": 0.2730993886762121, + "learning_rate": 1.3848631239935589e-05, + "loss": 0.3484, + "step": 1554 + }, + { + "epoch": 2.253623188405797, + "grad_norm": 0.29507804335844673, + "learning_rate": 1.3821792807300055e-05, + "loss": 0.3219, + "step": 1555 + }, + { + "epoch": 2.255072463768116, + "grad_norm": 0.28418547043948034, + "learning_rate": 1.3794954374664521e-05, + "loss": 0.3412, + "step": 1556 + }, + { + "epoch": 2.256521739130435, + "grad_norm": 0.23898122611394282, + "learning_rate": 1.3768115942028985e-05, + "loss": 0.3825, + "step": 1557 + }, + { + "epoch": 2.2579710144927536, + "grad_norm": 0.2966037675150378, + "learning_rate": 1.3741277509393452e-05, + "loss": 0.3391, + "step": 1558 + }, + { + "epoch": 2.2594202898550724, + "grad_norm": 0.336848326235953, + "learning_rate": 1.3714439076757918e-05, + "loss": 0.395, + "step": 1559 + }, + { + "epoch": 2.260869565217391, + "grad_norm": 0.2722507559170827, + "learning_rate": 1.3687600644122384e-05, + "loss": 0.3558, + "step": 1560 + }, + { + "epoch": 2.2623188405797103, + "grad_norm": 0.2668793089188747, + "learning_rate": 1.366076221148685e-05, + "loss": 0.3325, + "step": 1561 + }, + { + "epoch": 2.263768115942029, + "grad_norm": 0.31039322433546784, + "learning_rate": 1.3633923778851316e-05, + "loss": 0.3879, + "step": 1562 + }, + { + "epoch": 2.265217391304348, + "grad_norm": 0.288457375992248, + "learning_rate": 1.360708534621578e-05, + "loss": 0.3928, + "step": 1563 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 0.2735574240926222, + "learning_rate": 1.3580246913580247e-05, + "loss": 0.3802, + "step": 1564 + }, + { + "epoch": 2.2681159420289854, + "grad_norm": 0.26280010543390336, + "learning_rate": 1.3553408480944713e-05, + "loss": 0.3504, + "step": 1565 + }, + { + "epoch": 2.269565217391304, + "grad_norm": 0.3080823982213214, + "learning_rate": 1.3526570048309179e-05, + "loss": 0.3772, + "step": 1566 + }, + { + "epoch": 2.2710144927536233, + "grad_norm": 0.277292211551906, + "learning_rate": 1.3499731615673645e-05, + "loss": 0.4081, + "step": 1567 + }, + { + "epoch": 2.272463768115942, + "grad_norm": 0.26869130104232364, + "learning_rate": 1.347289318303811e-05, + "loss": 0.3448, + "step": 1568 + }, + { + "epoch": 2.273913043478261, + "grad_norm": 0.27781485973024744, + "learning_rate": 1.3446054750402576e-05, + "loss": 0.3928, + "step": 1569 + }, + { + "epoch": 2.2753623188405796, + "grad_norm": 0.24601016996456326, + "learning_rate": 1.3419216317767045e-05, + "loss": 0.3919, + "step": 1570 + }, + { + "epoch": 2.2768115942028984, + "grad_norm": 0.2442491835619423, + "learning_rate": 1.339237788513151e-05, + "loss": 0.3681, + "step": 1571 + }, + { + "epoch": 2.2782608695652176, + "grad_norm": 0.29939513174678234, + "learning_rate": 1.3365539452495976e-05, + "loss": 0.4182, + "step": 1572 + }, + { + "epoch": 2.2797101449275363, + "grad_norm": 0.2379937617351913, + "learning_rate": 1.3338701019860442e-05, + "loss": 0.3469, + "step": 1573 + }, + { + "epoch": 2.281159420289855, + "grad_norm": 0.24206655548265582, + "learning_rate": 1.3311862587224908e-05, + "loss": 0.3823, + "step": 1574 + }, + { + "epoch": 2.282608695652174, + "grad_norm": 0.25153264548901644, + "learning_rate": 1.3285024154589374e-05, + "loss": 0.3633, + "step": 1575 + }, + { + "epoch": 2.2840579710144926, + "grad_norm": 0.24838756048823832, + "learning_rate": 1.3258185721953838e-05, + "loss": 0.3752, + "step": 1576 + }, + { + "epoch": 2.2855072463768114, + "grad_norm": 0.24475452285064764, + "learning_rate": 1.3231347289318305e-05, + "loss": 0.3748, + "step": 1577 + }, + { + "epoch": 2.2869565217391306, + "grad_norm": 0.26345551133178485, + "learning_rate": 1.320450885668277e-05, + "loss": 0.347, + "step": 1578 + }, + { + "epoch": 2.2884057971014493, + "grad_norm": 0.24643703206359371, + "learning_rate": 1.3177670424047237e-05, + "loss": 0.3933, + "step": 1579 + }, + { + "epoch": 2.289855072463768, + "grad_norm": 0.2904257571389863, + "learning_rate": 1.3150831991411703e-05, + "loss": 0.3985, + "step": 1580 + }, + { + "epoch": 2.291304347826087, + "grad_norm": 0.23888677788284127, + "learning_rate": 1.3123993558776167e-05, + "loss": 0.3486, + "step": 1581 + }, + { + "epoch": 2.292753623188406, + "grad_norm": 0.25676575141902436, + "learning_rate": 1.3097155126140634e-05, + "loss": 0.3529, + "step": 1582 + }, + { + "epoch": 2.294202898550725, + "grad_norm": 0.2290381327720587, + "learning_rate": 1.30703166935051e-05, + "loss": 0.3446, + "step": 1583 + }, + { + "epoch": 2.2956521739130435, + "grad_norm": 0.2673558483575196, + "learning_rate": 1.3043478260869566e-05, + "loss": 0.4187, + "step": 1584 + }, + { + "epoch": 2.2971014492753623, + "grad_norm": 0.2619657872949245, + "learning_rate": 1.3016639828234032e-05, + "loss": 0.4018, + "step": 1585 + }, + { + "epoch": 2.298550724637681, + "grad_norm": 0.2471540318096013, + "learning_rate": 1.2989801395598496e-05, + "loss": 0.3582, + "step": 1586 + }, + { + "epoch": 2.3, + "grad_norm": 0.24938930140778784, + "learning_rate": 1.2962962962962962e-05, + "loss": 0.3113, + "step": 1587 + }, + { + "epoch": 2.301449275362319, + "grad_norm": 0.226686670452519, + "learning_rate": 1.2936124530327429e-05, + "loss": 0.3655, + "step": 1588 + }, + { + "epoch": 2.302898550724638, + "grad_norm": 0.2448061300560438, + "learning_rate": 1.2909286097691895e-05, + "loss": 0.3634, + "step": 1589 + }, + { + "epoch": 2.3043478260869565, + "grad_norm": 0.23174962185631323, + "learning_rate": 1.288244766505636e-05, + "loss": 0.3644, + "step": 1590 + }, + { + "epoch": 2.3057971014492753, + "grad_norm": 0.26870532656946267, + "learning_rate": 1.2855609232420825e-05, + "loss": 0.3919, + "step": 1591 + }, + { + "epoch": 2.307246376811594, + "grad_norm": 0.295247629024476, + "learning_rate": 1.2828770799785291e-05, + "loss": 0.3849, + "step": 1592 + }, + { + "epoch": 2.3086956521739133, + "grad_norm": 0.25583435866670345, + "learning_rate": 1.2801932367149761e-05, + "loss": 0.3998, + "step": 1593 + }, + { + "epoch": 2.310144927536232, + "grad_norm": 0.25026084065358445, + "learning_rate": 1.2775093934514227e-05, + "loss": 0.3571, + "step": 1594 + }, + { + "epoch": 2.3115942028985508, + "grad_norm": 0.2942700249747664, + "learning_rate": 1.2748255501878691e-05, + "loss": 0.3749, + "step": 1595 + }, + { + "epoch": 2.3130434782608695, + "grad_norm": 0.248980629991386, + "learning_rate": 1.2721417069243158e-05, + "loss": 0.384, + "step": 1596 + }, + { + "epoch": 2.3144927536231883, + "grad_norm": 0.2532058397112117, + "learning_rate": 1.2694578636607624e-05, + "loss": 0.3709, + "step": 1597 + }, + { + "epoch": 2.315942028985507, + "grad_norm": 0.26982970940029316, + "learning_rate": 1.266774020397209e-05, + "loss": 0.3548, + "step": 1598 + }, + { + "epoch": 2.3173913043478263, + "grad_norm": 0.255607765102706, + "learning_rate": 1.2640901771336556e-05, + "loss": 0.3663, + "step": 1599 + }, + { + "epoch": 2.318840579710145, + "grad_norm": 0.25522783943627514, + "learning_rate": 1.261406333870102e-05, + "loss": 0.373, + "step": 1600 + }, + { + "epoch": 2.3202898550724638, + "grad_norm": 0.23165148455281287, + "learning_rate": 1.2587224906065487e-05, + "loss": 0.3594, + "step": 1601 + }, + { + "epoch": 2.3217391304347825, + "grad_norm": 0.2315974386021932, + "learning_rate": 1.2560386473429953e-05, + "loss": 0.3194, + "step": 1602 + }, + { + "epoch": 2.3231884057971013, + "grad_norm": 0.26823600554364063, + "learning_rate": 1.2533548040794419e-05, + "loss": 0.4055, + "step": 1603 + }, + { + "epoch": 2.3246376811594205, + "grad_norm": 0.2769963244395018, + "learning_rate": 1.2506709608158885e-05, + "loss": 0.3917, + "step": 1604 + }, + { + "epoch": 2.3260869565217392, + "grad_norm": 0.2528968534522811, + "learning_rate": 1.247987117552335e-05, + "loss": 0.3419, + "step": 1605 + }, + { + "epoch": 2.327536231884058, + "grad_norm": 0.25160922497130767, + "learning_rate": 1.2453032742887815e-05, + "loss": 0.3872, + "step": 1606 + }, + { + "epoch": 2.3289855072463768, + "grad_norm": 0.247492946747638, + "learning_rate": 1.2426194310252282e-05, + "loss": 0.3221, + "step": 1607 + }, + { + "epoch": 2.3304347826086955, + "grad_norm": 0.24529622457309413, + "learning_rate": 1.2399355877616748e-05, + "loss": 0.3213, + "step": 1608 + }, + { + "epoch": 2.3318840579710143, + "grad_norm": 0.2530193260025237, + "learning_rate": 1.2372517444981214e-05, + "loss": 0.3504, + "step": 1609 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.2697304808458077, + "learning_rate": 1.2345679012345678e-05, + "loss": 0.3604, + "step": 1610 + }, + { + "epoch": 2.3347826086956522, + "grad_norm": 0.25794714226841003, + "learning_rate": 1.2318840579710146e-05, + "loss": 0.3781, + "step": 1611 + }, + { + "epoch": 2.336231884057971, + "grad_norm": 0.27312415594607103, + "learning_rate": 1.2292002147074612e-05, + "loss": 0.4151, + "step": 1612 + }, + { + "epoch": 2.3376811594202898, + "grad_norm": 0.22763034416379813, + "learning_rate": 1.2265163714439078e-05, + "loss": 0.373, + "step": 1613 + }, + { + "epoch": 2.3391304347826085, + "grad_norm": 0.24147613295919393, + "learning_rate": 1.2238325281803543e-05, + "loss": 0.3416, + "step": 1614 + }, + { + "epoch": 2.3405797101449277, + "grad_norm": 0.24221281514548484, + "learning_rate": 1.2211486849168009e-05, + "loss": 0.347, + "step": 1615 + }, + { + "epoch": 2.3420289855072465, + "grad_norm": 0.27036392653264024, + "learning_rate": 1.2184648416532475e-05, + "loss": 0.3722, + "step": 1616 + }, + { + "epoch": 2.3434782608695652, + "grad_norm": 0.28875833209074553, + "learning_rate": 1.2157809983896941e-05, + "loss": 0.4155, + "step": 1617 + }, + { + "epoch": 2.344927536231884, + "grad_norm": 0.2473071236867308, + "learning_rate": 1.2130971551261407e-05, + "loss": 0.3571, + "step": 1618 + }, + { + "epoch": 2.3463768115942027, + "grad_norm": 0.25668025739386363, + "learning_rate": 1.2104133118625872e-05, + "loss": 0.4249, + "step": 1619 + }, + { + "epoch": 2.3478260869565215, + "grad_norm": 0.27416799604861636, + "learning_rate": 1.2077294685990338e-05, + "loss": 0.3987, + "step": 1620 + }, + { + "epoch": 2.3492753623188407, + "grad_norm": 0.2365647100348582, + "learning_rate": 1.2050456253354804e-05, + "loss": 0.3634, + "step": 1621 + }, + { + "epoch": 2.3507246376811595, + "grad_norm": 0.27636558798058686, + "learning_rate": 1.2023617820719272e-05, + "loss": 0.4219, + "step": 1622 + }, + { + "epoch": 2.3521739130434782, + "grad_norm": 0.24753399152318645, + "learning_rate": 1.1996779388083736e-05, + "loss": 0.378, + "step": 1623 + }, + { + "epoch": 2.353623188405797, + "grad_norm": 0.2974985218029262, + "learning_rate": 1.1969940955448202e-05, + "loss": 0.3691, + "step": 1624 + }, + { + "epoch": 2.355072463768116, + "grad_norm": 0.2400268147449765, + "learning_rate": 1.1943102522812668e-05, + "loss": 0.3604, + "step": 1625 + }, + { + "epoch": 2.356521739130435, + "grad_norm": 0.26089767720269386, + "learning_rate": 1.1916264090177135e-05, + "loss": 0.399, + "step": 1626 + }, + { + "epoch": 2.3579710144927537, + "grad_norm": 0.23960649203229525, + "learning_rate": 1.18894256575416e-05, + "loss": 0.4135, + "step": 1627 + }, + { + "epoch": 2.3594202898550725, + "grad_norm": 0.2564677593387533, + "learning_rate": 1.1862587224906065e-05, + "loss": 0.3766, + "step": 1628 + }, + { + "epoch": 2.360869565217391, + "grad_norm": 0.23217865131785306, + "learning_rate": 1.1835748792270531e-05, + "loss": 0.3706, + "step": 1629 + }, + { + "epoch": 2.36231884057971, + "grad_norm": 0.253409443424873, + "learning_rate": 1.1808910359634997e-05, + "loss": 0.3507, + "step": 1630 + }, + { + "epoch": 2.3637681159420287, + "grad_norm": 0.2585803797976821, + "learning_rate": 1.1782071926999464e-05, + "loss": 0.341, + "step": 1631 + }, + { + "epoch": 2.365217391304348, + "grad_norm": 0.2343633560766101, + "learning_rate": 1.175523349436393e-05, + "loss": 0.3835, + "step": 1632 + }, + { + "epoch": 2.3666666666666667, + "grad_norm": 0.22943771254165923, + "learning_rate": 1.1728395061728396e-05, + "loss": 0.3369, + "step": 1633 + }, + { + "epoch": 2.3681159420289855, + "grad_norm": 0.2520076354267771, + "learning_rate": 1.1701556629092862e-05, + "loss": 0.3721, + "step": 1634 + }, + { + "epoch": 2.369565217391304, + "grad_norm": 0.23350354169235513, + "learning_rate": 1.1674718196457328e-05, + "loss": 0.38, + "step": 1635 + }, + { + "epoch": 2.3710144927536234, + "grad_norm": 0.24654601299800227, + "learning_rate": 1.1647879763821794e-05, + "loss": 0.4016, + "step": 1636 + }, + { + "epoch": 2.372463768115942, + "grad_norm": 0.23800732022778495, + "learning_rate": 1.162104133118626e-05, + "loss": 0.3526, + "step": 1637 + }, + { + "epoch": 2.373913043478261, + "grad_norm": 0.24450160756693554, + "learning_rate": 1.1594202898550725e-05, + "loss": 0.4154, + "step": 1638 + }, + { + "epoch": 2.3753623188405797, + "grad_norm": 0.26047505838335094, + "learning_rate": 1.1567364465915191e-05, + "loss": 0.3873, + "step": 1639 + }, + { + "epoch": 2.3768115942028984, + "grad_norm": 0.35237406523845116, + "learning_rate": 1.1540526033279657e-05, + "loss": 0.3573, + "step": 1640 + }, + { + "epoch": 2.378260869565217, + "grad_norm": 0.2464247686725185, + "learning_rate": 1.1513687600644123e-05, + "loss": 0.3513, + "step": 1641 + }, + { + "epoch": 2.3797101449275364, + "grad_norm": 0.24305252893817997, + "learning_rate": 1.148684916800859e-05, + "loss": 0.3515, + "step": 1642 + }, + { + "epoch": 2.381159420289855, + "grad_norm": 0.24473955308504647, + "learning_rate": 1.1460010735373054e-05, + "loss": 0.4069, + "step": 1643 + }, + { + "epoch": 2.382608695652174, + "grad_norm": 0.22287472790277094, + "learning_rate": 1.143317230273752e-05, + "loss": 0.3401, + "step": 1644 + }, + { + "epoch": 2.3840579710144927, + "grad_norm": 0.23881686125956375, + "learning_rate": 1.1406333870101986e-05, + "loss": 0.4016, + "step": 1645 + }, + { + "epoch": 2.3855072463768114, + "grad_norm": 0.23327446919723854, + "learning_rate": 1.1379495437466454e-05, + "loss": 0.3556, + "step": 1646 + }, + { + "epoch": 2.3869565217391306, + "grad_norm": 0.24396771678051765, + "learning_rate": 1.1352657004830918e-05, + "loss": 0.3918, + "step": 1647 + }, + { + "epoch": 2.3884057971014494, + "grad_norm": 0.24450824467838372, + "learning_rate": 1.1325818572195384e-05, + "loss": 0.3605, + "step": 1648 + }, + { + "epoch": 2.389855072463768, + "grad_norm": 0.27919448053191337, + "learning_rate": 1.129898013955985e-05, + "loss": 0.3786, + "step": 1649 + }, + { + "epoch": 2.391304347826087, + "grad_norm": 0.261991121797679, + "learning_rate": 1.1272141706924317e-05, + "loss": 0.3696, + "step": 1650 + }, + { + "epoch": 2.3927536231884057, + "grad_norm": 0.21085700795205384, + "learning_rate": 1.1245303274288783e-05, + "loss": 0.3033, + "step": 1651 + }, + { + "epoch": 2.3942028985507244, + "grad_norm": 0.22978194186144127, + "learning_rate": 1.1218464841653247e-05, + "loss": 0.3714, + "step": 1652 + }, + { + "epoch": 2.3956521739130436, + "grad_norm": 0.25088071978596216, + "learning_rate": 1.1191626409017713e-05, + "loss": 0.3195, + "step": 1653 + }, + { + "epoch": 2.3971014492753624, + "grad_norm": 0.2381310397831137, + "learning_rate": 1.116478797638218e-05, + "loss": 0.3404, + "step": 1654 + }, + { + "epoch": 2.398550724637681, + "grad_norm": 0.24188352968749965, + "learning_rate": 1.1137949543746646e-05, + "loss": 0.3577, + "step": 1655 + }, + { + "epoch": 2.4, + "grad_norm": 0.2680659121690275, + "learning_rate": 1.1111111111111112e-05, + "loss": 0.3738, + "step": 1656 + }, + { + "epoch": 2.4014492753623187, + "grad_norm": 0.24947078992123775, + "learning_rate": 1.1084272678475578e-05, + "loss": 0.3996, + "step": 1657 + }, + { + "epoch": 2.402898550724638, + "grad_norm": 0.26438557516423244, + "learning_rate": 1.1057434245840044e-05, + "loss": 0.3918, + "step": 1658 + }, + { + "epoch": 2.4043478260869566, + "grad_norm": 0.2459562582649405, + "learning_rate": 1.103059581320451e-05, + "loss": 0.3514, + "step": 1659 + }, + { + "epoch": 2.4057971014492754, + "grad_norm": 0.2614107116304787, + "learning_rate": 1.1003757380568976e-05, + "loss": 0.3452, + "step": 1660 + }, + { + "epoch": 2.407246376811594, + "grad_norm": 0.26588141942238935, + "learning_rate": 1.097691894793344e-05, + "loss": 0.3893, + "step": 1661 + }, + { + "epoch": 2.408695652173913, + "grad_norm": 0.2561460931983173, + "learning_rate": 1.0950080515297907e-05, + "loss": 0.3612, + "step": 1662 + }, + { + "epoch": 2.4101449275362317, + "grad_norm": 0.26203192216579646, + "learning_rate": 1.0923242082662373e-05, + "loss": 0.3527, + "step": 1663 + }, + { + "epoch": 2.411594202898551, + "grad_norm": 0.2801535164029354, + "learning_rate": 1.0896403650026839e-05, + "loss": 0.3742, + "step": 1664 + }, + { + "epoch": 2.4130434782608696, + "grad_norm": 0.28338654825522025, + "learning_rate": 1.0869565217391305e-05, + "loss": 0.4081, + "step": 1665 + }, + { + "epoch": 2.4144927536231884, + "grad_norm": 0.22440848405306443, + "learning_rate": 1.084272678475577e-05, + "loss": 0.3166, + "step": 1666 + }, + { + "epoch": 2.415942028985507, + "grad_norm": 0.23489668860066648, + "learning_rate": 1.0815888352120236e-05, + "loss": 0.3186, + "step": 1667 + }, + { + "epoch": 2.417391304347826, + "grad_norm": 0.2386299301125299, + "learning_rate": 1.0789049919484702e-05, + "loss": 0.3942, + "step": 1668 + }, + { + "epoch": 2.418840579710145, + "grad_norm": 0.23436076505372425, + "learning_rate": 1.076221148684917e-05, + "loss": 0.3699, + "step": 1669 + }, + { + "epoch": 2.420289855072464, + "grad_norm": 0.22909140214577542, + "learning_rate": 1.0735373054213636e-05, + "loss": 0.3476, + "step": 1670 + }, + { + "epoch": 2.4217391304347826, + "grad_norm": 0.2634134796188035, + "learning_rate": 1.07085346215781e-05, + "loss": 0.349, + "step": 1671 + }, + { + "epoch": 2.4231884057971014, + "grad_norm": 0.2889603805151576, + "learning_rate": 1.0681696188942566e-05, + "loss": 0.3477, + "step": 1672 + }, + { + "epoch": 2.42463768115942, + "grad_norm": 0.22004010840649466, + "learning_rate": 1.0654857756307032e-05, + "loss": 0.3377, + "step": 1673 + }, + { + "epoch": 2.426086956521739, + "grad_norm": 0.22623503233422046, + "learning_rate": 1.0628019323671499e-05, + "loss": 0.343, + "step": 1674 + }, + { + "epoch": 2.427536231884058, + "grad_norm": 0.23996984275551328, + "learning_rate": 1.0601180891035965e-05, + "loss": 0.3623, + "step": 1675 + }, + { + "epoch": 2.428985507246377, + "grad_norm": 0.25513314558224837, + "learning_rate": 1.0574342458400429e-05, + "loss": 0.4259, + "step": 1676 + }, + { + "epoch": 2.4304347826086956, + "grad_norm": 0.2265078118915947, + "learning_rate": 1.0547504025764895e-05, + "loss": 0.3691, + "step": 1677 + }, + { + "epoch": 2.4318840579710144, + "grad_norm": 0.23586487503109818, + "learning_rate": 1.0520665593129361e-05, + "loss": 0.3442, + "step": 1678 + }, + { + "epoch": 2.4333333333333336, + "grad_norm": 0.24515098029968993, + "learning_rate": 1.0493827160493827e-05, + "loss": 0.3572, + "step": 1679 + }, + { + "epoch": 2.4347826086956523, + "grad_norm": 0.24521992528057063, + "learning_rate": 1.0466988727858294e-05, + "loss": 0.3703, + "step": 1680 + }, + { + "epoch": 2.436231884057971, + "grad_norm": 0.2356508765619123, + "learning_rate": 1.044015029522276e-05, + "loss": 0.3969, + "step": 1681 + }, + { + "epoch": 2.43768115942029, + "grad_norm": 0.25389786816355275, + "learning_rate": 1.0413311862587226e-05, + "loss": 0.3975, + "step": 1682 + }, + { + "epoch": 2.4391304347826086, + "grad_norm": 0.26355397779718365, + "learning_rate": 1.0386473429951692e-05, + "loss": 0.3739, + "step": 1683 + }, + { + "epoch": 2.4405797101449274, + "grad_norm": 0.2551779484837169, + "learning_rate": 1.0359634997316158e-05, + "loss": 0.3885, + "step": 1684 + }, + { + "epoch": 2.4420289855072466, + "grad_norm": 0.23631562097115857, + "learning_rate": 1.0332796564680623e-05, + "loss": 0.3855, + "step": 1685 + }, + { + "epoch": 2.4434782608695653, + "grad_norm": 0.23618850704064212, + "learning_rate": 1.0305958132045089e-05, + "loss": 0.4032, + "step": 1686 + }, + { + "epoch": 2.444927536231884, + "grad_norm": 0.24259183480511126, + "learning_rate": 1.0279119699409555e-05, + "loss": 0.3573, + "step": 1687 + }, + { + "epoch": 2.446376811594203, + "grad_norm": 0.24202233121882677, + "learning_rate": 1.0252281266774021e-05, + "loss": 0.3871, + "step": 1688 + }, + { + "epoch": 2.4478260869565216, + "grad_norm": 0.23825262433919453, + "learning_rate": 1.0225442834138487e-05, + "loss": 0.3619, + "step": 1689 + }, + { + "epoch": 2.449275362318841, + "grad_norm": 0.5378715452062908, + "learning_rate": 1.0198604401502951e-05, + "loss": 0.417, + "step": 1690 + }, + { + "epoch": 2.4507246376811596, + "grad_norm": 0.23590631249131455, + "learning_rate": 1.0171765968867418e-05, + "loss": 0.3728, + "step": 1691 + }, + { + "epoch": 2.4521739130434783, + "grad_norm": 0.22704366611632165, + "learning_rate": 1.0144927536231885e-05, + "loss": 0.3584, + "step": 1692 + }, + { + "epoch": 2.453623188405797, + "grad_norm": 0.24678300364706693, + "learning_rate": 1.0118089103596352e-05, + "loss": 0.3648, + "step": 1693 + }, + { + "epoch": 2.455072463768116, + "grad_norm": 0.24623958958542122, + "learning_rate": 1.0091250670960816e-05, + "loss": 0.3672, + "step": 1694 + }, + { + "epoch": 2.4565217391304346, + "grad_norm": 0.2548241126895578, + "learning_rate": 1.0064412238325282e-05, + "loss": 0.3891, + "step": 1695 + }, + { + "epoch": 2.457971014492754, + "grad_norm": 0.2307626058795168, + "learning_rate": 1.0037573805689748e-05, + "loss": 0.3588, + "step": 1696 + }, + { + "epoch": 2.4594202898550726, + "grad_norm": 0.22542946104824368, + "learning_rate": 1.0010735373054214e-05, + "loss": 0.3975, + "step": 1697 + }, + { + "epoch": 2.4608695652173913, + "grad_norm": 0.24207984238673924, + "learning_rate": 9.98389694041868e-06, + "loss": 0.3916, + "step": 1698 + }, + { + "epoch": 2.46231884057971, + "grad_norm": 0.27096598935315286, + "learning_rate": 9.957058507783145e-06, + "loss": 0.3776, + "step": 1699 + }, + { + "epoch": 2.463768115942029, + "grad_norm": 0.22850501591889746, + "learning_rate": 9.930220075147611e-06, + "loss": 0.3576, + "step": 1700 + }, + { + "epoch": 2.465217391304348, + "grad_norm": 0.22631443374823518, + "learning_rate": 9.903381642512077e-06, + "loss": 0.3618, + "step": 1701 + }, + { + "epoch": 2.466666666666667, + "grad_norm": 0.2321696076731389, + "learning_rate": 9.876543209876543e-06, + "loss": 0.3665, + "step": 1702 + }, + { + "epoch": 2.4681159420289855, + "grad_norm": 0.24687628048049745, + "learning_rate": 9.84970477724101e-06, + "loss": 0.3586, + "step": 1703 + }, + { + "epoch": 2.4695652173913043, + "grad_norm": 0.24373847387295003, + "learning_rate": 9.822866344605476e-06, + "loss": 0.3652, + "step": 1704 + }, + { + "epoch": 2.471014492753623, + "grad_norm": 2.8444772494805672, + "learning_rate": 9.796027911969942e-06, + "loss": 0.4705, + "step": 1705 + }, + { + "epoch": 2.472463768115942, + "grad_norm": 0.2930409317815382, + "learning_rate": 9.769189479334408e-06, + "loss": 0.3877, + "step": 1706 + }, + { + "epoch": 2.473913043478261, + "grad_norm": 0.2601267088028184, + "learning_rate": 9.742351046698874e-06, + "loss": 0.3644, + "step": 1707 + }, + { + "epoch": 2.47536231884058, + "grad_norm": 0.2487197364866856, + "learning_rate": 9.71551261406334e-06, + "loss": 0.4045, + "step": 1708 + }, + { + "epoch": 2.4768115942028985, + "grad_norm": 0.2374071864441594, + "learning_rate": 9.688674181427804e-06, + "loss": 0.3568, + "step": 1709 + }, + { + "epoch": 2.4782608695652173, + "grad_norm": 0.24310394382631415, + "learning_rate": 9.66183574879227e-06, + "loss": 0.3498, + "step": 1710 + }, + { + "epoch": 2.479710144927536, + "grad_norm": 0.23519481570010747, + "learning_rate": 9.634997316156737e-06, + "loss": 0.3985, + "step": 1711 + }, + { + "epoch": 2.4811594202898553, + "grad_norm": 0.24301143269826625, + "learning_rate": 9.608158883521203e-06, + "loss": 0.3962, + "step": 1712 + }, + { + "epoch": 2.482608695652174, + "grad_norm": 0.23603059564584986, + "learning_rate": 9.581320450885669e-06, + "loss": 0.3232, + "step": 1713 + }, + { + "epoch": 2.4840579710144928, + "grad_norm": 0.30965155321248744, + "learning_rate": 9.554482018250133e-06, + "loss": 0.4068, + "step": 1714 + }, + { + "epoch": 2.4855072463768115, + "grad_norm": 0.23388751278509945, + "learning_rate": 9.527643585614601e-06, + "loss": 0.3458, + "step": 1715 + }, + { + "epoch": 2.4869565217391303, + "grad_norm": 0.2686097858868765, + "learning_rate": 9.500805152979067e-06, + "loss": 0.4051, + "step": 1716 + }, + { + "epoch": 2.488405797101449, + "grad_norm": 0.2718429545342115, + "learning_rate": 9.473966720343534e-06, + "loss": 0.4266, + "step": 1717 + }, + { + "epoch": 2.4898550724637682, + "grad_norm": 0.2382473738490544, + "learning_rate": 9.447128287707998e-06, + "loss": 0.4204, + "step": 1718 + }, + { + "epoch": 2.491304347826087, + "grad_norm": 0.23698376840914315, + "learning_rate": 9.420289855072464e-06, + "loss": 0.3631, + "step": 1719 + }, + { + "epoch": 2.4927536231884058, + "grad_norm": 0.23506995248352258, + "learning_rate": 9.39345142243693e-06, + "loss": 0.3271, + "step": 1720 + }, + { + "epoch": 2.4942028985507245, + "grad_norm": 0.2564784422890809, + "learning_rate": 9.366612989801396e-06, + "loss": 0.405, + "step": 1721 + }, + { + "epoch": 2.4956521739130437, + "grad_norm": 0.23915383613372546, + "learning_rate": 9.339774557165862e-06, + "loss": 0.3521, + "step": 1722 + }, + { + "epoch": 2.4971014492753625, + "grad_norm": 0.4073306961152648, + "learning_rate": 9.312936124530327e-06, + "loss": 0.3665, + "step": 1723 + }, + { + "epoch": 2.4985507246376812, + "grad_norm": 0.2285687119732312, + "learning_rate": 9.286097691894793e-06, + "loss": 0.3588, + "step": 1724 + }, + { + "epoch": 2.5, + "grad_norm": 0.2458977310163074, + "learning_rate": 9.259259259259259e-06, + "loss": 0.3362, + "step": 1725 + }, + { + "epoch": 2.5014492753623188, + "grad_norm": 0.26512727560130933, + "learning_rate": 9.232420826623725e-06, + "loss": 0.3928, + "step": 1726 + }, + { + "epoch": 2.5028985507246375, + "grad_norm": 0.2457255606384545, + "learning_rate": 9.205582393988191e-06, + "loss": 0.4004, + "step": 1727 + }, + { + "epoch": 2.5043478260869563, + "grad_norm": 0.25178128091977336, + "learning_rate": 9.178743961352658e-06, + "loss": 0.4259, + "step": 1728 + }, + { + "epoch": 2.5057971014492755, + "grad_norm": 0.24079198772295304, + "learning_rate": 9.151905528717124e-06, + "loss": 0.3565, + "step": 1729 + }, + { + "epoch": 2.5072463768115942, + "grad_norm": 0.2931582636874582, + "learning_rate": 9.12506709608159e-06, + "loss": 0.3875, + "step": 1730 + }, + { + "epoch": 2.508695652173913, + "grad_norm": 0.25170263590126585, + "learning_rate": 9.098228663446056e-06, + "loss": 0.3722, + "step": 1731 + }, + { + "epoch": 2.5101449275362318, + "grad_norm": 0.24220178096105427, + "learning_rate": 9.07139023081052e-06, + "loss": 0.3667, + "step": 1732 + }, + { + "epoch": 2.511594202898551, + "grad_norm": 0.22224946420231548, + "learning_rate": 9.044551798174986e-06, + "loss": 0.3191, + "step": 1733 + }, + { + "epoch": 2.5130434782608697, + "grad_norm": 0.24089894753896055, + "learning_rate": 9.017713365539453e-06, + "loss": 0.362, + "step": 1734 + }, + { + "epoch": 2.5144927536231885, + "grad_norm": 0.22841326431279493, + "learning_rate": 8.990874932903919e-06, + "loss": 0.3453, + "step": 1735 + }, + { + "epoch": 2.5159420289855072, + "grad_norm": 0.2393750097499759, + "learning_rate": 8.964036500268385e-06, + "loss": 0.3583, + "step": 1736 + }, + { + "epoch": 2.517391304347826, + "grad_norm": 0.26855067884257117, + "learning_rate": 8.93719806763285e-06, + "loss": 0.4091, + "step": 1737 + }, + { + "epoch": 2.5188405797101447, + "grad_norm": 0.23517858529776384, + "learning_rate": 8.910359634997315e-06, + "loss": 0.4249, + "step": 1738 + }, + { + "epoch": 2.5202898550724635, + "grad_norm": 0.2560093265834709, + "learning_rate": 8.883521202361783e-06, + "loss": 0.3909, + "step": 1739 + }, + { + "epoch": 2.5217391304347827, + "grad_norm": 0.24412506294757666, + "learning_rate": 8.85668276972625e-06, + "loss": 0.3974, + "step": 1740 + }, + { + "epoch": 2.5231884057971015, + "grad_norm": 0.24174238885908517, + "learning_rate": 8.829844337090715e-06, + "loss": 0.3788, + "step": 1741 + }, + { + "epoch": 2.52463768115942, + "grad_norm": 0.24203049886043884, + "learning_rate": 8.80300590445518e-06, + "loss": 0.3996, + "step": 1742 + }, + { + "epoch": 2.526086956521739, + "grad_norm": 0.2331818758188412, + "learning_rate": 8.776167471819646e-06, + "loss": 0.3727, + "step": 1743 + }, + { + "epoch": 2.527536231884058, + "grad_norm": 0.2259773165695989, + "learning_rate": 8.749329039184112e-06, + "loss": 0.337, + "step": 1744 + }, + { + "epoch": 2.528985507246377, + "grad_norm": 0.21970295461037015, + "learning_rate": 8.722490606548578e-06, + "loss": 0.3076, + "step": 1745 + }, + { + "epoch": 2.5304347826086957, + "grad_norm": 0.240548024122901, + "learning_rate": 8.695652173913044e-06, + "loss": 0.3617, + "step": 1746 + }, + { + "epoch": 2.5318840579710145, + "grad_norm": 0.22471292230572035, + "learning_rate": 8.668813741277509e-06, + "loss": 0.3845, + "step": 1747 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 0.2324219627223449, + "learning_rate": 8.641975308641975e-06, + "loss": 0.3577, + "step": 1748 + }, + { + "epoch": 2.534782608695652, + "grad_norm": 0.2517119814575534, + "learning_rate": 8.615136876006441e-06, + "loss": 0.3838, + "step": 1749 + }, + { + "epoch": 2.536231884057971, + "grad_norm": 0.23162613939840387, + "learning_rate": 8.588298443370909e-06, + "loss": 0.3807, + "step": 1750 + }, + { + "epoch": 2.53768115942029, + "grad_norm": 0.23210374731896627, + "learning_rate": 8.561460010735373e-06, + "loss": 0.4135, + "step": 1751 + }, + { + "epoch": 2.5391304347826087, + "grad_norm": 0.2506407668081656, + "learning_rate": 8.53462157809984e-06, + "loss": 0.3798, + "step": 1752 + }, + { + "epoch": 2.5405797101449274, + "grad_norm": 0.24427314286408663, + "learning_rate": 8.507783145464306e-06, + "loss": 0.3534, + "step": 1753 + }, + { + "epoch": 2.5420289855072467, + "grad_norm": 0.2521195759976504, + "learning_rate": 8.480944712828772e-06, + "loss": 0.4297, + "step": 1754 + }, + { + "epoch": 2.5434782608695654, + "grad_norm": 0.22423334849213983, + "learning_rate": 8.454106280193238e-06, + "loss": 0.3649, + "step": 1755 + }, + { + "epoch": 2.544927536231884, + "grad_norm": 0.22454826458337243, + "learning_rate": 8.427267847557702e-06, + "loss": 0.3854, + "step": 1756 + }, + { + "epoch": 2.546376811594203, + "grad_norm": 0.2495166610490925, + "learning_rate": 8.400429414922168e-06, + "loss": 0.3238, + "step": 1757 + }, + { + "epoch": 2.5478260869565217, + "grad_norm": 0.2598103637415767, + "learning_rate": 8.373590982286635e-06, + "loss": 0.424, + "step": 1758 + }, + { + "epoch": 2.5492753623188404, + "grad_norm": 0.2489444890547027, + "learning_rate": 8.3467525496511e-06, + "loss": 0.3986, + "step": 1759 + }, + { + "epoch": 2.550724637681159, + "grad_norm": 0.24194370005383292, + "learning_rate": 8.319914117015567e-06, + "loss": 0.3844, + "step": 1760 + }, + { + "epoch": 2.5521739130434784, + "grad_norm": 0.23640849731439492, + "learning_rate": 8.293075684380031e-06, + "loss": 0.3811, + "step": 1761 + }, + { + "epoch": 2.553623188405797, + "grad_norm": 0.2272161818033302, + "learning_rate": 8.266237251744499e-06, + "loss": 0.3691, + "step": 1762 + }, + { + "epoch": 2.555072463768116, + "grad_norm": 0.21753209322286424, + "learning_rate": 8.239398819108965e-06, + "loss": 0.3484, + "step": 1763 + }, + { + "epoch": 2.5565217391304347, + "grad_norm": 0.24397631738740422, + "learning_rate": 8.212560386473431e-06, + "loss": 0.37, + "step": 1764 + }, + { + "epoch": 2.557971014492754, + "grad_norm": 0.2280370924702514, + "learning_rate": 8.185721953837896e-06, + "loss": 0.3813, + "step": 1765 + }, + { + "epoch": 2.5594202898550726, + "grad_norm": 0.25304936700858394, + "learning_rate": 8.158883521202362e-06, + "loss": 0.3995, + "step": 1766 + }, + { + "epoch": 2.5608695652173914, + "grad_norm": 0.2490687010035602, + "learning_rate": 8.132045088566828e-06, + "loss": 0.411, + "step": 1767 + }, + { + "epoch": 2.56231884057971, + "grad_norm": 0.2513432328777938, + "learning_rate": 8.105206655931294e-06, + "loss": 0.4075, + "step": 1768 + }, + { + "epoch": 2.563768115942029, + "grad_norm": 0.23915876131137653, + "learning_rate": 8.07836822329576e-06, + "loss": 0.4069, + "step": 1769 + }, + { + "epoch": 2.5652173913043477, + "grad_norm": 0.23030860469836206, + "learning_rate": 8.051529790660225e-06, + "loss": 0.4023, + "step": 1770 + }, + { + "epoch": 2.5666666666666664, + "grad_norm": 0.22533124000674842, + "learning_rate": 8.02469135802469e-06, + "loss": 0.353, + "step": 1771 + }, + { + "epoch": 2.5681159420289856, + "grad_norm": 0.23594375320707797, + "learning_rate": 7.997852925389157e-06, + "loss": 0.347, + "step": 1772 + }, + { + "epoch": 2.5695652173913044, + "grad_norm": 0.23574195339153892, + "learning_rate": 7.971014492753623e-06, + "loss": 0.3703, + "step": 1773 + }, + { + "epoch": 2.571014492753623, + "grad_norm": 0.2471542224831895, + "learning_rate": 7.944176060118091e-06, + "loss": 0.3881, + "step": 1774 + }, + { + "epoch": 2.572463768115942, + "grad_norm": 0.237479615359418, + "learning_rate": 7.917337627482555e-06, + "loss": 0.3866, + "step": 1775 + }, + { + "epoch": 2.573913043478261, + "grad_norm": 0.2302231762999212, + "learning_rate": 7.890499194847021e-06, + "loss": 0.399, + "step": 1776 + }, + { + "epoch": 2.57536231884058, + "grad_norm": 0.3134114587170842, + "learning_rate": 7.863660762211488e-06, + "loss": 0.3493, + "step": 1777 + }, + { + "epoch": 2.5768115942028986, + "grad_norm": 0.2246109336895405, + "learning_rate": 7.836822329575954e-06, + "loss": 0.3601, + "step": 1778 + }, + { + "epoch": 2.5782608695652174, + "grad_norm": 0.26148405527459617, + "learning_rate": 7.80998389694042e-06, + "loss": 0.3813, + "step": 1779 + }, + { + "epoch": 2.579710144927536, + "grad_norm": 0.22015793799835642, + "learning_rate": 7.783145464304884e-06, + "loss": 0.3359, + "step": 1780 + }, + { + "epoch": 2.581159420289855, + "grad_norm": 0.22781683594318913, + "learning_rate": 7.75630703166935e-06, + "loss": 0.3932, + "step": 1781 + }, + { + "epoch": 2.5826086956521737, + "grad_norm": 0.22622094981865146, + "learning_rate": 7.729468599033817e-06, + "loss": 0.3422, + "step": 1782 + }, + { + "epoch": 2.584057971014493, + "grad_norm": 0.23289969908151067, + "learning_rate": 7.702630166398283e-06, + "loss": 0.3605, + "step": 1783 + }, + { + "epoch": 2.5855072463768116, + "grad_norm": 0.246003880930885, + "learning_rate": 7.675791733762749e-06, + "loss": 0.4108, + "step": 1784 + }, + { + "epoch": 2.5869565217391304, + "grad_norm": 0.22934975828663517, + "learning_rate": 7.648953301127215e-06, + "loss": 0.3613, + "step": 1785 + }, + { + "epoch": 2.588405797101449, + "grad_norm": 0.23776511365549477, + "learning_rate": 7.622114868491681e-06, + "loss": 0.3452, + "step": 1786 + }, + { + "epoch": 2.5898550724637683, + "grad_norm": 0.23173291377330582, + "learning_rate": 7.595276435856147e-06, + "loss": 0.3636, + "step": 1787 + }, + { + "epoch": 2.591304347826087, + "grad_norm": 0.22746789158031808, + "learning_rate": 7.568438003220612e-06, + "loss": 0.3738, + "step": 1788 + }, + { + "epoch": 2.592753623188406, + "grad_norm": 0.23726838497942668, + "learning_rate": 7.5415995705850785e-06, + "loss": 0.3537, + "step": 1789 + }, + { + "epoch": 2.5942028985507246, + "grad_norm": 0.2536496869289765, + "learning_rate": 7.514761137949544e-06, + "loss": 0.3823, + "step": 1790 + }, + { + "epoch": 2.5956521739130434, + "grad_norm": 0.2323632110547211, + "learning_rate": 7.48792270531401e-06, + "loss": 0.3398, + "step": 1791 + }, + { + "epoch": 2.597101449275362, + "grad_norm": 0.24434757356294867, + "learning_rate": 7.461084272678476e-06, + "loss": 0.3987, + "step": 1792 + }, + { + "epoch": 2.598550724637681, + "grad_norm": 0.23017559291290987, + "learning_rate": 7.434245840042941e-06, + "loss": 0.3727, + "step": 1793 + }, + { + "epoch": 2.6, + "grad_norm": 0.23150175814809001, + "learning_rate": 7.4074074074074075e-06, + "loss": 0.3801, + "step": 1794 + }, + { + "epoch": 2.601449275362319, + "grad_norm": 0.22181897932620395, + "learning_rate": 7.380568974771873e-06, + "loss": 0.3828, + "step": 1795 + }, + { + "epoch": 2.6028985507246376, + "grad_norm": 0.23263891313339105, + "learning_rate": 7.353730542136339e-06, + "loss": 0.3557, + "step": 1796 + }, + { + "epoch": 2.6043478260869564, + "grad_norm": 0.6529574844149014, + "learning_rate": 7.326892109500806e-06, + "loss": 0.3669, + "step": 1797 + }, + { + "epoch": 2.6057971014492756, + "grad_norm": 0.21973835286616067, + "learning_rate": 7.300053676865272e-06, + "loss": 0.3439, + "step": 1798 + }, + { + "epoch": 2.6072463768115943, + "grad_norm": 0.24012497988938392, + "learning_rate": 7.273215244229738e-06, + "loss": 0.3586, + "step": 1799 + }, + { + "epoch": 2.608695652173913, + "grad_norm": 0.21946520371623274, + "learning_rate": 7.246376811594203e-06, + "loss": 0.3417, + "step": 1800 + }, + { + "epoch": 2.610144927536232, + "grad_norm": 0.2093748367861486, + "learning_rate": 7.2195383789586695e-06, + "loss": 0.2987, + "step": 1801 + }, + { + "epoch": 2.6115942028985506, + "grad_norm": 0.23063885100070947, + "learning_rate": 7.192699946323135e-06, + "loss": 0.376, + "step": 1802 + }, + { + "epoch": 2.6130434782608694, + "grad_norm": 0.24566039962287953, + "learning_rate": 7.165861513687601e-06, + "loss": 0.3889, + "step": 1803 + }, + { + "epoch": 2.6144927536231886, + "grad_norm": 0.22630481844434078, + "learning_rate": 7.139023081052067e-06, + "loss": 0.3805, + "step": 1804 + }, + { + "epoch": 2.6159420289855073, + "grad_norm": 0.23513134414589493, + "learning_rate": 7.112184648416532e-06, + "loss": 0.3703, + "step": 1805 + }, + { + "epoch": 2.617391304347826, + "grad_norm": 0.258085032589009, + "learning_rate": 7.0853462157809985e-06, + "loss": 0.42, + "step": 1806 + }, + { + "epoch": 2.618840579710145, + "grad_norm": 0.24787040551614853, + "learning_rate": 7.058507783145464e-06, + "loss": 0.383, + "step": 1807 + }, + { + "epoch": 2.620289855072464, + "grad_norm": 0.23833700860460905, + "learning_rate": 7.03166935050993e-06, + "loss": 0.3724, + "step": 1808 + }, + { + "epoch": 2.621739130434783, + "grad_norm": 0.22142826203858182, + "learning_rate": 7.004830917874397e-06, + "loss": 0.3417, + "step": 1809 + }, + { + "epoch": 2.6231884057971016, + "grad_norm": 0.2207677353089534, + "learning_rate": 6.977992485238863e-06, + "loss": 0.3337, + "step": 1810 + }, + { + "epoch": 2.6246376811594203, + "grad_norm": 0.2411024299096652, + "learning_rate": 6.951154052603328e-06, + "loss": 0.3628, + "step": 1811 + }, + { + "epoch": 2.626086956521739, + "grad_norm": 0.24307442140976468, + "learning_rate": 6.924315619967794e-06, + "loss": 0.405, + "step": 1812 + }, + { + "epoch": 2.627536231884058, + "grad_norm": 0.22134105902238405, + "learning_rate": 6.8974771873322605e-06, + "loss": 0.3666, + "step": 1813 + }, + { + "epoch": 2.6289855072463766, + "grad_norm": 0.24753571131810514, + "learning_rate": 6.870638754696726e-06, + "loss": 0.3757, + "step": 1814 + }, + { + "epoch": 2.630434782608696, + "grad_norm": 0.22052117468602914, + "learning_rate": 6.843800322061192e-06, + "loss": 0.3658, + "step": 1815 + }, + { + "epoch": 2.6318840579710145, + "grad_norm": 0.2240473452768075, + "learning_rate": 6.816961889425658e-06, + "loss": 0.3288, + "step": 1816 + }, + { + "epoch": 2.6333333333333333, + "grad_norm": 0.23427589102526614, + "learning_rate": 6.790123456790123e-06, + "loss": 0.3873, + "step": 1817 + }, + { + "epoch": 2.634782608695652, + "grad_norm": 0.2200630110125934, + "learning_rate": 6.7632850241545894e-06, + "loss": 0.3664, + "step": 1818 + }, + { + "epoch": 2.6362318840579713, + "grad_norm": 0.22018626187701326, + "learning_rate": 6.736446591519055e-06, + "loss": 0.3423, + "step": 1819 + }, + { + "epoch": 2.63768115942029, + "grad_norm": 0.2525070837601251, + "learning_rate": 6.7096081588835225e-06, + "loss": 0.4146, + "step": 1820 + }, + { + "epoch": 2.639130434782609, + "grad_norm": 0.22184775100735113, + "learning_rate": 6.682769726247988e-06, + "loss": 0.3442, + "step": 1821 + }, + { + "epoch": 2.6405797101449275, + "grad_norm": 0.23636480564458984, + "learning_rate": 6.655931293612454e-06, + "loss": 0.3622, + "step": 1822 + }, + { + "epoch": 2.6420289855072463, + "grad_norm": 0.22147421135139164, + "learning_rate": 6.629092860976919e-06, + "loss": 0.3782, + "step": 1823 + }, + { + "epoch": 2.643478260869565, + "grad_norm": 0.24603239641639185, + "learning_rate": 6.602254428341385e-06, + "loss": 0.4063, + "step": 1824 + }, + { + "epoch": 2.644927536231884, + "grad_norm": 0.23159270932447462, + "learning_rate": 6.5754159957058515e-06, + "loss": 0.377, + "step": 1825 + }, + { + "epoch": 2.646376811594203, + "grad_norm": 0.2105455774554014, + "learning_rate": 6.548577563070317e-06, + "loss": 0.327, + "step": 1826 + }, + { + "epoch": 2.6478260869565218, + "grad_norm": 0.25620857332710895, + "learning_rate": 6.521739130434783e-06, + "loss": 0.3937, + "step": 1827 + }, + { + "epoch": 2.6492753623188405, + "grad_norm": 0.22469586076992126, + "learning_rate": 6.494900697799248e-06, + "loss": 0.3517, + "step": 1828 + }, + { + "epoch": 2.6507246376811593, + "grad_norm": 0.22576026831366242, + "learning_rate": 6.468062265163714e-06, + "loss": 0.3591, + "step": 1829 + }, + { + "epoch": 2.6521739130434785, + "grad_norm": 0.2192462255694706, + "learning_rate": 6.44122383252818e-06, + "loss": 0.361, + "step": 1830 + }, + { + "epoch": 2.6536231884057973, + "grad_norm": 0.23381609847056584, + "learning_rate": 6.414385399892646e-06, + "loss": 0.4258, + "step": 1831 + }, + { + "epoch": 2.655072463768116, + "grad_norm": 0.21088337829414863, + "learning_rate": 6.3875469672571135e-06, + "loss": 0.3589, + "step": 1832 + }, + { + "epoch": 2.6565217391304348, + "grad_norm": 0.2348314510153269, + "learning_rate": 6.360708534621579e-06, + "loss": 0.3854, + "step": 1833 + }, + { + "epoch": 2.6579710144927535, + "grad_norm": 0.22631453842046725, + "learning_rate": 6.333870101986045e-06, + "loss": 0.3901, + "step": 1834 + }, + { + "epoch": 2.6594202898550723, + "grad_norm": 0.22994007869388175, + "learning_rate": 6.30703166935051e-06, + "loss": 0.3654, + "step": 1835 + }, + { + "epoch": 2.660869565217391, + "grad_norm": 0.2203624234609837, + "learning_rate": 6.280193236714976e-06, + "loss": 0.3927, + "step": 1836 + }, + { + "epoch": 2.6623188405797102, + "grad_norm": 0.2195249377140087, + "learning_rate": 6.2533548040794425e-06, + "loss": 0.3373, + "step": 1837 + }, + { + "epoch": 2.663768115942029, + "grad_norm": 0.23181377885580856, + "learning_rate": 6.226516371443908e-06, + "loss": 0.4036, + "step": 1838 + }, + { + "epoch": 2.6652173913043478, + "grad_norm": 0.2219526383481847, + "learning_rate": 6.199677938808374e-06, + "loss": 0.3534, + "step": 1839 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.23362554464225765, + "learning_rate": 6.172839506172839e-06, + "loss": 0.4265, + "step": 1840 + }, + { + "epoch": 2.6681159420289857, + "grad_norm": 0.24196797259877534, + "learning_rate": 6.146001073537306e-06, + "loss": 0.38, + "step": 1841 + }, + { + "epoch": 2.6695652173913045, + "grad_norm": 0.23489975729889526, + "learning_rate": 6.119162640901771e-06, + "loss": 0.3683, + "step": 1842 + }, + { + "epoch": 2.6710144927536232, + "grad_norm": 0.2299695993386685, + "learning_rate": 6.0923242082662375e-06, + "loss": 0.3546, + "step": 1843 + }, + { + "epoch": 2.672463768115942, + "grad_norm": 0.26701383005463475, + "learning_rate": 6.065485775630704e-06, + "loss": 0.3768, + "step": 1844 + }, + { + "epoch": 2.6739130434782608, + "grad_norm": 0.33256383947003426, + "learning_rate": 6.038647342995169e-06, + "loss": 0.3197, + "step": 1845 + }, + { + "epoch": 2.6753623188405795, + "grad_norm": 0.2431108486304523, + "learning_rate": 6.011808910359636e-06, + "loss": 0.3592, + "step": 1846 + }, + { + "epoch": 2.6768115942028987, + "grad_norm": 0.24168905691623235, + "learning_rate": 5.984970477724101e-06, + "loss": 0.4084, + "step": 1847 + }, + { + "epoch": 2.6782608695652175, + "grad_norm": 0.22030359227920543, + "learning_rate": 5.958132045088567e-06, + "loss": 0.3122, + "step": 1848 + }, + { + "epoch": 2.6797101449275362, + "grad_norm": 0.22133704619831146, + "learning_rate": 5.931293612453033e-06, + "loss": 0.3527, + "step": 1849 + }, + { + "epoch": 2.681159420289855, + "grad_norm": 0.23488002154972068, + "learning_rate": 5.904455179817499e-06, + "loss": 0.3577, + "step": 1850 + }, + { + "epoch": 2.6826086956521737, + "grad_norm": 0.23371719237593863, + "learning_rate": 5.877616747181965e-06, + "loss": 0.3624, + "step": 1851 + }, + { + "epoch": 2.684057971014493, + "grad_norm": 0.2390257085571172, + "learning_rate": 5.850778314546431e-06, + "loss": 0.3692, + "step": 1852 + }, + { + "epoch": 2.6855072463768117, + "grad_norm": 0.25246189224755156, + "learning_rate": 5.823939881910897e-06, + "loss": 0.4123, + "step": 1853 + }, + { + "epoch": 2.6869565217391305, + "grad_norm": 0.22169205705749817, + "learning_rate": 5.797101449275362e-06, + "loss": 0.3683, + "step": 1854 + }, + { + "epoch": 2.6884057971014492, + "grad_norm": 0.21896924257649364, + "learning_rate": 5.7702630166398285e-06, + "loss": 0.366, + "step": 1855 + }, + { + "epoch": 2.689855072463768, + "grad_norm": 0.22653260988190463, + "learning_rate": 5.743424584004295e-06, + "loss": 0.3597, + "step": 1856 + }, + { + "epoch": 2.6913043478260867, + "grad_norm": 0.2287381038960718, + "learning_rate": 5.71658615136876e-06, + "loss": 0.3974, + "step": 1857 + }, + { + "epoch": 2.692753623188406, + "grad_norm": 0.23347533317562097, + "learning_rate": 5.689747718733227e-06, + "loss": 0.3857, + "step": 1858 + }, + { + "epoch": 2.6942028985507247, + "grad_norm": 0.22367324176057518, + "learning_rate": 5.662909286097692e-06, + "loss": 0.3801, + "step": 1859 + }, + { + "epoch": 2.6956521739130435, + "grad_norm": 0.23472201362272851, + "learning_rate": 5.636070853462158e-06, + "loss": 0.4245, + "step": 1860 + }, + { + "epoch": 2.697101449275362, + "grad_norm": 0.3157297294018279, + "learning_rate": 5.6092324208266236e-06, + "loss": 0.3867, + "step": 1861 + }, + { + "epoch": 2.6985507246376814, + "grad_norm": 0.2291967553591276, + "learning_rate": 5.58239398819109e-06, + "loss": 0.368, + "step": 1862 + }, + { + "epoch": 2.7, + "grad_norm": 0.22354248296465548, + "learning_rate": 5.555555555555556e-06, + "loss": 0.3865, + "step": 1863 + }, + { + "epoch": 2.701449275362319, + "grad_norm": 0.22739828851934407, + "learning_rate": 5.528717122920022e-06, + "loss": 0.4034, + "step": 1864 + }, + { + "epoch": 2.7028985507246377, + "grad_norm": 0.2267575561496053, + "learning_rate": 5.501878690284488e-06, + "loss": 0.3503, + "step": 1865 + }, + { + "epoch": 2.7043478260869565, + "grad_norm": 0.24634783666743043, + "learning_rate": 5.475040257648953e-06, + "loss": 0.3976, + "step": 1866 + }, + { + "epoch": 2.705797101449275, + "grad_norm": 0.23064788007901818, + "learning_rate": 5.4482018250134195e-06, + "loss": 0.3492, + "step": 1867 + }, + { + "epoch": 2.707246376811594, + "grad_norm": 0.2691754884654957, + "learning_rate": 5.421363392377885e-06, + "loss": 0.3994, + "step": 1868 + }, + { + "epoch": 2.708695652173913, + "grad_norm": 0.22996872590791068, + "learning_rate": 5.394524959742351e-06, + "loss": 0.3842, + "step": 1869 + }, + { + "epoch": 2.710144927536232, + "grad_norm": 0.2423580047509324, + "learning_rate": 5.367686527106818e-06, + "loss": 0.3451, + "step": 1870 + }, + { + "epoch": 2.7115942028985507, + "grad_norm": 0.23179393107934326, + "learning_rate": 5.340848094471283e-06, + "loss": 0.3142, + "step": 1871 + }, + { + "epoch": 2.7130434782608694, + "grad_norm": 0.24728239436941335, + "learning_rate": 5.314009661835749e-06, + "loss": 0.4023, + "step": 1872 + }, + { + "epoch": 2.7144927536231886, + "grad_norm": 0.25281705888035133, + "learning_rate": 5.2871712292002145e-06, + "loss": 0.3679, + "step": 1873 + }, + { + "epoch": 2.7159420289855074, + "grad_norm": 0.2503489075733349, + "learning_rate": 5.260332796564681e-06, + "loss": 0.3686, + "step": 1874 + }, + { + "epoch": 2.717391304347826, + "grad_norm": 0.2118793346562308, + "learning_rate": 5.233494363929147e-06, + "loss": 0.3503, + "step": 1875 + }, + { + "epoch": 2.718840579710145, + "grad_norm": 0.22433352553126884, + "learning_rate": 5.206655931293613e-06, + "loss": 0.3819, + "step": 1876 + }, + { + "epoch": 2.7202898550724637, + "grad_norm": 0.23356449816008817, + "learning_rate": 5.179817498658079e-06, + "loss": 0.3704, + "step": 1877 + }, + { + "epoch": 2.7217391304347824, + "grad_norm": 0.22883220912475813, + "learning_rate": 5.152979066022544e-06, + "loss": 0.3812, + "step": 1878 + }, + { + "epoch": 2.723188405797101, + "grad_norm": 0.2123879981586008, + "learning_rate": 5.1261406333870105e-06, + "loss": 0.3564, + "step": 1879 + }, + { + "epoch": 2.7246376811594204, + "grad_norm": 0.22948078088179824, + "learning_rate": 5.099302200751476e-06, + "loss": 0.3846, + "step": 1880 + }, + { + "epoch": 2.726086956521739, + "grad_norm": 0.21642023725127324, + "learning_rate": 5.072463768115943e-06, + "loss": 0.3468, + "step": 1881 + }, + { + "epoch": 2.727536231884058, + "grad_norm": 0.22925279104543358, + "learning_rate": 5.045625335480408e-06, + "loss": 0.373, + "step": 1882 + }, + { + "epoch": 2.7289855072463767, + "grad_norm": 0.23657457987131836, + "learning_rate": 5.018786902844874e-06, + "loss": 0.3966, + "step": 1883 + }, + { + "epoch": 2.730434782608696, + "grad_norm": 0.22094146109857993, + "learning_rate": 4.99194847020934e-06, + "loss": 0.3903, + "step": 1884 + }, + { + "epoch": 2.7318840579710146, + "grad_norm": 0.22239394945020372, + "learning_rate": 4.9651100375738055e-06, + "loss": 0.3746, + "step": 1885 + }, + { + "epoch": 2.7333333333333334, + "grad_norm": 0.243518585995688, + "learning_rate": 4.938271604938272e-06, + "loss": 0.4275, + "step": 1886 + }, + { + "epoch": 2.734782608695652, + "grad_norm": 0.25187515639105273, + "learning_rate": 4.911433172302738e-06, + "loss": 0.3532, + "step": 1887 + }, + { + "epoch": 2.736231884057971, + "grad_norm": 0.22632621759318555, + "learning_rate": 4.884594739667204e-06, + "loss": 0.3769, + "step": 1888 + }, + { + "epoch": 2.7376811594202897, + "grad_norm": 0.22478743881018365, + "learning_rate": 4.85775630703167e-06, + "loss": 0.3925, + "step": 1889 + }, + { + "epoch": 2.7391304347826084, + "grad_norm": 0.21774392701540912, + "learning_rate": 4.830917874396135e-06, + "loss": 0.3422, + "step": 1890 + }, + { + "epoch": 2.7405797101449276, + "grad_norm": 0.2154279005368851, + "learning_rate": 4.8040794417606014e-06, + "loss": 0.3395, + "step": 1891 + }, + { + "epoch": 2.7420289855072464, + "grad_norm": 0.250856961061175, + "learning_rate": 4.777241009125067e-06, + "loss": 0.3844, + "step": 1892 + }, + { + "epoch": 2.743478260869565, + "grad_norm": 0.22435761376804456, + "learning_rate": 4.750402576489534e-06, + "loss": 0.3767, + "step": 1893 + }, + { + "epoch": 2.744927536231884, + "grad_norm": 0.21992969440428498, + "learning_rate": 4.723564143853999e-06, + "loss": 0.3178, + "step": 1894 + }, + { + "epoch": 2.746376811594203, + "grad_norm": 0.2334336353725486, + "learning_rate": 4.696725711218465e-06, + "loss": 0.3919, + "step": 1895 + }, + { + "epoch": 2.747826086956522, + "grad_norm": 0.2216197480193231, + "learning_rate": 4.669887278582931e-06, + "loss": 0.3388, + "step": 1896 + }, + { + "epoch": 2.7492753623188406, + "grad_norm": 0.22330046792674918, + "learning_rate": 4.6430488459473965e-06, + "loss": 0.3627, + "step": 1897 + }, + { + "epoch": 2.7507246376811594, + "grad_norm": 0.22226569426691617, + "learning_rate": 4.616210413311863e-06, + "loss": 0.3657, + "step": 1898 + }, + { + "epoch": 2.752173913043478, + "grad_norm": 0.230265527395819, + "learning_rate": 4.589371980676329e-06, + "loss": 0.4176, + "step": 1899 + }, + { + "epoch": 2.753623188405797, + "grad_norm": 0.21860986911594793, + "learning_rate": 4.562533548040795e-06, + "loss": 0.3597, + "step": 1900 + }, + { + "epoch": 2.755072463768116, + "grad_norm": 0.21878736281978017, + "learning_rate": 4.53569511540526e-06, + "loss": 0.3511, + "step": 1901 + }, + { + "epoch": 2.756521739130435, + "grad_norm": 0.2126842228269991, + "learning_rate": 4.508856682769726e-06, + "loss": 0.3649, + "step": 1902 + }, + { + "epoch": 2.7579710144927536, + "grad_norm": 0.2265362594489967, + "learning_rate": 4.482018250134192e-06, + "loss": 0.3639, + "step": 1903 + }, + { + "epoch": 2.7594202898550724, + "grad_norm": 0.24180262661923574, + "learning_rate": 4.455179817498658e-06, + "loss": 0.3872, + "step": 1904 + }, + { + "epoch": 2.7608695652173916, + "grad_norm": 0.21206981799411836, + "learning_rate": 4.428341384863125e-06, + "loss": 0.319, + "step": 1905 + }, + { + "epoch": 2.7623188405797103, + "grad_norm": 2.8559315823110727, + "learning_rate": 4.40150295222759e-06, + "loss": 0.3828, + "step": 1906 + }, + { + "epoch": 2.763768115942029, + "grad_norm": 0.2165891805436599, + "learning_rate": 4.374664519592056e-06, + "loss": 0.3601, + "step": 1907 + }, + { + "epoch": 2.765217391304348, + "grad_norm": 0.2687478416264375, + "learning_rate": 4.347826086956522e-06, + "loss": 0.4265, + "step": 1908 + }, + { + "epoch": 2.7666666666666666, + "grad_norm": 0.23162147279056533, + "learning_rate": 4.3209876543209875e-06, + "loss": 0.3776, + "step": 1909 + }, + { + "epoch": 2.7681159420289854, + "grad_norm": 0.24001548884386711, + "learning_rate": 4.2941492216854545e-06, + "loss": 0.3539, + "step": 1910 + }, + { + "epoch": 2.769565217391304, + "grad_norm": 0.21822860633731908, + "learning_rate": 4.26731078904992e-06, + "loss": 0.3721, + "step": 1911 + }, + { + "epoch": 2.7710144927536233, + "grad_norm": 0.22484154136612883, + "learning_rate": 4.240472356414386e-06, + "loss": 0.3794, + "step": 1912 + }, + { + "epoch": 2.772463768115942, + "grad_norm": 0.25675934996166644, + "learning_rate": 4.213633923778851e-06, + "loss": 0.4349, + "step": 1913 + }, + { + "epoch": 2.773913043478261, + "grad_norm": 0.24379391339974027, + "learning_rate": 4.186795491143317e-06, + "loss": 0.4392, + "step": 1914 + }, + { + "epoch": 2.7753623188405796, + "grad_norm": 0.21306475644565454, + "learning_rate": 4.159957058507783e-06, + "loss": 0.3595, + "step": 1915 + }, + { + "epoch": 2.776811594202899, + "grad_norm": 0.24061893214197644, + "learning_rate": 4.1331186258722495e-06, + "loss": 0.4079, + "step": 1916 + }, + { + "epoch": 2.7782608695652176, + "grad_norm": 0.2227123946898031, + "learning_rate": 4.106280193236716e-06, + "loss": 0.3993, + "step": 1917 + }, + { + "epoch": 2.7797101449275363, + "grad_norm": 0.22460600749977255, + "learning_rate": 4.079441760601181e-06, + "loss": 0.3517, + "step": 1918 + }, + { + "epoch": 2.781159420289855, + "grad_norm": 0.22317560971512612, + "learning_rate": 4.052603327965647e-06, + "loss": 0.3392, + "step": 1919 + }, + { + "epoch": 2.782608695652174, + "grad_norm": 0.251545566462596, + "learning_rate": 4.025764895330112e-06, + "loss": 0.4172, + "step": 1920 + }, + { + "epoch": 2.7840579710144926, + "grad_norm": 0.24032524857232762, + "learning_rate": 3.9989264626945785e-06, + "loss": 0.4041, + "step": 1921 + }, + { + "epoch": 2.7855072463768114, + "grad_norm": 0.23355053943345475, + "learning_rate": 3.9720880300590454e-06, + "loss": 0.399, + "step": 1922 + }, + { + "epoch": 2.7869565217391306, + "grad_norm": 0.2650792625387716, + "learning_rate": 3.945249597423511e-06, + "loss": 0.366, + "step": 1923 + }, + { + "epoch": 2.7884057971014493, + "grad_norm": 0.24920047860144612, + "learning_rate": 3.918411164787977e-06, + "loss": 0.3717, + "step": 1924 + }, + { + "epoch": 2.789855072463768, + "grad_norm": 0.22497324802029764, + "learning_rate": 3.891572732152442e-06, + "loss": 0.3554, + "step": 1925 + }, + { + "epoch": 2.791304347826087, + "grad_norm": 0.2430456602220627, + "learning_rate": 3.864734299516908e-06, + "loss": 0.3837, + "step": 1926 + }, + { + "epoch": 2.792753623188406, + "grad_norm": 0.23237653073700315, + "learning_rate": 3.837895866881374e-06, + "loss": 0.4283, + "step": 1927 + }, + { + "epoch": 2.794202898550725, + "grad_norm": 0.2458387819745744, + "learning_rate": 3.8110574342458405e-06, + "loss": 0.3827, + "step": 1928 + }, + { + "epoch": 2.7956521739130435, + "grad_norm": 0.23093469706892966, + "learning_rate": 3.784219001610306e-06, + "loss": 0.3635, + "step": 1929 + }, + { + "epoch": 2.7971014492753623, + "grad_norm": 0.24018081788643378, + "learning_rate": 3.757380568974772e-06, + "loss": 0.3959, + "step": 1930 + }, + { + "epoch": 2.798550724637681, + "grad_norm": 0.21167980466508968, + "learning_rate": 3.730542136339238e-06, + "loss": 0.3397, + "step": 1931 + }, + { + "epoch": 2.8, + "grad_norm": 0.21167642108349205, + "learning_rate": 3.7037037037037037e-06, + "loss": 0.2966, + "step": 1932 + }, + { + "epoch": 2.8014492753623186, + "grad_norm": 0.22246501796322174, + "learning_rate": 3.6768652710681694e-06, + "loss": 0.3471, + "step": 1933 + }, + { + "epoch": 2.802898550724638, + "grad_norm": 0.21898837354019404, + "learning_rate": 3.650026838432636e-06, + "loss": 0.3636, + "step": 1934 + }, + { + "epoch": 2.8043478260869565, + "grad_norm": 0.22057898413689808, + "learning_rate": 3.6231884057971017e-06, + "loss": 0.3805, + "step": 1935 + }, + { + "epoch": 2.8057971014492753, + "grad_norm": 0.21911240754826977, + "learning_rate": 3.5963499731615674e-06, + "loss": 0.3804, + "step": 1936 + }, + { + "epoch": 2.807246376811594, + "grad_norm": 0.21040512826343807, + "learning_rate": 3.5695115405260335e-06, + "loss": 0.358, + "step": 1937 + }, + { + "epoch": 2.8086956521739133, + "grad_norm": 0.22517831098190927, + "learning_rate": 3.5426731078904992e-06, + "loss": 0.3814, + "step": 1938 + }, + { + "epoch": 2.810144927536232, + "grad_norm": 0.22692037568667617, + "learning_rate": 3.515834675254965e-06, + "loss": 0.405, + "step": 1939 + }, + { + "epoch": 2.8115942028985508, + "grad_norm": 0.21054577410504252, + "learning_rate": 3.4889962426194315e-06, + "loss": 0.3231, + "step": 1940 + }, + { + "epoch": 2.8130434782608695, + "grad_norm": 0.21603644115846818, + "learning_rate": 3.462157809983897e-06, + "loss": 0.3606, + "step": 1941 + }, + { + "epoch": 2.8144927536231883, + "grad_norm": 0.21532363538849883, + "learning_rate": 3.435319377348363e-06, + "loss": 0.3626, + "step": 1942 + }, + { + "epoch": 2.815942028985507, + "grad_norm": 0.22447387294670362, + "learning_rate": 3.408480944712829e-06, + "loss": 0.3614, + "step": 1943 + }, + { + "epoch": 2.8173913043478263, + "grad_norm": 0.24462971782857168, + "learning_rate": 3.3816425120772947e-06, + "loss": 0.4489, + "step": 1944 + }, + { + "epoch": 2.818840579710145, + "grad_norm": 0.2246947387946271, + "learning_rate": 3.3548040794417613e-06, + "loss": 0.3898, + "step": 1945 + }, + { + "epoch": 2.8202898550724638, + "grad_norm": 0.24533916969836606, + "learning_rate": 3.327965646806227e-06, + "loss": 0.3777, + "step": 1946 + }, + { + "epoch": 2.8217391304347825, + "grad_norm": 0.37104571577158546, + "learning_rate": 3.3011272141706927e-06, + "loss": 0.4285, + "step": 1947 + }, + { + "epoch": 2.8231884057971013, + "grad_norm": 0.2252493784676985, + "learning_rate": 3.2742887815351584e-06, + "loss": 0.4041, + "step": 1948 + }, + { + "epoch": 2.8246376811594205, + "grad_norm": 0.23358946546345724, + "learning_rate": 3.247450348899624e-06, + "loss": 0.4158, + "step": 1949 + }, + { + "epoch": 2.8260869565217392, + "grad_norm": 0.21586160621892583, + "learning_rate": 3.22061191626409e-06, + "loss": 0.3488, + "step": 1950 + }, + { + "epoch": 2.827536231884058, + "grad_norm": 0.3055949920007374, + "learning_rate": 3.1937734836285568e-06, + "loss": 0.3766, + "step": 1951 + }, + { + "epoch": 2.8289855072463768, + "grad_norm": 0.2093497712044223, + "learning_rate": 3.1669350509930225e-06, + "loss": 0.3491, + "step": 1952 + }, + { + "epoch": 2.8304347826086955, + "grad_norm": 0.21283625140471957, + "learning_rate": 3.140096618357488e-06, + "loss": 0.3869, + "step": 1953 + }, + { + "epoch": 2.8318840579710143, + "grad_norm": 0.21477929276167043, + "learning_rate": 3.113258185721954e-06, + "loss": 0.3494, + "step": 1954 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 0.2106104901914822, + "learning_rate": 3.0864197530864196e-06, + "loss": 0.36, + "step": 1955 + }, + { + "epoch": 2.8347826086956522, + "grad_norm": 0.23305285262666858, + "learning_rate": 3.0595813204508857e-06, + "loss": 0.4305, + "step": 1956 + }, + { + "epoch": 2.836231884057971, + "grad_norm": 0.22466858251147637, + "learning_rate": 3.032742887815352e-06, + "loss": 0.3847, + "step": 1957 + }, + { + "epoch": 2.8376811594202898, + "grad_norm": 0.24561966059885118, + "learning_rate": 3.005904455179818e-06, + "loss": 0.372, + "step": 1958 + }, + { + "epoch": 2.839130434782609, + "grad_norm": 0.24884208733974736, + "learning_rate": 2.9790660225442837e-06, + "loss": 0.3848, + "step": 1959 + }, + { + "epoch": 2.8405797101449277, + "grad_norm": 0.23587441969983608, + "learning_rate": 2.9522275899087494e-06, + "loss": 0.3723, + "step": 1960 + }, + { + "epoch": 2.8420289855072465, + "grad_norm": 0.22289741421781245, + "learning_rate": 2.9253891572732155e-06, + "loss": 0.3639, + "step": 1961 + }, + { + "epoch": 2.8434782608695652, + "grad_norm": 0.21516329109133195, + "learning_rate": 2.898550724637681e-06, + "loss": 0.3471, + "step": 1962 + }, + { + "epoch": 2.844927536231884, + "grad_norm": 0.22475075634579428, + "learning_rate": 2.8717122920021473e-06, + "loss": 0.3948, + "step": 1963 + }, + { + "epoch": 2.8463768115942027, + "grad_norm": 0.246650267181553, + "learning_rate": 2.8448738593666134e-06, + "loss": 0.3866, + "step": 1964 + }, + { + "epoch": 2.8478260869565215, + "grad_norm": 0.20439464539156504, + "learning_rate": 2.818035426731079e-06, + "loss": 0.3293, + "step": 1965 + }, + { + "epoch": 2.8492753623188407, + "grad_norm": 0.2219043464799365, + "learning_rate": 2.791196994095545e-06, + "loss": 0.3424, + "step": 1966 + }, + { + "epoch": 2.8507246376811595, + "grad_norm": 0.23028216886417335, + "learning_rate": 2.764358561460011e-06, + "loss": 0.3736, + "step": 1967 + }, + { + "epoch": 2.8521739130434782, + "grad_norm": 0.2692112527395984, + "learning_rate": 2.7375201288244767e-06, + "loss": 0.3615, + "step": 1968 + }, + { + "epoch": 2.853623188405797, + "grad_norm": 0.21852360587413638, + "learning_rate": 2.7106816961889424e-06, + "loss": 0.352, + "step": 1969 + }, + { + "epoch": 2.855072463768116, + "grad_norm": 0.23283633142906562, + "learning_rate": 2.683843263553409e-06, + "loss": 0.3831, + "step": 1970 + }, + { + "epoch": 2.856521739130435, + "grad_norm": 0.22253240700761454, + "learning_rate": 2.6570048309178746e-06, + "loss": 0.3404, + "step": 1971 + }, + { + "epoch": 2.8579710144927537, + "grad_norm": 0.2186544446437513, + "learning_rate": 2.6301663982823403e-06, + "loss": 0.3482, + "step": 1972 + }, + { + "epoch": 2.8594202898550725, + "grad_norm": 0.24369866231612267, + "learning_rate": 2.6033279656468065e-06, + "loss": 0.4298, + "step": 1973 + }, + { + "epoch": 2.860869565217391, + "grad_norm": 0.22628698320755639, + "learning_rate": 2.576489533011272e-06, + "loss": 0.3456, + "step": 1974 + }, + { + "epoch": 2.86231884057971, + "grad_norm": 0.21467256321432143, + "learning_rate": 2.549651100375738e-06, + "loss": 0.344, + "step": 1975 + }, + { + "epoch": 2.8637681159420287, + "grad_norm": 0.22488105416337417, + "learning_rate": 2.522812667740204e-06, + "loss": 0.3897, + "step": 1976 + }, + { + "epoch": 2.865217391304348, + "grad_norm": 0.21172401558442616, + "learning_rate": 2.49597423510467e-06, + "loss": 0.3561, + "step": 1977 + }, + { + "epoch": 2.8666666666666667, + "grad_norm": 0.23817304874870301, + "learning_rate": 2.469135802469136e-06, + "loss": 0.3469, + "step": 1978 + }, + { + "epoch": 2.8681159420289855, + "grad_norm": 0.23063863203760862, + "learning_rate": 2.442297369833602e-06, + "loss": 0.3744, + "step": 1979 + }, + { + "epoch": 2.869565217391304, + "grad_norm": 0.2279855386844957, + "learning_rate": 2.4154589371980677e-06, + "loss": 0.4174, + "step": 1980 + }, + { + "epoch": 2.8710144927536234, + "grad_norm": 0.20446797527153535, + "learning_rate": 2.3886205045625334e-06, + "loss": 0.3103, + "step": 1981 + }, + { + "epoch": 2.872463768115942, + "grad_norm": 0.2313299210534719, + "learning_rate": 2.3617820719269995e-06, + "loss": 0.3971, + "step": 1982 + }, + { + "epoch": 2.873913043478261, + "grad_norm": 0.21186439987907596, + "learning_rate": 2.3349436392914656e-06, + "loss": 0.3081, + "step": 1983 + }, + { + "epoch": 2.8753623188405797, + "grad_norm": 0.21930666784044156, + "learning_rate": 2.3081052066559313e-06, + "loss": 0.3594, + "step": 1984 + }, + { + "epoch": 2.8768115942028984, + "grad_norm": 0.211911521071487, + "learning_rate": 2.2812667740203974e-06, + "loss": 0.3347, + "step": 1985 + }, + { + "epoch": 2.878260869565217, + "grad_norm": 0.6711232292058408, + "learning_rate": 2.254428341384863e-06, + "loss": 0.4081, + "step": 1986 + }, + { + "epoch": 2.879710144927536, + "grad_norm": 0.22267835282728443, + "learning_rate": 2.227589908749329e-06, + "loss": 0.3665, + "step": 1987 + }, + { + "epoch": 2.881159420289855, + "grad_norm": 0.22181600928646944, + "learning_rate": 2.200751476113795e-06, + "loss": 0.4074, + "step": 1988 + }, + { + "epoch": 2.882608695652174, + "grad_norm": 0.23661315520430906, + "learning_rate": 2.173913043478261e-06, + "loss": 0.3856, + "step": 1989 + }, + { + "epoch": 2.8840579710144927, + "grad_norm": 0.20941940533698208, + "learning_rate": 2.1470746108427272e-06, + "loss": 0.3375, + "step": 1990 + }, + { + "epoch": 2.8855072463768114, + "grad_norm": 0.21125596048646939, + "learning_rate": 2.120236178207193e-06, + "loss": 0.3557, + "step": 1991 + }, + { + "epoch": 2.8869565217391306, + "grad_norm": 0.24322069203581723, + "learning_rate": 2.0933977455716586e-06, + "loss": 0.4068, + "step": 1992 + }, + { + "epoch": 2.8884057971014494, + "grad_norm": 0.20858954044987255, + "learning_rate": 2.0665593129361248e-06, + "loss": 0.3168, + "step": 1993 + }, + { + "epoch": 2.889855072463768, + "grad_norm": 0.2236295696408854, + "learning_rate": 2.0397208803005905e-06, + "loss": 0.369, + "step": 1994 + }, + { + "epoch": 2.891304347826087, + "grad_norm": 0.22156324434803293, + "learning_rate": 2.012882447665056e-06, + "loss": 0.3437, + "step": 1995 + }, + { + "epoch": 2.8927536231884057, + "grad_norm": 0.21259380447613244, + "learning_rate": 1.9860440150295227e-06, + "loss": 0.3649, + "step": 1996 + }, + { + "epoch": 2.8942028985507244, + "grad_norm": 0.21497355911092556, + "learning_rate": 1.9592055823939884e-06, + "loss": 0.3651, + "step": 1997 + }, + { + "epoch": 2.8956521739130436, + "grad_norm": 0.224066475424279, + "learning_rate": 1.932367149758454e-06, + "loss": 0.3813, + "step": 1998 + }, + { + "epoch": 2.8971014492753624, + "grad_norm": 0.2361099140230203, + "learning_rate": 1.9055287171229203e-06, + "loss": 0.4119, + "step": 1999 + }, + { + "epoch": 2.898550724637681, + "grad_norm": 0.2285383121878008, + "learning_rate": 1.878690284487386e-06, + "loss": 0.412, + "step": 2000 + }, + { + "epoch": 2.9, + "grad_norm": 0.22744543201115258, + "learning_rate": 1.8518518518518519e-06, + "loss": 0.3667, + "step": 2001 + }, + { + "epoch": 2.901449275362319, + "grad_norm": 0.20438497271453532, + "learning_rate": 1.825013419216318e-06, + "loss": 0.3392, + "step": 2002 + }, + { + "epoch": 2.902898550724638, + "grad_norm": 0.22154818027410342, + "learning_rate": 1.7981749865807837e-06, + "loss": 0.3657, + "step": 2003 + }, + { + "epoch": 2.9043478260869566, + "grad_norm": 0.23655652039164077, + "learning_rate": 1.7713365539452496e-06, + "loss": 0.3472, + "step": 2004 + }, + { + "epoch": 2.9057971014492754, + "grad_norm": 0.21968133490213995, + "learning_rate": 1.7444981213097157e-06, + "loss": 0.3875, + "step": 2005 + }, + { + "epoch": 2.907246376811594, + "grad_norm": 0.20672773395313468, + "learning_rate": 1.7176596886741814e-06, + "loss": 0.3327, + "step": 2006 + }, + { + "epoch": 2.908695652173913, + "grad_norm": 0.2205358363917607, + "learning_rate": 1.6908212560386474e-06, + "loss": 0.389, + "step": 2007 + }, + { + "epoch": 2.9101449275362317, + "grad_norm": 0.23819173128880847, + "learning_rate": 1.6639828234031135e-06, + "loss": 0.3942, + "step": 2008 + }, + { + "epoch": 2.911594202898551, + "grad_norm": 0.2059964263544637, + "learning_rate": 1.6371443907675792e-06, + "loss": 0.3568, + "step": 2009 + }, + { + "epoch": 2.9130434782608696, + "grad_norm": 0.2171138770231751, + "learning_rate": 1.610305958132045e-06, + "loss": 0.38, + "step": 2010 + }, + { + "epoch": 2.9144927536231884, + "grad_norm": 0.23005385529348343, + "learning_rate": 1.5834675254965112e-06, + "loss": 0.3655, + "step": 2011 + }, + { + "epoch": 2.915942028985507, + "grad_norm": 0.21016123356167332, + "learning_rate": 1.556629092860977e-06, + "loss": 0.3672, + "step": 2012 + }, + { + "epoch": 2.9173913043478263, + "grad_norm": 0.22483884303098328, + "learning_rate": 1.5297906602254428e-06, + "loss": 0.3581, + "step": 2013 + }, + { + "epoch": 2.918840579710145, + "grad_norm": 0.22322264421802532, + "learning_rate": 1.502952227589909e-06, + "loss": 0.3537, + "step": 2014 + }, + { + "epoch": 2.920289855072464, + "grad_norm": 0.2311031178966409, + "learning_rate": 1.4761137949543747e-06, + "loss": 0.405, + "step": 2015 + }, + { + "epoch": 2.9217391304347826, + "grad_norm": 0.22217953477714308, + "learning_rate": 1.4492753623188406e-06, + "loss": 0.4127, + "step": 2016 + }, + { + "epoch": 2.9231884057971014, + "grad_norm": 0.22709560866583908, + "learning_rate": 1.4224369296833067e-06, + "loss": 0.3834, + "step": 2017 + }, + { + "epoch": 2.92463768115942, + "grad_norm": 0.22408928030872904, + "learning_rate": 1.3955984970477724e-06, + "loss": 0.3976, + "step": 2018 + }, + { + "epoch": 2.926086956521739, + "grad_norm": 0.2106162699815264, + "learning_rate": 1.3687600644122383e-06, + "loss": 0.371, + "step": 2019 + }, + { + "epoch": 2.927536231884058, + "grad_norm": 0.2018244638135191, + "learning_rate": 1.3419216317767045e-06, + "loss": 0.3538, + "step": 2020 + }, + { + "epoch": 2.928985507246377, + "grad_norm": 0.2347309750501861, + "learning_rate": 1.3150831991411702e-06, + "loss": 0.4322, + "step": 2021 + }, + { + "epoch": 2.9304347826086956, + "grad_norm": 0.23350101438562867, + "learning_rate": 1.288244766505636e-06, + "loss": 0.3775, + "step": 2022 + }, + { + "epoch": 2.9318840579710144, + "grad_norm": 0.8735079283802203, + "learning_rate": 1.261406333870102e-06, + "loss": 0.4373, + "step": 2023 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 0.8056038885314158, + "learning_rate": 1.234567901234568e-06, + "loss": 0.3715, + "step": 2024 + }, + { + "epoch": 2.9347826086956523, + "grad_norm": 0.21993418679321491, + "learning_rate": 1.2077294685990338e-06, + "loss": 0.3555, + "step": 2025 + }, + { + "epoch": 2.936231884057971, + "grad_norm": 0.23405472869389934, + "learning_rate": 1.1808910359634997e-06, + "loss": 0.3752, + "step": 2026 + }, + { + "epoch": 2.93768115942029, + "grad_norm": 0.2313082257049979, + "learning_rate": 1.1540526033279657e-06, + "loss": 0.4198, + "step": 2027 + }, + { + "epoch": 2.9391304347826086, + "grad_norm": 0.22480212739012523, + "learning_rate": 1.1272141706924316e-06, + "loss": 0.3798, + "step": 2028 + }, + { + "epoch": 2.9405797101449274, + "grad_norm": 0.21838170742723495, + "learning_rate": 1.1003757380568975e-06, + "loss": 0.3796, + "step": 2029 + }, + { + "epoch": 2.942028985507246, + "grad_norm": 0.21405428252058262, + "learning_rate": 1.0735373054213636e-06, + "loss": 0.3559, + "step": 2030 + }, + { + "epoch": 2.9434782608695653, + "grad_norm": 0.21549860413480246, + "learning_rate": 1.0466988727858293e-06, + "loss": 0.3334, + "step": 2031 + }, + { + "epoch": 2.944927536231884, + "grad_norm": 0.28906789347226164, + "learning_rate": 1.0198604401502952e-06, + "loss": 0.3527, + "step": 2032 + }, + { + "epoch": 2.946376811594203, + "grad_norm": 0.2064045807699817, + "learning_rate": 9.930220075147614e-07, + "loss": 0.3469, + "step": 2033 + }, + { + "epoch": 2.9478260869565216, + "grad_norm": 0.22024850212103755, + "learning_rate": 9.66183574879227e-07, + "loss": 0.3803, + "step": 2034 + }, + { + "epoch": 2.949275362318841, + "grad_norm": 0.21800534224417312, + "learning_rate": 9.39345142243693e-07, + "loss": 0.4064, + "step": 2035 + }, + { + "epoch": 2.9507246376811596, + "grad_norm": 0.2201865038559122, + "learning_rate": 9.12506709608159e-07, + "loss": 0.3785, + "step": 2036 + }, + { + "epoch": 2.9521739130434783, + "grad_norm": 0.2210475513425969, + "learning_rate": 8.856682769726248e-07, + "loss": 0.4036, + "step": 2037 + }, + { + "epoch": 2.953623188405797, + "grad_norm": 0.1984962905202788, + "learning_rate": 8.588298443370907e-07, + "loss": 0.3271, + "step": 2038 + }, + { + "epoch": 2.955072463768116, + "grad_norm": 0.24847445353598208, + "learning_rate": 8.319914117015567e-07, + "loss": 0.3313, + "step": 2039 + }, + { + "epoch": 2.9565217391304346, + "grad_norm": 0.2210563477725767, + "learning_rate": 8.051529790660226e-07, + "loss": 0.3857, + "step": 2040 + }, + { + "epoch": 2.9579710144927533, + "grad_norm": 0.21300962762024758, + "learning_rate": 7.783145464304885e-07, + "loss": 0.3656, + "step": 2041 + }, + { + "epoch": 2.9594202898550726, + "grad_norm": 0.20426722668302832, + "learning_rate": 7.514761137949545e-07, + "loss": 0.3239, + "step": 2042 + }, + { + "epoch": 2.9608695652173913, + "grad_norm": 0.21577157388977716, + "learning_rate": 7.246376811594203e-07, + "loss": 0.3689, + "step": 2043 + }, + { + "epoch": 2.96231884057971, + "grad_norm": 0.1989269908554471, + "learning_rate": 6.977992485238862e-07, + "loss": 0.2938, + "step": 2044 + }, + { + "epoch": 2.963768115942029, + "grad_norm": 0.22654103161247222, + "learning_rate": 6.709608158883522e-07, + "loss": 0.3623, + "step": 2045 + }, + { + "epoch": 2.965217391304348, + "grad_norm": 0.22926355644211238, + "learning_rate": 6.44122383252818e-07, + "loss": 0.3641, + "step": 2046 + }, + { + "epoch": 2.966666666666667, + "grad_norm": 0.22858125913553615, + "learning_rate": 6.17283950617284e-07, + "loss": 0.3596, + "step": 2047 + }, + { + "epoch": 2.9681159420289855, + "grad_norm": 0.23631989683689095, + "learning_rate": 5.904455179817499e-07, + "loss": 0.3811, + "step": 2048 + }, + { + "epoch": 2.9695652173913043, + "grad_norm": 0.2191396177656684, + "learning_rate": 5.636070853462158e-07, + "loss": 0.3804, + "step": 2049 + }, + { + "epoch": 2.971014492753623, + "grad_norm": 0.222858361371568, + "learning_rate": 5.367686527106818e-07, + "loss": 0.4073, + "step": 2050 + }, + { + "epoch": 2.972463768115942, + "grad_norm": 0.20318427662477487, + "learning_rate": 5.099302200751476e-07, + "loss": 0.3697, + "step": 2051 + }, + { + "epoch": 2.973913043478261, + "grad_norm": 0.217758620275763, + "learning_rate": 4.830917874396135e-07, + "loss": 0.4074, + "step": 2052 + }, + { + "epoch": 2.97536231884058, + "grad_norm": 0.22085137440848657, + "learning_rate": 4.562533548040795e-07, + "loss": 0.3512, + "step": 2053 + }, + { + "epoch": 2.9768115942028985, + "grad_norm": 0.22012562095031038, + "learning_rate": 4.2941492216854536e-07, + "loss": 0.3802, + "step": 2054 + }, + { + "epoch": 2.9782608695652173, + "grad_norm": 0.22618508012844524, + "learning_rate": 4.025764895330113e-07, + "loss": 0.374, + "step": 2055 + }, + { + "epoch": 2.9797101449275365, + "grad_norm": 0.20800555397382295, + "learning_rate": 3.7573805689747724e-07, + "loss": 0.33, + "step": 2056 + }, + { + "epoch": 2.9811594202898553, + "grad_norm": 0.2132367722323136, + "learning_rate": 3.488996242619431e-07, + "loss": 0.3509, + "step": 2057 + }, + { + "epoch": 2.982608695652174, + "grad_norm": 0.236211820119767, + "learning_rate": 3.22061191626409e-07, + "loss": 0.4553, + "step": 2058 + }, + { + "epoch": 2.9840579710144928, + "grad_norm": 0.21731210967631273, + "learning_rate": 2.9522275899087494e-07, + "loss": 0.4114, + "step": 2059 + }, + { + "epoch": 2.9855072463768115, + "grad_norm": 0.22337130008338704, + "learning_rate": 2.683843263553409e-07, + "loss": 0.4264, + "step": 2060 + }, + { + "epoch": 2.9869565217391303, + "grad_norm": 0.20921981714387702, + "learning_rate": 2.4154589371980677e-07, + "loss": 0.3334, + "step": 2061 + }, + { + "epoch": 2.988405797101449, + "grad_norm": 0.2065612804088296, + "learning_rate": 2.1470746108427268e-07, + "loss": 0.3262, + "step": 2062 + }, + { + "epoch": 2.9898550724637682, + "grad_norm": 0.2154820919743264, + "learning_rate": 1.8786902844873862e-07, + "loss": 0.3863, + "step": 2063 + }, + { + "epoch": 2.991304347826087, + "grad_norm": 4.776049818404092, + "learning_rate": 1.610305958132045e-07, + "loss": 0.4863, + "step": 2064 + }, + { + "epoch": 2.9927536231884058, + "grad_norm": 0.21262659774660408, + "learning_rate": 1.3419216317767045e-07, + "loss": 0.3271, + "step": 2065 + }, + { + "epoch": 2.9942028985507245, + "grad_norm": 0.2166325401832577, + "learning_rate": 1.0735373054213634e-07, + "loss": 0.3759, + "step": 2066 + }, + { + "epoch": 2.9956521739130437, + "grad_norm": 0.22097635962706288, + "learning_rate": 8.051529790660226e-08, + "loss": 0.369, + "step": 2067 + }, + { + "epoch": 2.9971014492753625, + "grad_norm": 0.2236357694002658, + "learning_rate": 5.367686527106817e-08, + "loss": 0.4169, + "step": 2068 + }, + { + "epoch": 2.9985507246376812, + "grad_norm": 0.21261718905247393, + "learning_rate": 2.6838432635534085e-08, + "loss": 0.3464, + "step": 2069 + }, + { + "epoch": 3.0, + "grad_norm": 0.23176378642380113, + "learning_rate": 0.0, + "loss": 0.3583, + "step": 2070 + }, + { + "epoch": 3.0, + "step": 2070, + "total_flos": 1.7567558587184579e+18, + "train_loss": 0.21386527973384672, + "train_runtime": 90615.9255, + "train_samples_per_second": 0.365, + "train_steps_per_second": 0.023 + } + ], + "logging_steps": 1, + "max_steps": 2070, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.7567558587184579e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}