diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,56034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5886410669119337, + "eval_steps": 500, + "global_step": 8000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 7.358013336399172e-05, + "grad_norm": 12.875, + "learning_rate": 0.0, + "loss": 1.3441, + "step": 1 + }, + { + "epoch": 0.00014716026672798344, + "grad_norm": 15.0, + "learning_rate": 1.6666666666666667e-06, + "loss": 1.7824, + "step": 2 + }, + { + "epoch": 0.00022074040009197518, + "grad_norm": 16.625, + "learning_rate": 3.3333333333333333e-06, + "loss": 2.041, + "step": 3 + }, + { + "epoch": 0.0002943205334559669, + "grad_norm": 11.875, + "learning_rate": 5e-06, + "loss": 1.1798, + "step": 4 + }, + { + "epoch": 0.0003679006668199586, + "grad_norm": 12.125, + "learning_rate": 6.666666666666667e-06, + "loss": 1.3734, + "step": 5 + }, + { + "epoch": 0.00044148080018395036, + "grad_norm": 9.5, + "learning_rate": 8.333333333333334e-06, + "loss": 1.5999, + "step": 6 + }, + { + "epoch": 0.0005150609335479421, + "grad_norm": 7.03125, + "learning_rate": 1e-05, + "loss": 1.5471, + "step": 7 + }, + { + "epoch": 0.0005886410669119338, + "grad_norm": 4.71875, + "learning_rate": 1.1666666666666668e-05, + "loss": 1.0944, + "step": 8 + }, + { + "epoch": 0.0006622212002759255, + "grad_norm": 3.09375, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.8931, + "step": 9 + }, + { + "epoch": 0.0007358013336399172, + "grad_norm": 2.375, + "learning_rate": 1.5e-05, + "loss": 1.0534, + "step": 10 + }, + { + "epoch": 0.0008093814670039089, + "grad_norm": 2.640625, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.8836, + "step": 11 + }, + { + "epoch": 0.0008829616003679007, + "grad_norm": 2.15625, + "learning_rate": 1.8333333333333333e-05, + "loss": 1.1655, + "step": 12 + }, + { + "epoch": 0.0009565417337318924, + "grad_norm": 2.265625, + "learning_rate": 2e-05, + "loss": 1.7158, + "step": 13 + }, + { + "epoch": 0.0010301218670958842, + "grad_norm": 2.171875, + "learning_rate": 2.1666666666666667e-05, + "loss": 1.2426, + "step": 14 + }, + { + "epoch": 0.0011037020004598759, + "grad_norm": 1.6796875, + "learning_rate": 2.3333333333333336e-05, + "loss": 1.048, + "step": 15 + }, + { + "epoch": 0.0011772821338238676, + "grad_norm": 1.875, + "learning_rate": 2.5e-05, + "loss": 1.3412, + "step": 16 + }, + { + "epoch": 0.0012508622671878592, + "grad_norm": 1.5078125, + "learning_rate": 2.6666666666666667e-05, + "loss": 1.0901, + "step": 17 + }, + { + "epoch": 0.001324442400551851, + "grad_norm": 1.1875, + "learning_rate": 2.8333333333333335e-05, + "loss": 0.9881, + "step": 18 + }, + { + "epoch": 0.0013980225339158428, + "grad_norm": 1.4609375, + "learning_rate": 3e-05, + "loss": 0.9804, + "step": 19 + }, + { + "epoch": 0.0014716026672798345, + "grad_norm": 1.5625, + "learning_rate": 3.1666666666666666e-05, + "loss": 1.3544, + "step": 20 + }, + { + "epoch": 0.0015451828006438262, + "grad_norm": 2.140625, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.9491, + "step": 21 + }, + { + "epoch": 0.0016187629340078179, + "grad_norm": 1.2890625, + "learning_rate": 3.5e-05, + "loss": 0.9426, + "step": 22 + }, + { + "epoch": 0.0016923430673718095, + "grad_norm": 1.2109375, + "learning_rate": 3.6666666666666666e-05, + "loss": 1.0448, + "step": 23 + }, + { + "epoch": 0.0017659232007358014, + "grad_norm": 1.3359375, + "learning_rate": 3.8333333333333334e-05, + "loss": 1.2337, + "step": 24 + }, + { + "epoch": 0.0018395033340997931, + "grad_norm": 1.3046875, + "learning_rate": 4e-05, + "loss": 1.5146, + "step": 25 + }, + { + "epoch": 0.0019130834674637848, + "grad_norm": 1.4609375, + "learning_rate": 4.166666666666667e-05, + "loss": 1.1482, + "step": 26 + }, + { + "epoch": 0.0019866636008277765, + "grad_norm": 1.109375, + "learning_rate": 4.3333333333333334e-05, + "loss": 0.8161, + "step": 27 + }, + { + "epoch": 0.0020602437341917684, + "grad_norm": 1.2265625, + "learning_rate": 4.5e-05, + "loss": 0.991, + "step": 28 + }, + { + "epoch": 0.00213382386755576, + "grad_norm": 0.98828125, + "learning_rate": 4.666666666666667e-05, + "loss": 0.7794, + "step": 29 + }, + { + "epoch": 0.0022074040009197517, + "grad_norm": 1.0703125, + "learning_rate": 4.8333333333333334e-05, + "loss": 0.8139, + "step": 30 + }, + { + "epoch": 0.002280984134283743, + "grad_norm": 1.09375, + "learning_rate": 5e-05, + "loss": 0.8812, + "step": 31 + }, + { + "epoch": 0.002354564267647735, + "grad_norm": 1.2109375, + "learning_rate": 4.9999999329148614e-05, + "loss": 0.6803, + "step": 32 + }, + { + "epoch": 0.002428144401011727, + "grad_norm": 1.265625, + "learning_rate": 4.999999731659447e-05, + "loss": 0.9949, + "step": 33 + }, + { + "epoch": 0.0025017245343757185, + "grad_norm": 1.1171875, + "learning_rate": 4.9999993962337696e-05, + "loss": 0.7634, + "step": 34 + }, + { + "epoch": 0.0025753046677397104, + "grad_norm": 1.296875, + "learning_rate": 4.9999989266378464e-05, + "loss": 0.9535, + "step": 35 + }, + { + "epoch": 0.002648884801103702, + "grad_norm": 1.078125, + "learning_rate": 4.999998322871703e-05, + "loss": 0.9741, + "step": 36 + }, + { + "epoch": 0.0027224649344676937, + "grad_norm": 1.4453125, + "learning_rate": 4.999997584935371e-05, + "loss": 0.8228, + "step": 37 + }, + { + "epoch": 0.0027960450678316856, + "grad_norm": 1.1015625, + "learning_rate": 4.9999967128288905e-05, + "loss": 0.8874, + "step": 38 + }, + { + "epoch": 0.002869625201195677, + "grad_norm": 0.90625, + "learning_rate": 4.9999957065523085e-05, + "loss": 0.7428, + "step": 39 + }, + { + "epoch": 0.002943205334559669, + "grad_norm": 0.9296875, + "learning_rate": 4.9999945661056786e-05, + "loss": 0.7836, + "step": 40 + }, + { + "epoch": 0.0030167854679236604, + "grad_norm": 1.1796875, + "learning_rate": 4.999993291489062e-05, + "loss": 0.9396, + "step": 41 + }, + { + "epoch": 0.0030903656012876523, + "grad_norm": 1.3359375, + "learning_rate": 4.9999918827025275e-05, + "loss": 0.9364, + "step": 42 + }, + { + "epoch": 0.0031639457346516442, + "grad_norm": 1.2109375, + "learning_rate": 4.99999033974615e-05, + "loss": 0.8486, + "step": 43 + }, + { + "epoch": 0.0032375258680156357, + "grad_norm": 1.21875, + "learning_rate": 4.999988662620013e-05, + "loss": 1.0065, + "step": 44 + }, + { + "epoch": 0.0033111060013796276, + "grad_norm": 1.1640625, + "learning_rate": 4.9999868513242065e-05, + "loss": 1.1676, + "step": 45 + }, + { + "epoch": 0.003384686134743619, + "grad_norm": 1.4609375, + "learning_rate": 4.9999849058588276e-05, + "loss": 1.1662, + "step": 46 + }, + { + "epoch": 0.003458266268107611, + "grad_norm": 1.0, + "learning_rate": 4.99998282622398e-05, + "loss": 0.7682, + "step": 47 + }, + { + "epoch": 0.003531846401471603, + "grad_norm": 1.3359375, + "learning_rate": 4.999980612419777e-05, + "loss": 1.3762, + "step": 48 + }, + { + "epoch": 0.0036054265348355943, + "grad_norm": 1.046875, + "learning_rate": 4.999978264446335e-05, + "loss": 1.1862, + "step": 49 + }, + { + "epoch": 0.0036790066681995862, + "grad_norm": 1.125, + "learning_rate": 4.9999757823037816e-05, + "loss": 1.0162, + "step": 50 + }, + { + "epoch": 0.0037525868015635777, + "grad_norm": 0.96484375, + "learning_rate": 4.999973165992251e-05, + "loss": 0.9144, + "step": 51 + }, + { + "epoch": 0.0038261669349275696, + "grad_norm": 1.3828125, + "learning_rate": 4.999970415511881e-05, + "loss": 1.7539, + "step": 52 + }, + { + "epoch": 0.0038997470682915615, + "grad_norm": 1.03125, + "learning_rate": 4.999967530862821e-05, + "loss": 1.0273, + "step": 53 + }, + { + "epoch": 0.003973327201655553, + "grad_norm": 1.1484375, + "learning_rate": 4.999964512045226e-05, + "loss": 1.1683, + "step": 54 + }, + { + "epoch": 0.004046907335019545, + "grad_norm": 1.203125, + "learning_rate": 4.999961359059257e-05, + "loss": 1.0855, + "step": 55 + }, + { + "epoch": 0.004120487468383537, + "grad_norm": 1.03125, + "learning_rate": 4.999958071905082e-05, + "loss": 0.9039, + "step": 56 + }, + { + "epoch": 0.004194067601747528, + "grad_norm": 1.515625, + "learning_rate": 4.999954650582881e-05, + "loss": 0.8736, + "step": 57 + }, + { + "epoch": 0.00426764773511152, + "grad_norm": 1.3515625, + "learning_rate": 4.999951095092835e-05, + "loss": 0.9829, + "step": 58 + }, + { + "epoch": 0.004341227868475512, + "grad_norm": 1.234375, + "learning_rate": 4.9999474054351357e-05, + "loss": 1.087, + "step": 59 + }, + { + "epoch": 0.0044148080018395035, + "grad_norm": 1.125, + "learning_rate": 4.99994358160998e-05, + "loss": 1.0548, + "step": 60 + }, + { + "epoch": 0.004488388135203495, + "grad_norm": 1.109375, + "learning_rate": 4.9999396236175754e-05, + "loss": 1.2358, + "step": 61 + }, + { + "epoch": 0.004561968268567486, + "grad_norm": 1.2109375, + "learning_rate": 4.999935531458132e-05, + "loss": 0.9955, + "step": 62 + }, + { + "epoch": 0.004635548401931478, + "grad_norm": 1.03125, + "learning_rate": 4.999931305131871e-05, + "loss": 1.0251, + "step": 63 + }, + { + "epoch": 0.00470912853529547, + "grad_norm": 0.98828125, + "learning_rate": 4.999926944639018e-05, + "loss": 1.0653, + "step": 64 + }, + { + "epoch": 0.004782708668659462, + "grad_norm": 1.1796875, + "learning_rate": 4.999922449979808e-05, + "loss": 1.0879, + "step": 65 + }, + { + "epoch": 0.004856288802023454, + "grad_norm": 1.0546875, + "learning_rate": 4.999917821154481e-05, + "loss": 1.0973, + "step": 66 + }, + { + "epoch": 0.004929868935387445, + "grad_norm": 1.015625, + "learning_rate": 4.999913058163287e-05, + "loss": 0.893, + "step": 67 + }, + { + "epoch": 0.005003449068751437, + "grad_norm": 1.0, + "learning_rate": 4.99990816100648e-05, + "loss": 1.3085, + "step": 68 + }, + { + "epoch": 0.005077029202115429, + "grad_norm": 0.8828125, + "learning_rate": 4.9999031296843244e-05, + "loss": 0.9653, + "step": 69 + }, + { + "epoch": 0.005150609335479421, + "grad_norm": 1.2265625, + "learning_rate": 4.99989796419709e-05, + "loss": 1.3415, + "step": 70 + }, + { + "epoch": 0.005224189468843413, + "grad_norm": 0.96875, + "learning_rate": 4.999892664545053e-05, + "loss": 1.1798, + "step": 71 + }, + { + "epoch": 0.005297769602207404, + "grad_norm": 1.09375, + "learning_rate": 4.999887230728497e-05, + "loss": 0.9253, + "step": 72 + }, + { + "epoch": 0.0053713497355713955, + "grad_norm": 0.8359375, + "learning_rate": 4.9998816627477166e-05, + "loss": 0.7838, + "step": 73 + }, + { + "epoch": 0.0054449298689353874, + "grad_norm": 1.2109375, + "learning_rate": 4.999875960603008e-05, + "loss": 1.5514, + "step": 74 + }, + { + "epoch": 0.005518510002299379, + "grad_norm": 1.3671875, + "learning_rate": 4.9998701242946785e-05, + "loss": 1.3704, + "step": 75 + }, + { + "epoch": 0.005592090135663371, + "grad_norm": 1.1875, + "learning_rate": 4.9998641538230415e-05, + "loss": 1.0745, + "step": 76 + }, + { + "epoch": 0.005665670269027362, + "grad_norm": 1.1015625, + "learning_rate": 4.999858049188417e-05, + "loss": 1.1878, + "step": 77 + }, + { + "epoch": 0.005739250402391354, + "grad_norm": 1.0390625, + "learning_rate": 4.999851810391132e-05, + "loss": 1.0663, + "step": 78 + }, + { + "epoch": 0.005812830535755346, + "grad_norm": 1.0625, + "learning_rate": 4.9998454374315216e-05, + "loss": 1.0414, + "step": 79 + }, + { + "epoch": 0.005886410669119338, + "grad_norm": 1.046875, + "learning_rate": 4.9998389303099284e-05, + "loss": 0.7892, + "step": 80 + }, + { + "epoch": 0.00595999080248333, + "grad_norm": 1.140625, + "learning_rate": 4.999832289026701e-05, + "loss": 1.0337, + "step": 81 + }, + { + "epoch": 0.006033570935847321, + "grad_norm": 0.8828125, + "learning_rate": 4.999825513582197e-05, + "loss": 0.7221, + "step": 82 + }, + { + "epoch": 0.006107151069211313, + "grad_norm": 0.953125, + "learning_rate": 4.999818603976779e-05, + "loss": 0.9394, + "step": 83 + }, + { + "epoch": 0.006180731202575305, + "grad_norm": 0.83984375, + "learning_rate": 4.999811560210817e-05, + "loss": 0.6104, + "step": 84 + }, + { + "epoch": 0.006254311335939297, + "grad_norm": 0.8671875, + "learning_rate": 4.999804382284692e-05, + "loss": 0.7438, + "step": 85 + }, + { + "epoch": 0.0063278914693032885, + "grad_norm": 1.5078125, + "learning_rate": 4.9997970701987855e-05, + "loss": 1.26, + "step": 86 + }, + { + "epoch": 0.0064014716026672795, + "grad_norm": 0.83203125, + "learning_rate": 4.999789623953493e-05, + "loss": 0.8338, + "step": 87 + }, + { + "epoch": 0.006475051736031271, + "grad_norm": 1.453125, + "learning_rate": 4.9997820435492116e-05, + "loss": 1.2148, + "step": 88 + }, + { + "epoch": 0.006548631869395263, + "grad_norm": 1.234375, + "learning_rate": 4.99977432898635e-05, + "loss": 1.1692, + "step": 89 + }, + { + "epoch": 0.006622212002759255, + "grad_norm": 0.94921875, + "learning_rate": 4.999766480265321e-05, + "loss": 0.9819, + "step": 90 + }, + { + "epoch": 0.006695792136123247, + "grad_norm": 1.1484375, + "learning_rate": 4.999758497386547e-05, + "loss": 1.4166, + "step": 91 + }, + { + "epoch": 0.006769372269487238, + "grad_norm": 0.9921875, + "learning_rate": 4.999750380350456e-05, + "loss": 0.7442, + "step": 92 + }, + { + "epoch": 0.00684295240285123, + "grad_norm": 1.015625, + "learning_rate": 4.999742129157483e-05, + "loss": 1.0275, + "step": 93 + }, + { + "epoch": 0.006916532536215222, + "grad_norm": 1.0859375, + "learning_rate": 4.999733743808071e-05, + "loss": 0.8956, + "step": 94 + }, + { + "epoch": 0.006990112669579214, + "grad_norm": 1.015625, + "learning_rate": 4.999725224302671e-05, + "loss": 0.8214, + "step": 95 + }, + { + "epoch": 0.007063692802943206, + "grad_norm": 1.0390625, + "learning_rate": 4.9997165706417395e-05, + "loss": 1.1712, + "step": 96 + }, + { + "epoch": 0.007137272936307197, + "grad_norm": 0.96484375, + "learning_rate": 4.99970778282574e-05, + "loss": 0.8904, + "step": 97 + }, + { + "epoch": 0.007210853069671189, + "grad_norm": 1.015625, + "learning_rate": 4.9996988608551454e-05, + "loss": 1.0042, + "step": 98 + }, + { + "epoch": 0.0072844332030351806, + "grad_norm": 0.85546875, + "learning_rate": 4.999689804730435e-05, + "loss": 0.8162, + "step": 99 + }, + { + "epoch": 0.0073580133363991725, + "grad_norm": 1.0703125, + "learning_rate": 4.9996806144520936e-05, + "loss": 0.8645, + "step": 100 + }, + { + "epoch": 0.007431593469763164, + "grad_norm": 0.90234375, + "learning_rate": 4.999671290020615e-05, + "loss": 1.0243, + "step": 101 + }, + { + "epoch": 0.007505173603127155, + "grad_norm": 0.859375, + "learning_rate": 4.999661831436499e-05, + "loss": 0.9468, + "step": 102 + }, + { + "epoch": 0.007578753736491147, + "grad_norm": 1.0390625, + "learning_rate": 4.999652238700253e-05, + "loss": 1.154, + "step": 103 + }, + { + "epoch": 0.007652333869855139, + "grad_norm": 1.296875, + "learning_rate": 4.999642511812394e-05, + "loss": 1.4333, + "step": 104 + }, + { + "epoch": 0.007725914003219131, + "grad_norm": 0.9140625, + "learning_rate": 4.999632650773442e-05, + "loss": 0.7866, + "step": 105 + }, + { + "epoch": 0.007799494136583123, + "grad_norm": 0.82421875, + "learning_rate": 4.999622655583927e-05, + "loss": 0.7542, + "step": 106 + }, + { + "epoch": 0.007873074269947114, + "grad_norm": 1.0625, + "learning_rate": 4.999612526244385e-05, + "loss": 1.2052, + "step": 107 + }, + { + "epoch": 0.007946654403311106, + "grad_norm": 1.2109375, + "learning_rate": 4.99960226275536e-05, + "loss": 1.369, + "step": 108 + }, + { + "epoch": 0.008020234536675098, + "grad_norm": 0.859375, + "learning_rate": 4.9995918651174016e-05, + "loss": 0.9614, + "step": 109 + }, + { + "epoch": 0.00809381467003909, + "grad_norm": 0.99609375, + "learning_rate": 4.99958133333107e-05, + "loss": 1.0933, + "step": 110 + }, + { + "epoch": 0.008167394803403082, + "grad_norm": 0.9921875, + "learning_rate": 4.999570667396929e-05, + "loss": 0.9463, + "step": 111 + }, + { + "epoch": 0.008240974936767074, + "grad_norm": 0.953125, + "learning_rate": 4.999559867315551e-05, + "loss": 1.0003, + "step": 112 + }, + { + "epoch": 0.008314555070131065, + "grad_norm": 1.59375, + "learning_rate": 4.999548933087516e-05, + "loss": 0.7679, + "step": 113 + }, + { + "epoch": 0.008388135203495056, + "grad_norm": 1.34375, + "learning_rate": 4.9995378647134106e-05, + "loss": 1.2124, + "step": 114 + }, + { + "epoch": 0.008461715336859047, + "grad_norm": 1.0234375, + "learning_rate": 4.99952666219383e-05, + "loss": 0.9484, + "step": 115 + }, + { + "epoch": 0.00853529547022304, + "grad_norm": 1.0546875, + "learning_rate": 4.999515325529373e-05, + "loss": 0.8705, + "step": 116 + }, + { + "epoch": 0.008608875603587031, + "grad_norm": 1.0859375, + "learning_rate": 4.99950385472065e-05, + "loss": 1.4054, + "step": 117 + }, + { + "epoch": 0.008682455736951023, + "grad_norm": 0.765625, + "learning_rate": 4.999492249768276e-05, + "loss": 0.688, + "step": 118 + }, + { + "epoch": 0.008756035870315015, + "grad_norm": 1.1484375, + "learning_rate": 4.999480510672874e-05, + "loss": 1.208, + "step": 119 + }, + { + "epoch": 0.008829616003679007, + "grad_norm": 1.1015625, + "learning_rate": 4.9994686374350744e-05, + "loss": 0.8497, + "step": 120 + }, + { + "epoch": 0.008903196137042999, + "grad_norm": 1.046875, + "learning_rate": 4.9994566300555124e-05, + "loss": 0.9617, + "step": 121 + }, + { + "epoch": 0.00897677627040699, + "grad_norm": 1.1171875, + "learning_rate": 4.9994444885348344e-05, + "loss": 1.2366, + "step": 122 + }, + { + "epoch": 0.009050356403770983, + "grad_norm": 1.0234375, + "learning_rate": 4.999432212873692e-05, + "loss": 0.7457, + "step": 123 + }, + { + "epoch": 0.009123936537134973, + "grad_norm": 1.1171875, + "learning_rate": 4.999419803072743e-05, + "loss": 1.1876, + "step": 124 + }, + { + "epoch": 0.009197516670498965, + "grad_norm": 1.0078125, + "learning_rate": 4.999407259132655e-05, + "loss": 0.9813, + "step": 125 + }, + { + "epoch": 0.009271096803862957, + "grad_norm": 0.8203125, + "learning_rate": 4.9993945810540985e-05, + "loss": 0.7469, + "step": 126 + }, + { + "epoch": 0.009344676937226949, + "grad_norm": 1.171875, + "learning_rate": 4.9993817688377566e-05, + "loss": 1.1302, + "step": 127 + }, + { + "epoch": 0.00941825707059094, + "grad_norm": 1.0859375, + "learning_rate": 4.999368822484315e-05, + "loss": 0.9609, + "step": 128 + }, + { + "epoch": 0.009491837203954932, + "grad_norm": 1.3125, + "learning_rate": 4.9993557419944696e-05, + "loss": 1.1844, + "step": 129 + }, + { + "epoch": 0.009565417337318924, + "grad_norm": 0.8984375, + "learning_rate": 4.999342527368922e-05, + "loss": 0.8395, + "step": 130 + }, + { + "epoch": 0.009638997470682916, + "grad_norm": 1.0625, + "learning_rate": 4.999329178608382e-05, + "loss": 1.1009, + "step": 131 + }, + { + "epoch": 0.009712577604046908, + "grad_norm": 1.3125, + "learning_rate": 4.999315695713566e-05, + "loss": 1.1639, + "step": 132 + }, + { + "epoch": 0.0097861577374109, + "grad_norm": 1.1328125, + "learning_rate": 4.999302078685196e-05, + "loss": 1.095, + "step": 133 + }, + { + "epoch": 0.00985973787077489, + "grad_norm": 1.015625, + "learning_rate": 4.999288327524004e-05, + "loss": 1.2376, + "step": 134 + }, + { + "epoch": 0.009933318004138882, + "grad_norm": 0.85546875, + "learning_rate": 4.999274442230729e-05, + "loss": 0.8503, + "step": 135 + }, + { + "epoch": 0.010006898137502874, + "grad_norm": 1.0234375, + "learning_rate": 4.9992604228061145e-05, + "loss": 1.1394, + "step": 136 + }, + { + "epoch": 0.010080478270866866, + "grad_norm": 1.1953125, + "learning_rate": 4.999246269250914e-05, + "loss": 0.8703, + "step": 137 + }, + { + "epoch": 0.010154058404230858, + "grad_norm": 1.0703125, + "learning_rate": 4.999231981565886e-05, + "loss": 1.0047, + "step": 138 + }, + { + "epoch": 0.01022763853759485, + "grad_norm": 1.234375, + "learning_rate": 4.999217559751799e-05, + "loss": 1.3795, + "step": 139 + }, + { + "epoch": 0.010301218670958841, + "grad_norm": 1.3828125, + "learning_rate": 4.9992030038094243e-05, + "loss": 1.2553, + "step": 140 + }, + { + "epoch": 0.010374798804322833, + "grad_norm": 1.2265625, + "learning_rate": 4.999188313739546e-05, + "loss": 1.3212, + "step": 141 + }, + { + "epoch": 0.010448378937686825, + "grad_norm": 1.2890625, + "learning_rate": 4.999173489542951e-05, + "loss": 1.5042, + "step": 142 + }, + { + "epoch": 0.010521959071050817, + "grad_norm": 1.2109375, + "learning_rate": 4.999158531220434e-05, + "loss": 1.1514, + "step": 143 + }, + { + "epoch": 0.010595539204414807, + "grad_norm": 1.0078125, + "learning_rate": 4.9991434387728e-05, + "loss": 0.8765, + "step": 144 + }, + { + "epoch": 0.0106691193377788, + "grad_norm": 1.1171875, + "learning_rate": 4.999128212200858e-05, + "loss": 1.4513, + "step": 145 + }, + { + "epoch": 0.010742699471142791, + "grad_norm": 0.984375, + "learning_rate": 4.999112851505424e-05, + "loss": 0.9521, + "step": 146 + }, + { + "epoch": 0.010816279604506783, + "grad_norm": 1.03125, + "learning_rate": 4.999097356687324e-05, + "loss": 0.9354, + "step": 147 + }, + { + "epoch": 0.010889859737870775, + "grad_norm": 1.1484375, + "learning_rate": 4.999081727747389e-05, + "loss": 1.1123, + "step": 148 + }, + { + "epoch": 0.010963439871234767, + "grad_norm": 2.421875, + "learning_rate": 4.999065964686458e-05, + "loss": 0.6834, + "step": 149 + }, + { + "epoch": 0.011037020004598759, + "grad_norm": 1.1015625, + "learning_rate": 4.9990500675053765e-05, + "loss": 1.0249, + "step": 150 + }, + { + "epoch": 0.01111060013796275, + "grad_norm": 0.87890625, + "learning_rate": 4.9990340362049974e-05, + "loss": 0.7333, + "step": 151 + }, + { + "epoch": 0.011184180271326742, + "grad_norm": 1.25, + "learning_rate": 4.999017870786182e-05, + "loss": 1.1691, + "step": 152 + }, + { + "epoch": 0.011257760404690733, + "grad_norm": 1.0390625, + "learning_rate": 4.9990015712497974e-05, + "loss": 1.3583, + "step": 153 + }, + { + "epoch": 0.011331340538054725, + "grad_norm": 0.88671875, + "learning_rate": 4.998985137596719e-05, + "loss": 1.1294, + "step": 154 + }, + { + "epoch": 0.011404920671418716, + "grad_norm": 1.0703125, + "learning_rate": 4.9989685698278274e-05, + "loss": 1.2782, + "step": 155 + }, + { + "epoch": 0.011478500804782708, + "grad_norm": 1.0234375, + "learning_rate": 4.998951867944013e-05, + "loss": 0.8186, + "step": 156 + }, + { + "epoch": 0.0115520809381467, + "grad_norm": 0.984375, + "learning_rate": 4.998935031946171e-05, + "loss": 1.305, + "step": 157 + }, + { + "epoch": 0.011625661071510692, + "grad_norm": 1.046875, + "learning_rate": 4.998918061835207e-05, + "loss": 1.0957, + "step": 158 + }, + { + "epoch": 0.011699241204874684, + "grad_norm": 0.8359375, + "learning_rate": 4.998900957612029e-05, + "loss": 0.7775, + "step": 159 + }, + { + "epoch": 0.011772821338238676, + "grad_norm": 1.109375, + "learning_rate": 4.998883719277557e-05, + "loss": 0.8133, + "step": 160 + }, + { + "epoch": 0.011846401471602668, + "grad_norm": 0.765625, + "learning_rate": 4.9988663468327156e-05, + "loss": 0.6707, + "step": 161 + }, + { + "epoch": 0.01191998160496666, + "grad_norm": 0.86328125, + "learning_rate": 4.998848840278437e-05, + "loss": 1.0147, + "step": 162 + }, + { + "epoch": 0.01199356173833065, + "grad_norm": 1.546875, + "learning_rate": 4.998831199615661e-05, + "loss": 1.0418, + "step": 163 + }, + { + "epoch": 0.012067141871694642, + "grad_norm": 1.0, + "learning_rate": 4.9988134248453333e-05, + "loss": 1.0659, + "step": 164 + }, + { + "epoch": 0.012140722005058634, + "grad_norm": 1.2890625, + "learning_rate": 4.9987955159684095e-05, + "loss": 1.0881, + "step": 165 + }, + { + "epoch": 0.012214302138422626, + "grad_norm": 1.09375, + "learning_rate": 4.9987774729858494e-05, + "loss": 0.8431, + "step": 166 + }, + { + "epoch": 0.012287882271786617, + "grad_norm": 0.8671875, + "learning_rate": 4.998759295898622e-05, + "loss": 0.7996, + "step": 167 + }, + { + "epoch": 0.01236146240515061, + "grad_norm": 1.1796875, + "learning_rate": 4.9987409847077033e-05, + "loss": 1.1523, + "step": 168 + }, + { + "epoch": 0.012435042538514601, + "grad_norm": 0.82421875, + "learning_rate": 4.9987225394140744e-05, + "loss": 1.0186, + "step": 169 + }, + { + "epoch": 0.012508622671878593, + "grad_norm": 1.484375, + "learning_rate": 4.9987039600187266e-05, + "loss": 1.361, + "step": 170 + }, + { + "epoch": 0.012582202805242585, + "grad_norm": 0.8984375, + "learning_rate": 4.9986852465226574e-05, + "loss": 0.9546, + "step": 171 + }, + { + "epoch": 0.012655782938606577, + "grad_norm": 1.1171875, + "learning_rate": 4.99866639892687e-05, + "loss": 1.074, + "step": 172 + }, + { + "epoch": 0.012729363071970567, + "grad_norm": 0.80859375, + "learning_rate": 4.998647417232375e-05, + "loss": 0.7568, + "step": 173 + }, + { + "epoch": 0.012802943205334559, + "grad_norm": 0.9921875, + "learning_rate": 4.998628301440194e-05, + "loss": 1.0731, + "step": 174 + }, + { + "epoch": 0.012876523338698551, + "grad_norm": 0.95703125, + "learning_rate": 4.9986090515513506e-05, + "loss": 0.8271, + "step": 175 + }, + { + "epoch": 0.012950103472062543, + "grad_norm": 1.15625, + "learning_rate": 4.9985896675668784e-05, + "loss": 1.154, + "step": 176 + }, + { + "epoch": 0.013023683605426535, + "grad_norm": 1.09375, + "learning_rate": 4.998570149487819e-05, + "loss": 0.9304, + "step": 177 + }, + { + "epoch": 0.013097263738790527, + "grad_norm": 0.8671875, + "learning_rate": 4.998550497315218e-05, + "loss": 1.0314, + "step": 178 + }, + { + "epoch": 0.013170843872154519, + "grad_norm": 0.9375, + "learning_rate": 4.99853071105013e-05, + "loss": 0.7165, + "step": 179 + }, + { + "epoch": 0.01324442400551851, + "grad_norm": 1.03125, + "learning_rate": 4.998510790693619e-05, + "loss": 0.7941, + "step": 180 + }, + { + "epoch": 0.013318004138882502, + "grad_norm": 0.92578125, + "learning_rate": 4.9984907362467525e-05, + "loss": 0.9811, + "step": 181 + }, + { + "epoch": 0.013391584272246494, + "grad_norm": 1.2421875, + "learning_rate": 4.9984705477106076e-05, + "loss": 0.9994, + "step": 182 + }, + { + "epoch": 0.013465164405610484, + "grad_norm": 1.1171875, + "learning_rate": 4.9984502250862666e-05, + "loss": 0.9321, + "step": 183 + }, + { + "epoch": 0.013538744538974476, + "grad_norm": 0.984375, + "learning_rate": 4.998429768374822e-05, + "loss": 0.8844, + "step": 184 + }, + { + "epoch": 0.013612324672338468, + "grad_norm": 0.9453125, + "learning_rate": 4.998409177577369e-05, + "loss": 1.1324, + "step": 185 + }, + { + "epoch": 0.01368590480570246, + "grad_norm": 1.0234375, + "learning_rate": 4.998388452695015e-05, + "loss": 0.7917, + "step": 186 + }, + { + "epoch": 0.013759484939066452, + "grad_norm": 1.0078125, + "learning_rate": 4.998367593728872e-05, + "loss": 0.8042, + "step": 187 + }, + { + "epoch": 0.013833065072430444, + "grad_norm": 0.91015625, + "learning_rate": 4.998346600680059e-05, + "loss": 1.1884, + "step": 188 + }, + { + "epoch": 0.013906645205794436, + "grad_norm": 1.328125, + "learning_rate": 4.998325473549702e-05, + "loss": 1.6683, + "step": 189 + }, + { + "epoch": 0.013980225339158428, + "grad_norm": 1.1171875, + "learning_rate": 4.998304212338936e-05, + "loss": 1.213, + "step": 190 + }, + { + "epoch": 0.01405380547252242, + "grad_norm": 0.84765625, + "learning_rate": 4.998282817048902e-05, + "loss": 0.8588, + "step": 191 + }, + { + "epoch": 0.014127385605886411, + "grad_norm": 1.09375, + "learning_rate": 4.998261287680747e-05, + "loss": 1.2776, + "step": 192 + }, + { + "epoch": 0.014200965739250402, + "grad_norm": 0.875, + "learning_rate": 4.9982396242356265e-05, + "loss": 0.7973, + "step": 193 + }, + { + "epoch": 0.014274545872614394, + "grad_norm": 1.03125, + "learning_rate": 4.998217826714705e-05, + "loss": 0.9769, + "step": 194 + }, + { + "epoch": 0.014348126005978385, + "grad_norm": 1.1484375, + "learning_rate": 4.9981958951191507e-05, + "loss": 1.0079, + "step": 195 + }, + { + "epoch": 0.014421706139342377, + "grad_norm": 0.96875, + "learning_rate": 4.9981738294501416e-05, + "loss": 1.121, + "step": 196 + }, + { + "epoch": 0.01449528627270637, + "grad_norm": 1.171875, + "learning_rate": 4.998151629708861e-05, + "loss": 1.2495, + "step": 197 + }, + { + "epoch": 0.014568866406070361, + "grad_norm": 0.9609375, + "learning_rate": 4.998129295896502e-05, + "loss": 0.8233, + "step": 198 + }, + { + "epoch": 0.014642446539434353, + "grad_norm": 1.0546875, + "learning_rate": 4.998106828014261e-05, + "loss": 1.0031, + "step": 199 + }, + { + "epoch": 0.014716026672798345, + "grad_norm": 0.84765625, + "learning_rate": 4.998084226063344e-05, + "loss": 0.7791, + "step": 200 + }, + { + "epoch": 0.014789606806162337, + "grad_norm": 0.89453125, + "learning_rate": 4.998061490044966e-05, + "loss": 0.8824, + "step": 201 + }, + { + "epoch": 0.014863186939526329, + "grad_norm": 1.09375, + "learning_rate": 4.998038619960346e-05, + "loss": 1.125, + "step": 202 + }, + { + "epoch": 0.014936767072890319, + "grad_norm": 0.96484375, + "learning_rate": 4.998015615810711e-05, + "loss": 0.828, + "step": 203 + }, + { + "epoch": 0.01501034720625431, + "grad_norm": 0.953125, + "learning_rate": 4.997992477597295e-05, + "loss": 0.9252, + "step": 204 + }, + { + "epoch": 0.015083927339618303, + "grad_norm": 1.15625, + "learning_rate": 4.9979692053213425e-05, + "loss": 1.188, + "step": 205 + }, + { + "epoch": 0.015157507472982295, + "grad_norm": 1.078125, + "learning_rate": 4.9979457989841e-05, + "loss": 0.9181, + "step": 206 + }, + { + "epoch": 0.015231087606346286, + "grad_norm": 0.90234375, + "learning_rate": 4.9979222585868245e-05, + "loss": 0.6557, + "step": 207 + }, + { + "epoch": 0.015304667739710278, + "grad_norm": 1.2265625, + "learning_rate": 4.997898584130779e-05, + "loss": 1.1323, + "step": 208 + }, + { + "epoch": 0.01537824787307427, + "grad_norm": 0.90234375, + "learning_rate": 4.9978747756172345e-05, + "loss": 0.9684, + "step": 209 + }, + { + "epoch": 0.015451828006438262, + "grad_norm": 1.3828125, + "learning_rate": 4.9978508330474686e-05, + "loss": 0.9946, + "step": 210 + }, + { + "epoch": 0.015525408139802254, + "grad_norm": 0.96875, + "learning_rate": 4.9978267564227666e-05, + "loss": 0.995, + "step": 211 + }, + { + "epoch": 0.015598988273166246, + "grad_norm": 0.93359375, + "learning_rate": 4.99780254574442e-05, + "loss": 0.756, + "step": 212 + }, + { + "epoch": 0.015672568406530238, + "grad_norm": 0.8828125, + "learning_rate": 4.997778201013729e-05, + "loss": 0.9518, + "step": 213 + }, + { + "epoch": 0.015746148539894228, + "grad_norm": 1.03125, + "learning_rate": 4.997753722231998e-05, + "loss": 1.4175, + "step": 214 + }, + { + "epoch": 0.01581972867325822, + "grad_norm": 0.95703125, + "learning_rate": 4.997729109400544e-05, + "loss": 0.6918, + "step": 215 + }, + { + "epoch": 0.015893308806622212, + "grad_norm": 1.265625, + "learning_rate": 4.9977043625206853e-05, + "loss": 1.2282, + "step": 216 + }, + { + "epoch": 0.015966888939986205, + "grad_norm": 0.83984375, + "learning_rate": 4.997679481593751e-05, + "loss": 0.6038, + "step": 217 + }, + { + "epoch": 0.016040469073350196, + "grad_norm": 1.265625, + "learning_rate": 4.9976544666210765e-05, + "loss": 1.4195, + "step": 218 + }, + { + "epoch": 0.016114049206714186, + "grad_norm": 1.0390625, + "learning_rate": 4.997629317604005e-05, + "loss": 1.161, + "step": 219 + }, + { + "epoch": 0.01618762934007818, + "grad_norm": 1.234375, + "learning_rate": 4.997604034543885e-05, + "loss": 1.6678, + "step": 220 + }, + { + "epoch": 0.01626120947344217, + "grad_norm": 0.9765625, + "learning_rate": 4.997578617442073e-05, + "loss": 1.1499, + "step": 221 + }, + { + "epoch": 0.016334789606806163, + "grad_norm": 1.0234375, + "learning_rate": 4.9975530662999344e-05, + "loss": 0.8027, + "step": 222 + }, + { + "epoch": 0.016408369740170153, + "grad_norm": 1.140625, + "learning_rate": 4.997527381118839e-05, + "loss": 1.3696, + "step": 223 + }, + { + "epoch": 0.016481949873534147, + "grad_norm": 1.203125, + "learning_rate": 4.997501561900167e-05, + "loss": 1.175, + "step": 224 + }, + { + "epoch": 0.016555530006898137, + "grad_norm": 0.890625, + "learning_rate": 4.997475608645304e-05, + "loss": 1.0295, + "step": 225 + }, + { + "epoch": 0.01662911014026213, + "grad_norm": 0.85546875, + "learning_rate": 4.9974495213556414e-05, + "loss": 0.6764, + "step": 226 + }, + { + "epoch": 0.01670269027362612, + "grad_norm": 1.1015625, + "learning_rate": 4.9974233000325806e-05, + "loss": 0.9065, + "step": 227 + }, + { + "epoch": 0.01677627040699011, + "grad_norm": 1.125, + "learning_rate": 4.9973969446775275e-05, + "loss": 1.1367, + "step": 228 + }, + { + "epoch": 0.016849850540354105, + "grad_norm": 1.15625, + "learning_rate": 4.9973704552918974e-05, + "loss": 1.4926, + "step": 229 + }, + { + "epoch": 0.016923430673718095, + "grad_norm": 1.3359375, + "learning_rate": 4.997343831877112e-05, + "loss": 1.0596, + "step": 230 + }, + { + "epoch": 0.01699701080708209, + "grad_norm": 1.3203125, + "learning_rate": 4.9973170744346e-05, + "loss": 1.6111, + "step": 231 + }, + { + "epoch": 0.01707059094044608, + "grad_norm": 0.96875, + "learning_rate": 4.997290182965797e-05, + "loss": 0.8489, + "step": 232 + }, + { + "epoch": 0.017144171073810072, + "grad_norm": 0.8359375, + "learning_rate": 4.997263157472147e-05, + "loss": 0.8698, + "step": 233 + }, + { + "epoch": 0.017217751207174063, + "grad_norm": 1.2265625, + "learning_rate": 4.9972359979551e-05, + "loss": 1.535, + "step": 234 + }, + { + "epoch": 0.017291331340538056, + "grad_norm": 1.140625, + "learning_rate": 4.997208704416113e-05, + "loss": 0.9235, + "step": 235 + }, + { + "epoch": 0.017364911473902046, + "grad_norm": 0.890625, + "learning_rate": 4.9971812768566527e-05, + "loss": 0.9751, + "step": 236 + }, + { + "epoch": 0.017438491607266036, + "grad_norm": 1.03125, + "learning_rate": 4.997153715278189e-05, + "loss": 1.4104, + "step": 237 + }, + { + "epoch": 0.01751207174063003, + "grad_norm": 1.3828125, + "learning_rate": 4.9971260196822015e-05, + "loss": 1.7236, + "step": 238 + }, + { + "epoch": 0.01758565187399402, + "grad_norm": 1.0703125, + "learning_rate": 4.9970981900701776e-05, + "loss": 1.0166, + "step": 239 + }, + { + "epoch": 0.017659232007358014, + "grad_norm": 1.1015625, + "learning_rate": 4.9970702264436095e-05, + "loss": 0.9333, + "step": 240 + }, + { + "epoch": 0.017732812140722004, + "grad_norm": 1.140625, + "learning_rate": 4.9970421288039994e-05, + "loss": 1.1386, + "step": 241 + }, + { + "epoch": 0.017806392274085998, + "grad_norm": 1.1875, + "learning_rate": 4.997013897152854e-05, + "loss": 1.3691, + "step": 242 + }, + { + "epoch": 0.017879972407449988, + "grad_norm": 0.9609375, + "learning_rate": 4.996985531491688e-05, + "loss": 0.8539, + "step": 243 + }, + { + "epoch": 0.01795355254081398, + "grad_norm": 0.890625, + "learning_rate": 4.996957031822026e-05, + "loss": 0.7394, + "step": 244 + }, + { + "epoch": 0.01802713267417797, + "grad_norm": 1.0078125, + "learning_rate": 4.996928398145396e-05, + "loss": 0.9116, + "step": 245 + }, + { + "epoch": 0.018100712807541965, + "grad_norm": 1.109375, + "learning_rate": 4.996899630463335e-05, + "loss": 1.235, + "step": 246 + }, + { + "epoch": 0.018174292940905955, + "grad_norm": 0.99609375, + "learning_rate": 4.996870728777386e-05, + "loss": 0.906, + "step": 247 + }, + { + "epoch": 0.018247873074269946, + "grad_norm": 0.984375, + "learning_rate": 4.9968416930891016e-05, + "loss": 0.5867, + "step": 248 + }, + { + "epoch": 0.01832145320763394, + "grad_norm": 1.375, + "learning_rate": 4.9968125234000396e-05, + "loss": 0.7491, + "step": 249 + }, + { + "epoch": 0.01839503334099793, + "grad_norm": 1.375, + "learning_rate": 4.9967832197117645e-05, + "loss": 1.0552, + "step": 250 + }, + { + "epoch": 0.018468613474361923, + "grad_norm": 0.87890625, + "learning_rate": 4.99675378202585e-05, + "loss": 1.0264, + "step": 251 + }, + { + "epoch": 0.018542193607725913, + "grad_norm": 1.2734375, + "learning_rate": 4.996724210343876e-05, + "loss": 1.0846, + "step": 252 + }, + { + "epoch": 0.018615773741089907, + "grad_norm": 1.0546875, + "learning_rate": 4.9966945046674294e-05, + "loss": 1.0058, + "step": 253 + }, + { + "epoch": 0.018689353874453897, + "grad_norm": 1.28125, + "learning_rate": 4.996664664998104e-05, + "loss": 1.09, + "step": 254 + }, + { + "epoch": 0.01876293400781789, + "grad_norm": 0.97265625, + "learning_rate": 4.9966346913375016e-05, + "loss": 0.8106, + "step": 255 + }, + { + "epoch": 0.01883651414118188, + "grad_norm": 1.03125, + "learning_rate": 4.996604583687231e-05, + "loss": 1.338, + "step": 256 + }, + { + "epoch": 0.01891009427454587, + "grad_norm": 0.88671875, + "learning_rate": 4.9965743420489076e-05, + "loss": 0.845, + "step": 257 + }, + { + "epoch": 0.018983674407909865, + "grad_norm": 1.15625, + "learning_rate": 4.996543966424155e-05, + "loss": 1.1945, + "step": 258 + }, + { + "epoch": 0.019057254541273855, + "grad_norm": 0.91796875, + "learning_rate": 4.996513456814602e-05, + "loss": 0.8249, + "step": 259 + }, + { + "epoch": 0.01913083467463785, + "grad_norm": 1.171875, + "learning_rate": 4.996482813221888e-05, + "loss": 1.2986, + "step": 260 + }, + { + "epoch": 0.01920441480800184, + "grad_norm": 0.9296875, + "learning_rate": 4.996452035647656e-05, + "loss": 0.6122, + "step": 261 + }, + { + "epoch": 0.019277994941365832, + "grad_norm": 0.984375, + "learning_rate": 4.996421124093559e-05, + "loss": 0.8912, + "step": 262 + }, + { + "epoch": 0.019351575074729822, + "grad_norm": 0.9765625, + "learning_rate": 4.9963900785612546e-05, + "loss": 1.1519, + "step": 263 + }, + { + "epoch": 0.019425155208093816, + "grad_norm": 1.046875, + "learning_rate": 4.99635889905241e-05, + "loss": 0.8285, + "step": 264 + }, + { + "epoch": 0.019498735341457806, + "grad_norm": 0.90234375, + "learning_rate": 4.996327585568699e-05, + "loss": 1.0632, + "step": 265 + }, + { + "epoch": 0.0195723154748218, + "grad_norm": 0.90234375, + "learning_rate": 4.9962961381118005e-05, + "loss": 1.0233, + "step": 266 + }, + { + "epoch": 0.01964589560818579, + "grad_norm": 0.953125, + "learning_rate": 4.996264556683403e-05, + "loss": 0.9842, + "step": 267 + }, + { + "epoch": 0.01971947574154978, + "grad_norm": 1.171875, + "learning_rate": 4.996232841285202e-05, + "loss": 1.4804, + "step": 268 + }, + { + "epoch": 0.019793055874913774, + "grad_norm": 1.296875, + "learning_rate": 4.996200991918899e-05, + "loss": 1.5742, + "step": 269 + }, + { + "epoch": 0.019866636008277764, + "grad_norm": 0.91796875, + "learning_rate": 4.9961690085862035e-05, + "loss": 0.7653, + "step": 270 + }, + { + "epoch": 0.019940216141641758, + "grad_norm": 1.0234375, + "learning_rate": 4.996136891288832e-05, + "loss": 0.8671, + "step": 271 + }, + { + "epoch": 0.020013796275005748, + "grad_norm": 0.94921875, + "learning_rate": 4.9961046400285075e-05, + "loss": 1.0324, + "step": 272 + }, + { + "epoch": 0.02008737640836974, + "grad_norm": 1.0625, + "learning_rate": 4.9960722548069624e-05, + "loss": 0.728, + "step": 273 + }, + { + "epoch": 0.02016095654173373, + "grad_norm": 0.8671875, + "learning_rate": 4.996039735625932e-05, + "loss": 0.7373, + "step": 274 + }, + { + "epoch": 0.020234536675097725, + "grad_norm": 0.98828125, + "learning_rate": 4.996007082487165e-05, + "loss": 0.9377, + "step": 275 + }, + { + "epoch": 0.020308116808461715, + "grad_norm": 1.1484375, + "learning_rate": 4.9959742953924125e-05, + "loss": 1.3766, + "step": 276 + }, + { + "epoch": 0.020381696941825705, + "grad_norm": 1.0, + "learning_rate": 4.995941374343432e-05, + "loss": 0.9286, + "step": 277 + }, + { + "epoch": 0.0204552770751897, + "grad_norm": 1.140625, + "learning_rate": 4.9959083193419934e-05, + "loss": 1.1085, + "step": 278 + }, + { + "epoch": 0.02052885720855369, + "grad_norm": 0.859375, + "learning_rate": 4.995875130389869e-05, + "loss": 0.7682, + "step": 279 + }, + { + "epoch": 0.020602437341917683, + "grad_norm": 1.03125, + "learning_rate": 4.995841807488841e-05, + "loss": 0.9412, + "step": 280 + }, + { + "epoch": 0.020676017475281673, + "grad_norm": 0.9765625, + "learning_rate": 4.995808350640697e-05, + "loss": 1.086, + "step": 281 + }, + { + "epoch": 0.020749597608645667, + "grad_norm": 0.98046875, + "learning_rate": 4.995774759847232e-05, + "loss": 1.0259, + "step": 282 + }, + { + "epoch": 0.020823177742009657, + "grad_norm": 1.2890625, + "learning_rate": 4.995741035110249e-05, + "loss": 1.4607, + "step": 283 + }, + { + "epoch": 0.02089675787537365, + "grad_norm": 0.9375, + "learning_rate": 4.9957071764315596e-05, + "loss": 0.8889, + "step": 284 + }, + { + "epoch": 0.02097033800873764, + "grad_norm": 1.0625, + "learning_rate": 4.99567318381298e-05, + "loss": 1.0494, + "step": 285 + }, + { + "epoch": 0.021043918142101634, + "grad_norm": 1.1171875, + "learning_rate": 4.995639057256334e-05, + "loss": 1.1224, + "step": 286 + }, + { + "epoch": 0.021117498275465624, + "grad_norm": 1.1015625, + "learning_rate": 4.995604796763453e-05, + "loss": 1.0876, + "step": 287 + }, + { + "epoch": 0.021191078408829615, + "grad_norm": 0.9921875, + "learning_rate": 4.995570402336176e-05, + "loss": 0.8649, + "step": 288 + }, + { + "epoch": 0.021264658542193608, + "grad_norm": 0.796875, + "learning_rate": 4.995535873976349e-05, + "loss": 0.5695, + "step": 289 + }, + { + "epoch": 0.0213382386755576, + "grad_norm": 0.94921875, + "learning_rate": 4.9955012116858256e-05, + "loss": 0.8223, + "step": 290 + }, + { + "epoch": 0.021411818808921592, + "grad_norm": 1.125, + "learning_rate": 4.995466415466465e-05, + "loss": 1.0022, + "step": 291 + }, + { + "epoch": 0.021485398942285582, + "grad_norm": 1.0390625, + "learning_rate": 4.9954314853201355e-05, + "loss": 0.894, + "step": 292 + }, + { + "epoch": 0.021558979075649576, + "grad_norm": 0.93359375, + "learning_rate": 4.995396421248712e-05, + "loss": 0.9251, + "step": 293 + }, + { + "epoch": 0.021632559209013566, + "grad_norm": 0.796875, + "learning_rate": 4.9953612232540734e-05, + "loss": 0.7351, + "step": 294 + }, + { + "epoch": 0.02170613934237756, + "grad_norm": 1.015625, + "learning_rate": 4.9953258913381126e-05, + "loss": 1.2922, + "step": 295 + }, + { + "epoch": 0.02177971947574155, + "grad_norm": 0.68359375, + "learning_rate": 4.9952904255027246e-05, + "loss": 0.5528, + "step": 296 + }, + { + "epoch": 0.02185329960910554, + "grad_norm": 1.1328125, + "learning_rate": 4.995254825749812e-05, + "loss": 0.9348, + "step": 297 + }, + { + "epoch": 0.021926879742469534, + "grad_norm": 1.09375, + "learning_rate": 4.9952190920812856e-05, + "loss": 0.7843, + "step": 298 + }, + { + "epoch": 0.022000459875833524, + "grad_norm": 1.015625, + "learning_rate": 4.995183224499064e-05, + "loss": 1.1061, + "step": 299 + }, + { + "epoch": 0.022074040009197517, + "grad_norm": 0.97265625, + "learning_rate": 4.99514722300507e-05, + "loss": 0.8966, + "step": 300 + }, + { + "epoch": 0.022147620142561508, + "grad_norm": 0.96484375, + "learning_rate": 4.995111087601239e-05, + "loss": 0.7591, + "step": 301 + }, + { + "epoch": 0.0222212002759255, + "grad_norm": 0.9375, + "learning_rate": 4.995074818289507e-05, + "loss": 0.8156, + "step": 302 + }, + { + "epoch": 0.02229478040928949, + "grad_norm": 1.03125, + "learning_rate": 4.995038415071823e-05, + "loss": 0.7609, + "step": 303 + }, + { + "epoch": 0.022368360542653485, + "grad_norm": 1.5234375, + "learning_rate": 4.995001877950139e-05, + "loss": 1.1763, + "step": 304 + }, + { + "epoch": 0.022441940676017475, + "grad_norm": 0.859375, + "learning_rate": 4.994965206926417e-05, + "loss": 1.0027, + "step": 305 + }, + { + "epoch": 0.022515520809381465, + "grad_norm": 1.1953125, + "learning_rate": 4.994928402002625e-05, + "loss": 1.2505, + "step": 306 + }, + { + "epoch": 0.02258910094274546, + "grad_norm": 1.03125, + "learning_rate": 4.994891463180737e-05, + "loss": 1.0475, + "step": 307 + }, + { + "epoch": 0.02266268107610945, + "grad_norm": 1.0625, + "learning_rate": 4.994854390462737e-05, + "loss": 1.2729, + "step": 308 + }, + { + "epoch": 0.022736261209473443, + "grad_norm": 0.8828125, + "learning_rate": 4.994817183850614e-05, + "loss": 0.6231, + "step": 309 + }, + { + "epoch": 0.022809841342837433, + "grad_norm": 1.2578125, + "learning_rate": 4.994779843346365e-05, + "loss": 1.3261, + "step": 310 + }, + { + "epoch": 0.022883421476201427, + "grad_norm": 0.98046875, + "learning_rate": 4.994742368951993e-05, + "loss": 1.1003, + "step": 311 + }, + { + "epoch": 0.022957001609565417, + "grad_norm": 1.1640625, + "learning_rate": 4.9947047606695104e-05, + "loss": 1.0592, + "step": 312 + }, + { + "epoch": 0.02303058174292941, + "grad_norm": 0.95703125, + "learning_rate": 4.994667018500935e-05, + "loss": 0.8715, + "step": 313 + }, + { + "epoch": 0.0231041618762934, + "grad_norm": 1.453125, + "learning_rate": 4.9946291424482927e-05, + "loss": 1.2742, + "step": 314 + }, + { + "epoch": 0.023177742009657394, + "grad_norm": 1.0078125, + "learning_rate": 4.994591132513616e-05, + "loss": 1.0977, + "step": 315 + }, + { + "epoch": 0.023251322143021384, + "grad_norm": 0.8515625, + "learning_rate": 4.994552988698945e-05, + "loss": 0.8418, + "step": 316 + }, + { + "epoch": 0.023324902276385374, + "grad_norm": 1.0390625, + "learning_rate": 4.9945147110063264e-05, + "loss": 1.0759, + "step": 317 + }, + { + "epoch": 0.023398482409749368, + "grad_norm": 0.7734375, + "learning_rate": 4.994476299437814e-05, + "loss": 0.8482, + "step": 318 + }, + { + "epoch": 0.023472062543113358, + "grad_norm": 0.91796875, + "learning_rate": 4.994437753995471e-05, + "loss": 0.8143, + "step": 319 + }, + { + "epoch": 0.023545642676477352, + "grad_norm": 1.0234375, + "learning_rate": 4.994399074681364e-05, + "loss": 0.8498, + "step": 320 + }, + { + "epoch": 0.023619222809841342, + "grad_norm": 1.109375, + "learning_rate": 4.9943602614975714e-05, + "loss": 0.9454, + "step": 321 + }, + { + "epoch": 0.023692802943205336, + "grad_norm": 1.015625, + "learning_rate": 4.994321314446174e-05, + "loss": 0.7395, + "step": 322 + }, + { + "epoch": 0.023766383076569326, + "grad_norm": 0.94921875, + "learning_rate": 4.9942822335292624e-05, + "loss": 0.9949, + "step": 323 + }, + { + "epoch": 0.02383996320993332, + "grad_norm": 0.80859375, + "learning_rate": 4.9942430187489354e-05, + "loss": 0.7036, + "step": 324 + }, + { + "epoch": 0.02391354334329731, + "grad_norm": 1.1484375, + "learning_rate": 4.994203670107295e-05, + "loss": 1.0528, + "step": 325 + }, + { + "epoch": 0.0239871234766613, + "grad_norm": 0.96484375, + "learning_rate": 4.994164187606456e-05, + "loss": 0.9755, + "step": 326 + }, + { + "epoch": 0.024060703610025293, + "grad_norm": 1.109375, + "learning_rate": 4.9941245712485354e-05, + "loss": 0.9171, + "step": 327 + }, + { + "epoch": 0.024134283743389284, + "grad_norm": 1.0390625, + "learning_rate": 4.994084821035659e-05, + "loss": 1.0351, + "step": 328 + }, + { + "epoch": 0.024207863876753277, + "grad_norm": 0.953125, + "learning_rate": 4.994044936969961e-05, + "loss": 0.8692, + "step": 329 + }, + { + "epoch": 0.024281444010117267, + "grad_norm": 1.15625, + "learning_rate": 4.9940049190535833e-05, + "loss": 1.2068, + "step": 330 + }, + { + "epoch": 0.02435502414348126, + "grad_norm": 0.80078125, + "learning_rate": 4.9939647672886714e-05, + "loss": 0.896, + "step": 331 + }, + { + "epoch": 0.02442860427684525, + "grad_norm": 1.3046875, + "learning_rate": 4.99392448167738e-05, + "loss": 1.3046, + "step": 332 + }, + { + "epoch": 0.024502184410209245, + "grad_norm": 0.875, + "learning_rate": 4.993884062221873e-05, + "loss": 0.6141, + "step": 333 + }, + { + "epoch": 0.024575764543573235, + "grad_norm": 0.79296875, + "learning_rate": 4.9938435089243187e-05, + "loss": 0.8297, + "step": 334 + }, + { + "epoch": 0.02464934467693723, + "grad_norm": 1.15625, + "learning_rate": 4.993802821786893e-05, + "loss": 1.2474, + "step": 335 + }, + { + "epoch": 0.02472292481030122, + "grad_norm": 1.171875, + "learning_rate": 4.99376200081178e-05, + "loss": 1.4081, + "step": 336 + }, + { + "epoch": 0.02479650494366521, + "grad_norm": 0.8671875, + "learning_rate": 4.99372104600117e-05, + "loss": 0.8162, + "step": 337 + }, + { + "epoch": 0.024870085077029203, + "grad_norm": 1.2109375, + "learning_rate": 4.9936799573572626e-05, + "loss": 0.7549, + "step": 338 + }, + { + "epoch": 0.024943665210393193, + "grad_norm": 1.21875, + "learning_rate": 4.9936387348822604e-05, + "loss": 0.8515, + "step": 339 + }, + { + "epoch": 0.025017245343757186, + "grad_norm": 0.7734375, + "learning_rate": 4.993597378578378e-05, + "loss": 0.4967, + "step": 340 + }, + { + "epoch": 0.025090825477121177, + "grad_norm": 0.953125, + "learning_rate": 4.993555888447834e-05, + "loss": 0.7248, + "step": 341 + }, + { + "epoch": 0.02516440561048517, + "grad_norm": 0.84765625, + "learning_rate": 4.9935142644928545e-05, + "loss": 0.8738, + "step": 342 + }, + { + "epoch": 0.02523798574384916, + "grad_norm": 1.1953125, + "learning_rate": 4.993472506715675e-05, + "loss": 1.2474, + "step": 343 + }, + { + "epoch": 0.025311565877213154, + "grad_norm": 0.9609375, + "learning_rate": 4.993430615118535e-05, + "loss": 0.834, + "step": 344 + }, + { + "epoch": 0.025385146010577144, + "grad_norm": 1.0625, + "learning_rate": 4.993388589703684e-05, + "loss": 1.1228, + "step": 345 + }, + { + "epoch": 0.025458726143941134, + "grad_norm": 0.96875, + "learning_rate": 4.993346430473376e-05, + "loss": 0.8792, + "step": 346 + }, + { + "epoch": 0.025532306277305128, + "grad_norm": 0.93359375, + "learning_rate": 4.993304137429874e-05, + "loss": 0.9142, + "step": 347 + }, + { + "epoch": 0.025605886410669118, + "grad_norm": 0.89453125, + "learning_rate": 4.9932617105754486e-05, + "loss": 1.0518, + "step": 348 + }, + { + "epoch": 0.02567946654403311, + "grad_norm": 0.9921875, + "learning_rate": 4.993219149912376e-05, + "loss": 1.2483, + "step": 349 + }, + { + "epoch": 0.025753046677397102, + "grad_norm": 0.90234375, + "learning_rate": 4.993176455442941e-05, + "loss": 1.0214, + "step": 350 + }, + { + "epoch": 0.025826626810761096, + "grad_norm": 1.09375, + "learning_rate": 4.993133627169435e-05, + "loss": 1.2017, + "step": 351 + }, + { + "epoch": 0.025900206944125086, + "grad_norm": 0.8203125, + "learning_rate": 4.993090665094156e-05, + "loss": 0.7452, + "step": 352 + }, + { + "epoch": 0.02597378707748908, + "grad_norm": 1.0234375, + "learning_rate": 4.993047569219408e-05, + "loss": 1.2199, + "step": 353 + }, + { + "epoch": 0.02604736721085307, + "grad_norm": 0.8984375, + "learning_rate": 4.993004339547508e-05, + "loss": 0.9061, + "step": 354 + }, + { + "epoch": 0.026120947344217063, + "grad_norm": 1.1328125, + "learning_rate": 4.992960976080772e-05, + "loss": 0.9567, + "step": 355 + }, + { + "epoch": 0.026194527477581053, + "grad_norm": 0.8828125, + "learning_rate": 4.9929174788215296e-05, + "loss": 0.8923, + "step": 356 + }, + { + "epoch": 0.026268107610945043, + "grad_norm": 1.1875, + "learning_rate": 4.992873847772115e-05, + "loss": 0.8817, + "step": 357 + }, + { + "epoch": 0.026341687744309037, + "grad_norm": 0.69921875, + "learning_rate": 4.992830082934869e-05, + "loss": 0.6113, + "step": 358 + }, + { + "epoch": 0.026415267877673027, + "grad_norm": 1.0546875, + "learning_rate": 4.9927861843121403e-05, + "loss": 1.0466, + "step": 359 + }, + { + "epoch": 0.02648884801103702, + "grad_norm": 1.28125, + "learning_rate": 4.9927421519062864e-05, + "loss": 1.1488, + "step": 360 + }, + { + "epoch": 0.02656242814440101, + "grad_norm": 0.94140625, + "learning_rate": 4.9926979857196686e-05, + "loss": 0.8077, + "step": 361 + }, + { + "epoch": 0.026636008277765005, + "grad_norm": 1.265625, + "learning_rate": 4.992653685754658e-05, + "loss": 1.7968, + "step": 362 + }, + { + "epoch": 0.026709588411128995, + "grad_norm": 1.0, + "learning_rate": 4.992609252013632e-05, + "loss": 1.1667, + "step": 363 + }, + { + "epoch": 0.02678316854449299, + "grad_norm": 0.88671875, + "learning_rate": 4.9925646844989756e-05, + "loss": 0.9725, + "step": 364 + }, + { + "epoch": 0.02685674867785698, + "grad_norm": 0.9921875, + "learning_rate": 4.99251998321308e-05, + "loss": 1.125, + "step": 365 + }, + { + "epoch": 0.02693032881122097, + "grad_norm": 1.078125, + "learning_rate": 4.992475148158344e-05, + "loss": 1.1913, + "step": 366 + }, + { + "epoch": 0.027003908944584962, + "grad_norm": 1.0078125, + "learning_rate": 4.992430179337176e-05, + "loss": 1.0423, + "step": 367 + }, + { + "epoch": 0.027077489077948953, + "grad_norm": 0.890625, + "learning_rate": 4.9923850767519865e-05, + "loss": 0.9187, + "step": 368 + }, + { + "epoch": 0.027151069211312946, + "grad_norm": 1.296875, + "learning_rate": 4.992339840405198e-05, + "loss": 1.6005, + "step": 369 + }, + { + "epoch": 0.027224649344676936, + "grad_norm": 0.83984375, + "learning_rate": 4.9922944702992375e-05, + "loss": 0.6701, + "step": 370 + }, + { + "epoch": 0.02729822947804093, + "grad_norm": 1.078125, + "learning_rate": 4.9922489664365405e-05, + "loss": 1.3002, + "step": 371 + }, + { + "epoch": 0.02737180961140492, + "grad_norm": 1.125, + "learning_rate": 4.992203328819548e-05, + "loss": 0.8571, + "step": 372 + }, + { + "epoch": 0.027445389744768914, + "grad_norm": 1.0234375, + "learning_rate": 4.9921575574507095e-05, + "loss": 0.9139, + "step": 373 + }, + { + "epoch": 0.027518969878132904, + "grad_norm": 1.0625, + "learning_rate": 4.992111652332483e-05, + "loss": 1.1773, + "step": 374 + }, + { + "epoch": 0.027592550011496894, + "grad_norm": 1.1796875, + "learning_rate": 4.99206561346733e-05, + "loss": 1.0986, + "step": 375 + }, + { + "epoch": 0.027666130144860888, + "grad_norm": 1.2421875, + "learning_rate": 4.992019440857724e-05, + "loss": 1.3481, + "step": 376 + }, + { + "epoch": 0.027739710278224878, + "grad_norm": 1.1171875, + "learning_rate": 4.9919731345061396e-05, + "loss": 1.1635, + "step": 377 + }, + { + "epoch": 0.02781329041158887, + "grad_norm": 1.0078125, + "learning_rate": 4.9919266944150643e-05, + "loss": 1.1497, + "step": 378 + }, + { + "epoch": 0.02788687054495286, + "grad_norm": 1.0390625, + "learning_rate": 4.9918801205869904e-05, + "loss": 1.1652, + "step": 379 + }, + { + "epoch": 0.027960450678316855, + "grad_norm": 0.84765625, + "learning_rate": 4.991833413024416e-05, + "loss": 0.95, + "step": 380 + }, + { + "epoch": 0.028034030811680846, + "grad_norm": 0.95703125, + "learning_rate": 4.99178657172985e-05, + "loss": 0.8521, + "step": 381 + }, + { + "epoch": 0.02810761094504484, + "grad_norm": 0.83203125, + "learning_rate": 4.991739596705804e-05, + "loss": 1.1388, + "step": 382 + }, + { + "epoch": 0.02818119107840883, + "grad_norm": 0.85546875, + "learning_rate": 4.9916924879548e-05, + "loss": 0.9586, + "step": 383 + }, + { + "epoch": 0.028254771211772823, + "grad_norm": 0.9453125, + "learning_rate": 4.991645245479367e-05, + "loss": 0.8291, + "step": 384 + }, + { + "epoch": 0.028328351345136813, + "grad_norm": 1.1875, + "learning_rate": 4.9915978692820395e-05, + "loss": 1.2366, + "step": 385 + }, + { + "epoch": 0.028401931478500803, + "grad_norm": 0.94921875, + "learning_rate": 4.99155035936536e-05, + "loss": 0.766, + "step": 386 + }, + { + "epoch": 0.028475511611864797, + "grad_norm": 0.8984375, + "learning_rate": 4.991502715731879e-05, + "loss": 1.0383, + "step": 387 + }, + { + "epoch": 0.028549091745228787, + "grad_norm": 0.9296875, + "learning_rate": 4.991454938384153e-05, + "loss": 0.8598, + "step": 388 + }, + { + "epoch": 0.02862267187859278, + "grad_norm": 0.9921875, + "learning_rate": 4.991407027324746e-05, + "loss": 0.9006, + "step": 389 + }, + { + "epoch": 0.02869625201195677, + "grad_norm": 0.85546875, + "learning_rate": 4.9913589825562294e-05, + "loss": 0.7649, + "step": 390 + }, + { + "epoch": 0.028769832145320764, + "grad_norm": 1.1484375, + "learning_rate": 4.991310804081182e-05, + "loss": 1.4923, + "step": 391 + }, + { + "epoch": 0.028843412278684755, + "grad_norm": 0.9921875, + "learning_rate": 4.991262491902189e-05, + "loss": 1.223, + "step": 392 + }, + { + "epoch": 0.02891699241204875, + "grad_norm": 0.84765625, + "learning_rate": 4.991214046021843e-05, + "loss": 0.8991, + "step": 393 + }, + { + "epoch": 0.02899057254541274, + "grad_norm": 1.046875, + "learning_rate": 4.991165466442744e-05, + "loss": 0.9163, + "step": 394 + }, + { + "epoch": 0.02906415267877673, + "grad_norm": 0.90234375, + "learning_rate": 4.9911167531675006e-05, + "loss": 0.7929, + "step": 395 + }, + { + "epoch": 0.029137732812140722, + "grad_norm": 0.92578125, + "learning_rate": 4.9910679061987266e-05, + "loss": 1.0313, + "step": 396 + }, + { + "epoch": 0.029211312945504712, + "grad_norm": 1.28125, + "learning_rate": 4.9910189255390414e-05, + "loss": 0.9564, + "step": 397 + }, + { + "epoch": 0.029284893078868706, + "grad_norm": 0.8125, + "learning_rate": 4.990969811191076e-05, + "loss": 0.674, + "step": 398 + }, + { + "epoch": 0.029358473212232696, + "grad_norm": 0.94140625, + "learning_rate": 4.990920563157466e-05, + "loss": 0.7931, + "step": 399 + }, + { + "epoch": 0.02943205334559669, + "grad_norm": 0.9609375, + "learning_rate": 4.990871181440854e-05, + "loss": 0.898, + "step": 400 + }, + { + "epoch": 0.02950563347896068, + "grad_norm": 1.1171875, + "learning_rate": 4.99082166604389e-05, + "loss": 1.0065, + "step": 401 + }, + { + "epoch": 0.029579213612324674, + "grad_norm": 1.1328125, + "learning_rate": 4.9907720169692315e-05, + "loss": 1.0143, + "step": 402 + }, + { + "epoch": 0.029652793745688664, + "grad_norm": 0.98828125, + "learning_rate": 4.990722234219544e-05, + "loss": 1.067, + "step": 403 + }, + { + "epoch": 0.029726373879052657, + "grad_norm": 0.87890625, + "learning_rate": 4.9906723177974976e-05, + "loss": 0.8685, + "step": 404 + }, + { + "epoch": 0.029799954012416648, + "grad_norm": 0.97265625, + "learning_rate": 4.990622267705772e-05, + "loss": 0.858, + "step": 405 + }, + { + "epoch": 0.029873534145780638, + "grad_norm": 1.0234375, + "learning_rate": 4.990572083947054e-05, + "loss": 0.9264, + "step": 406 + }, + { + "epoch": 0.02994711427914463, + "grad_norm": 0.97265625, + "learning_rate": 4.990521766524037e-05, + "loss": 1.0102, + "step": 407 + }, + { + "epoch": 0.03002069441250862, + "grad_norm": 1.203125, + "learning_rate": 4.9904713154394197e-05, + "loss": 1.2105, + "step": 408 + }, + { + "epoch": 0.030094274545872615, + "grad_norm": 1.1796875, + "learning_rate": 4.9904207306959104e-05, + "loss": 1.1406, + "step": 409 + }, + { + "epoch": 0.030167854679236605, + "grad_norm": 1.0703125, + "learning_rate": 4.990370012296225e-05, + "loss": 0.9657, + "step": 410 + }, + { + "epoch": 0.0302414348126006, + "grad_norm": 1.0, + "learning_rate": 4.990319160243084e-05, + "loss": 1.0093, + "step": 411 + }, + { + "epoch": 0.03031501494596459, + "grad_norm": 1.1640625, + "learning_rate": 4.9902681745392186e-05, + "loss": 0.8834, + "step": 412 + }, + { + "epoch": 0.030388595079328583, + "grad_norm": 1.0234375, + "learning_rate": 4.990217055187362e-05, + "loss": 1.045, + "step": 413 + }, + { + "epoch": 0.030462175212692573, + "grad_norm": 1.03125, + "learning_rate": 4.990165802190261e-05, + "loss": 1.1901, + "step": 414 + }, + { + "epoch": 0.030535755346056563, + "grad_norm": 0.97265625, + "learning_rate": 4.990114415550663e-05, + "loss": 1.0724, + "step": 415 + }, + { + "epoch": 0.030609335479420557, + "grad_norm": 1.4140625, + "learning_rate": 4.990062895271329e-05, + "loss": 0.7789, + "step": 416 + }, + { + "epoch": 0.030682915612784547, + "grad_norm": 1.140625, + "learning_rate": 4.9900112413550216e-05, + "loss": 1.0514, + "step": 417 + }, + { + "epoch": 0.03075649574614854, + "grad_norm": 1.1328125, + "learning_rate": 4.9899594538045136e-05, + "loss": 0.9318, + "step": 418 + }, + { + "epoch": 0.03083007587951253, + "grad_norm": 0.86328125, + "learning_rate": 4.989907532622585e-05, + "loss": 0.7865, + "step": 419 + }, + { + "epoch": 0.030903656012876524, + "grad_norm": 0.96484375, + "learning_rate": 4.989855477812022e-05, + "loss": 1.2269, + "step": 420 + }, + { + "epoch": 0.030977236146240514, + "grad_norm": 1.140625, + "learning_rate": 4.989803289375618e-05, + "loss": 1.0722, + "step": 421 + }, + { + "epoch": 0.031050816279604508, + "grad_norm": 1.0546875, + "learning_rate": 4.989750967316174e-05, + "loss": 0.9473, + "step": 422 + }, + { + "epoch": 0.0311243964129685, + "grad_norm": 1.1328125, + "learning_rate": 4.989698511636498e-05, + "loss": 1.2507, + "step": 423 + }, + { + "epoch": 0.031197976546332492, + "grad_norm": 1.15625, + "learning_rate": 4.989645922339406e-05, + "loss": 0.9462, + "step": 424 + }, + { + "epoch": 0.03127155667969648, + "grad_norm": 1.03125, + "learning_rate": 4.9895931994277187e-05, + "loss": 1.3416, + "step": 425 + }, + { + "epoch": 0.031345136813060476, + "grad_norm": 0.79296875, + "learning_rate": 4.989540342904267e-05, + "loss": 0.8029, + "step": 426 + }, + { + "epoch": 0.031418716946424466, + "grad_norm": 0.8984375, + "learning_rate": 4.989487352771887e-05, + "loss": 1.3004, + "step": 427 + }, + { + "epoch": 0.031492297079788456, + "grad_norm": 0.8359375, + "learning_rate": 4.9894342290334227e-05, + "loss": 0.7786, + "step": 428 + }, + { + "epoch": 0.031565877213152446, + "grad_norm": 0.88671875, + "learning_rate": 4.989380971691725e-05, + "loss": 0.834, + "step": 429 + }, + { + "epoch": 0.03163945734651644, + "grad_norm": 0.76953125, + "learning_rate": 4.989327580749653e-05, + "loss": 0.6176, + "step": 430 + }, + { + "epoch": 0.031713037479880433, + "grad_norm": 1.0703125, + "learning_rate": 4.989274056210071e-05, + "loss": 1.259, + "step": 431 + }, + { + "epoch": 0.031786617613244424, + "grad_norm": 1.03125, + "learning_rate": 4.989220398075852e-05, + "loss": 0.9209, + "step": 432 + }, + { + "epoch": 0.031860197746608414, + "grad_norm": 1.0859375, + "learning_rate": 4.9891666063498756e-05, + "loss": 0.8342, + "step": 433 + }, + { + "epoch": 0.03193377787997241, + "grad_norm": 0.87109375, + "learning_rate": 4.989112681035029e-05, + "loss": 0.747, + "step": 434 + }, + { + "epoch": 0.0320073580133364, + "grad_norm": 0.94140625, + "learning_rate": 4.9890586221342064e-05, + "loss": 1.0977, + "step": 435 + }, + { + "epoch": 0.03208093814670039, + "grad_norm": 0.890625, + "learning_rate": 4.989004429650308e-05, + "loss": 0.8333, + "step": 436 + }, + { + "epoch": 0.03215451828006438, + "grad_norm": 1.1484375, + "learning_rate": 4.9889501035862426e-05, + "loss": 1.3032, + "step": 437 + }, + { + "epoch": 0.03222809841342837, + "grad_norm": 0.80859375, + "learning_rate": 4.9888956439449274e-05, + "loss": 0.6674, + "step": 438 + }, + { + "epoch": 0.03230167854679237, + "grad_norm": 0.96484375, + "learning_rate": 4.9888410507292825e-05, + "loss": 0.7147, + "step": 439 + }, + { + "epoch": 0.03237525868015636, + "grad_norm": 0.89453125, + "learning_rate": 4.988786323942241e-05, + "loss": 0.9657, + "step": 440 + }, + { + "epoch": 0.03244883881352035, + "grad_norm": 1.0625, + "learning_rate": 4.988731463586737e-05, + "loss": 1.1865, + "step": 441 + }, + { + "epoch": 0.03252241894688434, + "grad_norm": 0.96484375, + "learning_rate": 4.988676469665715e-05, + "loss": 0.7875, + "step": 442 + }, + { + "epoch": 0.032595999080248336, + "grad_norm": 1.1015625, + "learning_rate": 4.988621342182128e-05, + "loss": 1.2024, + "step": 443 + }, + { + "epoch": 0.032669579213612326, + "grad_norm": 1.2734375, + "learning_rate": 4.9885660811389347e-05, + "loss": 1.3675, + "step": 444 + }, + { + "epoch": 0.03274315934697632, + "grad_norm": 0.97265625, + "learning_rate": 4.988510686539099e-05, + "loss": 1.0933, + "step": 445 + }, + { + "epoch": 0.03281673948034031, + "grad_norm": 0.96484375, + "learning_rate": 4.9884551583855954e-05, + "loss": 0.7227, + "step": 446 + }, + { + "epoch": 0.0328903196137043, + "grad_norm": 0.8828125, + "learning_rate": 4.9883994966814034e-05, + "loss": 0.6531, + "step": 447 + }, + { + "epoch": 0.032963899747068294, + "grad_norm": 0.91796875, + "learning_rate": 4.98834370142951e-05, + "loss": 0.8958, + "step": 448 + }, + { + "epoch": 0.033037479880432284, + "grad_norm": 0.91796875, + "learning_rate": 4.9882877726329106e-05, + "loss": 0.6407, + "step": 449 + }, + { + "epoch": 0.033111060013796274, + "grad_norm": 0.83203125, + "learning_rate": 4.988231710294606e-05, + "loss": 0.8751, + "step": 450 + }, + { + "epoch": 0.033184640147160265, + "grad_norm": 0.890625, + "learning_rate": 4.9881755144176044e-05, + "loss": 0.7451, + "step": 451 + }, + { + "epoch": 0.03325822028052426, + "grad_norm": 1.234375, + "learning_rate": 4.988119185004923e-05, + "loss": 0.927, + "step": 452 + }, + { + "epoch": 0.03333180041388825, + "grad_norm": 1.171875, + "learning_rate": 4.988062722059585e-05, + "loss": 1.1373, + "step": 453 + }, + { + "epoch": 0.03340538054725224, + "grad_norm": 0.98046875, + "learning_rate": 4.988006125584619e-05, + "loss": 1.0013, + "step": 454 + }, + { + "epoch": 0.03347896068061623, + "grad_norm": 1.078125, + "learning_rate": 4.987949395583064e-05, + "loss": 0.8468, + "step": 455 + }, + { + "epoch": 0.03355254081398022, + "grad_norm": 0.96875, + "learning_rate": 4.987892532057964e-05, + "loss": 0.9814, + "step": 456 + }, + { + "epoch": 0.03362612094734422, + "grad_norm": 0.9765625, + "learning_rate": 4.98783553501237e-05, + "loss": 1.2678, + "step": 457 + }, + { + "epoch": 0.03369970108070821, + "grad_norm": 0.90234375, + "learning_rate": 4.987778404449342e-05, + "loss": 0.8017, + "step": 458 + }, + { + "epoch": 0.0337732812140722, + "grad_norm": 1.078125, + "learning_rate": 4.987721140371946e-05, + "loss": 1.1828, + "step": 459 + }, + { + "epoch": 0.03384686134743619, + "grad_norm": 1.1015625, + "learning_rate": 4.987663742783255e-05, + "loss": 1.1213, + "step": 460 + }, + { + "epoch": 0.03392044148080019, + "grad_norm": 0.87109375, + "learning_rate": 4.98760621168635e-05, + "loss": 0.8114, + "step": 461 + }, + { + "epoch": 0.03399402161416418, + "grad_norm": 1.03125, + "learning_rate": 4.987548547084317e-05, + "loss": 0.9586, + "step": 462 + }, + { + "epoch": 0.03406760174752817, + "grad_norm": 0.94921875, + "learning_rate": 4.9874907489802526e-05, + "loss": 1.0378, + "step": 463 + }, + { + "epoch": 0.03414118188089216, + "grad_norm": 0.99609375, + "learning_rate": 4.987432817377258e-05, + "loss": 1.0166, + "step": 464 + }, + { + "epoch": 0.03421476201425615, + "grad_norm": 1.03125, + "learning_rate": 4.987374752278441e-05, + "loss": 1.1897, + "step": 465 + }, + { + "epoch": 0.034288342147620145, + "grad_norm": 1.0390625, + "learning_rate": 4.987316553686921e-05, + "loss": 0.9125, + "step": 466 + }, + { + "epoch": 0.034361922280984135, + "grad_norm": 1.1171875, + "learning_rate": 4.9872582216058174e-05, + "loss": 0.8171, + "step": 467 + }, + { + "epoch": 0.034435502414348125, + "grad_norm": 1.203125, + "learning_rate": 4.987199756038263e-05, + "loss": 1.4177, + "step": 468 + }, + { + "epoch": 0.034509082547712115, + "grad_norm": 0.9609375, + "learning_rate": 4.987141156987396e-05, + "loss": 1.0405, + "step": 469 + }, + { + "epoch": 0.03458266268107611, + "grad_norm": 1.109375, + "learning_rate": 4.987082424456361e-05, + "loss": 1.4172, + "step": 470 + }, + { + "epoch": 0.0346562428144401, + "grad_norm": 0.84765625, + "learning_rate": 4.9870235584483096e-05, + "loss": 0.5733, + "step": 471 + }, + { + "epoch": 0.03472982294780409, + "grad_norm": 1.0078125, + "learning_rate": 4.9869645589664e-05, + "loss": 1.2018, + "step": 472 + }, + { + "epoch": 0.03480340308116808, + "grad_norm": 0.921875, + "learning_rate": 4.9869054260138015e-05, + "loss": 0.7661, + "step": 473 + }, + { + "epoch": 0.03487698321453207, + "grad_norm": 1.03125, + "learning_rate": 4.986846159593685e-05, + "loss": 0.9603, + "step": 474 + }, + { + "epoch": 0.03495056334789607, + "grad_norm": 0.9609375, + "learning_rate": 4.986786759709232e-05, + "loss": 1.0224, + "step": 475 + }, + { + "epoch": 0.03502414348126006, + "grad_norm": 1.1484375, + "learning_rate": 4.986727226363631e-05, + "loss": 1.3507, + "step": 476 + }, + { + "epoch": 0.03509772361462405, + "grad_norm": 1.265625, + "learning_rate": 4.986667559560075e-05, + "loss": 1.1848, + "step": 477 + }, + { + "epoch": 0.03517130374798804, + "grad_norm": 0.921875, + "learning_rate": 4.98660775930177e-05, + "loss": 0.8361, + "step": 478 + }, + { + "epoch": 0.03524488388135204, + "grad_norm": 1.21875, + "learning_rate": 4.986547825591922e-05, + "loss": 1.3683, + "step": 479 + }, + { + "epoch": 0.03531846401471603, + "grad_norm": 0.7890625, + "learning_rate": 4.986487758433748e-05, + "loss": 0.673, + "step": 480 + }, + { + "epoch": 0.03539204414808002, + "grad_norm": 0.9375, + "learning_rate": 4.986427557830473e-05, + "loss": 0.9089, + "step": 481 + }, + { + "epoch": 0.03546562428144401, + "grad_norm": 0.8984375, + "learning_rate": 4.9863672237853274e-05, + "loss": 0.6803, + "step": 482 + }, + { + "epoch": 0.035539204414808005, + "grad_norm": 0.98046875, + "learning_rate": 4.986306756301548e-05, + "loss": 0.9858, + "step": 483 + }, + { + "epoch": 0.035612784548171995, + "grad_norm": 1.0390625, + "learning_rate": 4.986246155382381e-05, + "loss": 0.9134, + "step": 484 + }, + { + "epoch": 0.035686364681535986, + "grad_norm": 0.9453125, + "learning_rate": 4.98618542103108e-05, + "loss": 1.0081, + "step": 485 + }, + { + "epoch": 0.035759944814899976, + "grad_norm": 0.953125, + "learning_rate": 4.986124553250903e-05, + "loss": 1.3159, + "step": 486 + }, + { + "epoch": 0.035833524948263966, + "grad_norm": 1.03125, + "learning_rate": 4.986063552045116e-05, + "loss": 1.2957, + "step": 487 + }, + { + "epoch": 0.03590710508162796, + "grad_norm": 0.9609375, + "learning_rate": 4.9860024174169936e-05, + "loss": 0.8686, + "step": 488 + }, + { + "epoch": 0.03598068521499195, + "grad_norm": 1.1640625, + "learning_rate": 4.985941149369817e-05, + "loss": 0.875, + "step": 489 + }, + { + "epoch": 0.03605426534835594, + "grad_norm": 1.0390625, + "learning_rate": 4.9858797479068746e-05, + "loss": 0.9396, + "step": 490 + }, + { + "epoch": 0.036127845481719933, + "grad_norm": 0.90234375, + "learning_rate": 4.985818213031461e-05, + "loss": 0.9777, + "step": 491 + }, + { + "epoch": 0.03620142561508393, + "grad_norm": 0.9921875, + "learning_rate": 4.985756544746879e-05, + "loss": 1.4404, + "step": 492 + }, + { + "epoch": 0.03627500574844792, + "grad_norm": 0.8046875, + "learning_rate": 4.985694743056438e-05, + "loss": 0.8215, + "step": 493 + }, + { + "epoch": 0.03634858588181191, + "grad_norm": 1.1015625, + "learning_rate": 4.985632807963456e-05, + "loss": 1.0838, + "step": 494 + }, + { + "epoch": 0.0364221660151759, + "grad_norm": 0.83203125, + "learning_rate": 4.9855707394712546e-05, + "loss": 0.9544, + "step": 495 + }, + { + "epoch": 0.03649574614853989, + "grad_norm": 1.3828125, + "learning_rate": 4.9855085375831665e-05, + "loss": 1.1742, + "step": 496 + }, + { + "epoch": 0.03656932628190389, + "grad_norm": 1.015625, + "learning_rate": 4.98544620230253e-05, + "loss": 1.3149, + "step": 497 + }, + { + "epoch": 0.03664290641526788, + "grad_norm": 1.09375, + "learning_rate": 4.98538373363269e-05, + "loss": 0.9338, + "step": 498 + }, + { + "epoch": 0.03671648654863187, + "grad_norm": 1.015625, + "learning_rate": 4.985321131576999e-05, + "loss": 1.1112, + "step": 499 + }, + { + "epoch": 0.03679006668199586, + "grad_norm": 0.87890625, + "learning_rate": 4.985258396138817e-05, + "loss": 1.1511, + "step": 500 + }, + { + "epoch": 0.036863646815359856, + "grad_norm": 0.96484375, + "learning_rate": 4.985195527321511e-05, + "loss": 0.9077, + "step": 501 + }, + { + "epoch": 0.036937226948723846, + "grad_norm": 0.9453125, + "learning_rate": 4.985132525128455e-05, + "loss": 0.801, + "step": 502 + }, + { + "epoch": 0.037010807082087836, + "grad_norm": 1.140625, + "learning_rate": 4.9850693895630305e-05, + "loss": 1.2966, + "step": 503 + }, + { + "epoch": 0.037084387215451826, + "grad_norm": 0.78125, + "learning_rate": 4.985006120628625e-05, + "loss": 0.6195, + "step": 504 + }, + { + "epoch": 0.03715796734881582, + "grad_norm": 0.8828125, + "learning_rate": 4.984942718328635e-05, + "loss": 1.0418, + "step": 505 + }, + { + "epoch": 0.037231547482179814, + "grad_norm": 1.0234375, + "learning_rate": 4.984879182666462e-05, + "loss": 1.0322, + "step": 506 + }, + { + "epoch": 0.037305127615543804, + "grad_norm": 0.7890625, + "learning_rate": 4.984815513645516e-05, + "loss": 1.0583, + "step": 507 + }, + { + "epoch": 0.037378707748907794, + "grad_norm": 0.99609375, + "learning_rate": 4.984751711269216e-05, + "loss": 0.9566, + "step": 508 + }, + { + "epoch": 0.037452287882271784, + "grad_norm": 1.3359375, + "learning_rate": 4.9846877755409836e-05, + "loss": 1.5125, + "step": 509 + }, + { + "epoch": 0.03752586801563578, + "grad_norm": 1.0234375, + "learning_rate": 4.984623706464252e-05, + "loss": 0.9958, + "step": 510 + }, + { + "epoch": 0.03759944814899977, + "grad_norm": 0.90625, + "learning_rate": 4.984559504042459e-05, + "loss": 0.9692, + "step": 511 + }, + { + "epoch": 0.03767302828236376, + "grad_norm": 0.88671875, + "learning_rate": 4.984495168279049e-05, + "loss": 0.8629, + "step": 512 + }, + { + "epoch": 0.03774660841572775, + "grad_norm": 0.828125, + "learning_rate": 4.984430699177477e-05, + "loss": 0.7285, + "step": 513 + }, + { + "epoch": 0.03782018854909174, + "grad_norm": 0.875, + "learning_rate": 4.984366096741201e-05, + "loss": 0.6445, + "step": 514 + }, + { + "epoch": 0.03789376868245574, + "grad_norm": 0.9296875, + "learning_rate": 4.984301360973689e-05, + "loss": 0.963, + "step": 515 + }, + { + "epoch": 0.03796734881581973, + "grad_norm": 1.0625, + "learning_rate": 4.984236491878415e-05, + "loss": 0.9509, + "step": 516 + }, + { + "epoch": 0.03804092894918372, + "grad_norm": 1.1328125, + "learning_rate": 4.984171489458861e-05, + "loss": 0.7679, + "step": 517 + }, + { + "epoch": 0.03811450908254771, + "grad_norm": 0.890625, + "learning_rate": 4.984106353718515e-05, + "loss": 1.0823, + "step": 518 + }, + { + "epoch": 0.03818808921591171, + "grad_norm": 0.8203125, + "learning_rate": 4.984041084660872e-05, + "loss": 0.7272, + "step": 519 + }, + { + "epoch": 0.0382616693492757, + "grad_norm": 0.97265625, + "learning_rate": 4.983975682289437e-05, + "loss": 0.9714, + "step": 520 + }, + { + "epoch": 0.03833524948263969, + "grad_norm": 1.0625, + "learning_rate": 4.9839101466077173e-05, + "loss": 1.0269, + "step": 521 + }, + { + "epoch": 0.03840882961600368, + "grad_norm": 0.98828125, + "learning_rate": 4.983844477619233e-05, + "loss": 0.883, + "step": 522 + }, + { + "epoch": 0.03848240974936767, + "grad_norm": 0.78125, + "learning_rate": 4.983778675327506e-05, + "loss": 0.7904, + "step": 523 + }, + { + "epoch": 0.038555989882731664, + "grad_norm": 1.078125, + "learning_rate": 4.9837127397360684e-05, + "loss": 1.0432, + "step": 524 + }, + { + "epoch": 0.038629570016095655, + "grad_norm": 0.93359375, + "learning_rate": 4.98364667084846e-05, + "loss": 1.0571, + "step": 525 + }, + { + "epoch": 0.038703150149459645, + "grad_norm": 0.953125, + "learning_rate": 4.983580468668225e-05, + "loss": 0.7834, + "step": 526 + }, + { + "epoch": 0.038776730282823635, + "grad_norm": 0.95703125, + "learning_rate": 4.9835141331989175e-05, + "loss": 0.9083, + "step": 527 + }, + { + "epoch": 0.03885031041618763, + "grad_norm": 1.0546875, + "learning_rate": 4.983447664444097e-05, + "loss": 1.0536, + "step": 528 + }, + { + "epoch": 0.03892389054955162, + "grad_norm": 1.03125, + "learning_rate": 4.983381062407331e-05, + "loss": 1.0827, + "step": 529 + }, + { + "epoch": 0.03899747068291561, + "grad_norm": 0.8984375, + "learning_rate": 4.9833143270921936e-05, + "loss": 0.891, + "step": 530 + }, + { + "epoch": 0.0390710508162796, + "grad_norm": 1.15625, + "learning_rate": 4.983247458502267e-05, + "loss": 0.8906, + "step": 531 + }, + { + "epoch": 0.0391446309496436, + "grad_norm": 1.03125, + "learning_rate": 4.983180456641139e-05, + "loss": 0.7901, + "step": 532 + }, + { + "epoch": 0.03921821108300759, + "grad_norm": 0.95703125, + "learning_rate": 4.983113321512406e-05, + "loss": 0.8367, + "step": 533 + }, + { + "epoch": 0.03929179121637158, + "grad_norm": 1.1953125, + "learning_rate": 4.983046053119671e-05, + "loss": 1.4232, + "step": 534 + }, + { + "epoch": 0.03936537134973557, + "grad_norm": 0.97265625, + "learning_rate": 4.982978651466544e-05, + "loss": 0.9988, + "step": 535 + }, + { + "epoch": 0.03943895148309956, + "grad_norm": 1.109375, + "learning_rate": 4.982911116556643e-05, + "loss": 1.3246, + "step": 536 + }, + { + "epoch": 0.03951253161646356, + "grad_norm": 0.87109375, + "learning_rate": 4.9828434483935915e-05, + "loss": 0.6563, + "step": 537 + }, + { + "epoch": 0.03958611174982755, + "grad_norm": 1.2265625, + "learning_rate": 4.9827756469810216e-05, + "loss": 1.0454, + "step": 538 + }, + { + "epoch": 0.03965969188319154, + "grad_norm": 0.92578125, + "learning_rate": 4.982707712322572e-05, + "loss": 0.6676, + "step": 539 + }, + { + "epoch": 0.03973327201655553, + "grad_norm": 1.0546875, + "learning_rate": 4.982639644421889e-05, + "loss": 0.8891, + "step": 540 + }, + { + "epoch": 0.039806852149919525, + "grad_norm": 1.1796875, + "learning_rate": 4.982571443282625e-05, + "loss": 0.9849, + "step": 541 + }, + { + "epoch": 0.039880432283283515, + "grad_norm": 0.99609375, + "learning_rate": 4.982503108908441e-05, + "loss": 0.9774, + "step": 542 + }, + { + "epoch": 0.039954012416647505, + "grad_norm": 0.9765625, + "learning_rate": 4.982434641303003e-05, + "loss": 0.8891, + "step": 543 + }, + { + "epoch": 0.040027592550011495, + "grad_norm": 0.984375, + "learning_rate": 4.982366040469988e-05, + "loss": 0.7822, + "step": 544 + }, + { + "epoch": 0.040101172683375486, + "grad_norm": 1.1171875, + "learning_rate": 4.982297306413075e-05, + "loss": 1.4514, + "step": 545 + }, + { + "epoch": 0.04017475281673948, + "grad_norm": 0.8359375, + "learning_rate": 4.982228439135954e-05, + "loss": 0.6972, + "step": 546 + }, + { + "epoch": 0.04024833295010347, + "grad_norm": 1.125, + "learning_rate": 4.982159438642321e-05, + "loss": 0.8759, + "step": 547 + }, + { + "epoch": 0.04032191308346746, + "grad_norm": 0.80078125, + "learning_rate": 4.982090304935879e-05, + "loss": 0.6461, + "step": 548 + }, + { + "epoch": 0.04039549321683145, + "grad_norm": 0.796875, + "learning_rate": 4.982021038020338e-05, + "loss": 0.6901, + "step": 549 + }, + { + "epoch": 0.04046907335019545, + "grad_norm": 0.92578125, + "learning_rate": 4.981951637899417e-05, + "loss": 1.034, + "step": 550 + }, + { + "epoch": 0.04054265348355944, + "grad_norm": 1.34375, + "learning_rate": 4.9818821045768384e-05, + "loss": 1.7883, + "step": 551 + }, + { + "epoch": 0.04061623361692343, + "grad_norm": 0.83203125, + "learning_rate": 4.981812438056335e-05, + "loss": 0.6118, + "step": 552 + }, + { + "epoch": 0.04068981375028742, + "grad_norm": 1.0390625, + "learning_rate": 4.9817426383416456e-05, + "loss": 1.4013, + "step": 553 + }, + { + "epoch": 0.04076339388365141, + "grad_norm": 0.90625, + "learning_rate": 4.981672705436516e-05, + "loss": 0.8169, + "step": 554 + }, + { + "epoch": 0.04083697401701541, + "grad_norm": 1.3046875, + "learning_rate": 4.981602639344699e-05, + "loss": 1.2825, + "step": 555 + }, + { + "epoch": 0.0409105541503794, + "grad_norm": 1.0546875, + "learning_rate": 4.9815324400699564e-05, + "loss": 0.9921, + "step": 556 + }, + { + "epoch": 0.04098413428374339, + "grad_norm": 1.0546875, + "learning_rate": 4.981462107616054e-05, + "loss": 0.7899, + "step": 557 + }, + { + "epoch": 0.04105771441710738, + "grad_norm": 0.890625, + "learning_rate": 4.981391641986768e-05, + "loss": 0.8169, + "step": 558 + }, + { + "epoch": 0.041131294550471376, + "grad_norm": 0.89453125, + "learning_rate": 4.981321043185878e-05, + "loss": 0.7177, + "step": 559 + }, + { + "epoch": 0.041204874683835366, + "grad_norm": 0.7421875, + "learning_rate": 4.9812503112171746e-05, + "loss": 0.6038, + "step": 560 + }, + { + "epoch": 0.041278454817199356, + "grad_norm": 0.8125, + "learning_rate": 4.981179446084454e-05, + "loss": 0.6604, + "step": 561 + }, + { + "epoch": 0.041352034950563346, + "grad_norm": 0.80078125, + "learning_rate": 4.9811084477915184e-05, + "loss": 0.7858, + "step": 562 + }, + { + "epoch": 0.041425615083927336, + "grad_norm": 0.78125, + "learning_rate": 4.9810373163421786e-05, + "loss": 0.6443, + "step": 563 + }, + { + "epoch": 0.04149919521729133, + "grad_norm": 0.7265625, + "learning_rate": 4.980966051740252e-05, + "loss": 0.6786, + "step": 564 + }, + { + "epoch": 0.041572775350655324, + "grad_norm": 1.078125, + "learning_rate": 4.980894653989563e-05, + "loss": 1.1585, + "step": 565 + }, + { + "epoch": 0.041646355484019314, + "grad_norm": 1.0, + "learning_rate": 4.980823123093945e-05, + "loss": 0.679, + "step": 566 + }, + { + "epoch": 0.041719935617383304, + "grad_norm": 1.1875, + "learning_rate": 4.980751459057234e-05, + "loss": 1.1809, + "step": 567 + }, + { + "epoch": 0.0417935157507473, + "grad_norm": 0.85546875, + "learning_rate": 4.9806796618832786e-05, + "loss": 0.8619, + "step": 568 + }, + { + "epoch": 0.04186709588411129, + "grad_norm": 1.1015625, + "learning_rate": 4.98060773157593e-05, + "loss": 0.9854, + "step": 569 + }, + { + "epoch": 0.04194067601747528, + "grad_norm": 0.87109375, + "learning_rate": 4.980535668139051e-05, + "loss": 0.9795, + "step": 570 + }, + { + "epoch": 0.04201425615083927, + "grad_norm": 1.15625, + "learning_rate": 4.980463471576507e-05, + "loss": 1.1124, + "step": 571 + }, + { + "epoch": 0.04208783628420327, + "grad_norm": 1.015625, + "learning_rate": 4.9803911418921736e-05, + "loss": 0.9321, + "step": 572 + }, + { + "epoch": 0.04216141641756726, + "grad_norm": 0.98046875, + "learning_rate": 4.980318679089933e-05, + "loss": 0.8999, + "step": 573 + }, + { + "epoch": 0.04223499655093125, + "grad_norm": 0.921875, + "learning_rate": 4.980246083173672e-05, + "loss": 0.7831, + "step": 574 + }, + { + "epoch": 0.04230857668429524, + "grad_norm": 0.890625, + "learning_rate": 4.98017335414729e-05, + "loss": 0.9732, + "step": 575 + }, + { + "epoch": 0.04238215681765923, + "grad_norm": 0.921875, + "learning_rate": 4.980100492014687e-05, + "loss": 0.7804, + "step": 576 + }, + { + "epoch": 0.042455736951023226, + "grad_norm": 1.03125, + "learning_rate": 4.9800274967797755e-05, + "loss": 1.0128, + "step": 577 + }, + { + "epoch": 0.042529317084387216, + "grad_norm": 0.953125, + "learning_rate": 4.979954368446472e-05, + "loss": 1.0641, + "step": 578 + }, + { + "epoch": 0.04260289721775121, + "grad_norm": 1.0546875, + "learning_rate": 4.9798811070187025e-05, + "loss": 1.2838, + "step": 579 + }, + { + "epoch": 0.0426764773511152, + "grad_norm": 0.84765625, + "learning_rate": 4.979807712500397e-05, + "loss": 0.8511, + "step": 580 + }, + { + "epoch": 0.042750057484479194, + "grad_norm": 0.8671875, + "learning_rate": 4.979734184895495e-05, + "loss": 0.8227, + "step": 581 + }, + { + "epoch": 0.042823637617843184, + "grad_norm": 0.76953125, + "learning_rate": 4.979660524207943e-05, + "loss": 0.6334, + "step": 582 + }, + { + "epoch": 0.042897217751207174, + "grad_norm": 0.90625, + "learning_rate": 4.9795867304416945e-05, + "loss": 1.0858, + "step": 583 + }, + { + "epoch": 0.042970797884571164, + "grad_norm": 1.0, + "learning_rate": 4.9795128036007096e-05, + "loss": 0.9474, + "step": 584 + }, + { + "epoch": 0.043044378017935155, + "grad_norm": 0.75, + "learning_rate": 4.9794387436889555e-05, + "loss": 0.5507, + "step": 585 + }, + { + "epoch": 0.04311795815129915, + "grad_norm": 0.8046875, + "learning_rate": 4.979364550710407e-05, + "loss": 0.7823, + "step": 586 + }, + { + "epoch": 0.04319153828466314, + "grad_norm": 0.734375, + "learning_rate": 4.979290224669045e-05, + "loss": 0.8691, + "step": 587 + }, + { + "epoch": 0.04326511841802713, + "grad_norm": 0.796875, + "learning_rate": 4.97921576556886e-05, + "loss": 0.795, + "step": 588 + }, + { + "epoch": 0.04333869855139112, + "grad_norm": 1.015625, + "learning_rate": 4.979141173413848e-05, + "loss": 0.9836, + "step": 589 + }, + { + "epoch": 0.04341227868475512, + "grad_norm": 1.109375, + "learning_rate": 4.9790664482080114e-05, + "loss": 1.2645, + "step": 590 + }, + { + "epoch": 0.04348585881811911, + "grad_norm": 0.84375, + "learning_rate": 4.978991589955361e-05, + "loss": 0.7881, + "step": 591 + }, + { + "epoch": 0.0435594389514831, + "grad_norm": 1.046875, + "learning_rate": 4.978916598659913e-05, + "loss": 1.2378, + "step": 592 + }, + { + "epoch": 0.04363301908484709, + "grad_norm": 0.890625, + "learning_rate": 4.978841474325694e-05, + "loss": 1.2396, + "step": 593 + }, + { + "epoch": 0.04370659921821108, + "grad_norm": 1.140625, + "learning_rate": 4.978766216956735e-05, + "loss": 0.9337, + "step": 594 + }, + { + "epoch": 0.04378017935157508, + "grad_norm": 0.86328125, + "learning_rate": 4.9786908265570746e-05, + "loss": 0.7447, + "step": 595 + }, + { + "epoch": 0.04385375948493907, + "grad_norm": 0.8984375, + "learning_rate": 4.978615303130759e-05, + "loss": 0.862, + "step": 596 + }, + { + "epoch": 0.04392733961830306, + "grad_norm": 0.98046875, + "learning_rate": 4.9785396466818426e-05, + "loss": 1.2706, + "step": 597 + }, + { + "epoch": 0.04400091975166705, + "grad_norm": 1.046875, + "learning_rate": 4.9784638572143835e-05, + "loss": 1.3343, + "step": 598 + }, + { + "epoch": 0.044074499885031045, + "grad_norm": 1.0546875, + "learning_rate": 4.9783879347324503e-05, + "loss": 1.4976, + "step": 599 + }, + { + "epoch": 0.044148080018395035, + "grad_norm": 0.875, + "learning_rate": 4.978311879240118e-05, + "loss": 0.8082, + "step": 600 + }, + { + "epoch": 0.044221660151759025, + "grad_norm": 0.9609375, + "learning_rate": 4.9782356907414686e-05, + "loss": 0.9675, + "step": 601 + }, + { + "epoch": 0.044295240285123015, + "grad_norm": 0.93359375, + "learning_rate": 4.9781593692405896e-05, + "loss": 0.8128, + "step": 602 + }, + { + "epoch": 0.044368820418487005, + "grad_norm": 1.125, + "learning_rate": 4.978082914741577e-05, + "loss": 1.0967, + "step": 603 + }, + { + "epoch": 0.044442400551851, + "grad_norm": 1.0546875, + "learning_rate": 4.978006327248537e-05, + "loss": 0.8955, + "step": 604 + }, + { + "epoch": 0.04451598068521499, + "grad_norm": 2.078125, + "learning_rate": 4.977929606765576e-05, + "loss": 1.1202, + "step": 605 + }, + { + "epoch": 0.04458956081857898, + "grad_norm": 0.8984375, + "learning_rate": 4.977852753296814e-05, + "loss": 0.8542, + "step": 606 + }, + { + "epoch": 0.04466314095194297, + "grad_norm": 0.82421875, + "learning_rate": 4.9777757668463744e-05, + "loss": 0.7782, + "step": 607 + }, + { + "epoch": 0.04473672108530697, + "grad_norm": 1.1015625, + "learning_rate": 4.9776986474183894e-05, + "loss": 0.8192, + "step": 608 + }, + { + "epoch": 0.04481030121867096, + "grad_norm": 1.203125, + "learning_rate": 4.977621395016998e-05, + "loss": 1.1688, + "step": 609 + }, + { + "epoch": 0.04488388135203495, + "grad_norm": 1.0234375, + "learning_rate": 4.9775440096463447e-05, + "loss": 0.7741, + "step": 610 + }, + { + "epoch": 0.04495746148539894, + "grad_norm": 1.1640625, + "learning_rate": 4.977466491310585e-05, + "loss": 1.3333, + "step": 611 + }, + { + "epoch": 0.04503104161876293, + "grad_norm": 0.83984375, + "learning_rate": 4.977388840013877e-05, + "loss": 0.7828, + "step": 612 + }, + { + "epoch": 0.04510462175212693, + "grad_norm": 0.82421875, + "learning_rate": 4.97731105576039e-05, + "loss": 0.9337, + "step": 613 + }, + { + "epoch": 0.04517820188549092, + "grad_norm": 0.97265625, + "learning_rate": 4.977233138554297e-05, + "loss": 0.9152, + "step": 614 + }, + { + "epoch": 0.04525178201885491, + "grad_norm": 0.9375, + "learning_rate": 4.977155088399781e-05, + "loss": 0.6864, + "step": 615 + }, + { + "epoch": 0.0453253621522189, + "grad_norm": 1.046875, + "learning_rate": 4.977076905301029e-05, + "loss": 0.883, + "step": 616 + }, + { + "epoch": 0.045398942285582895, + "grad_norm": 1.0, + "learning_rate": 4.9769985892622393e-05, + "loss": 0.9834, + "step": 617 + }, + { + "epoch": 0.045472522418946885, + "grad_norm": 0.87890625, + "learning_rate": 4.976920140287613e-05, + "loss": 0.9779, + "step": 618 + }, + { + "epoch": 0.045546102552310876, + "grad_norm": 0.9921875, + "learning_rate": 4.976841558381361e-05, + "loss": 0.8692, + "step": 619 + }, + { + "epoch": 0.045619682685674866, + "grad_norm": 1.15625, + "learning_rate": 4.976762843547701e-05, + "loss": 1.6504, + "step": 620 + }, + { + "epoch": 0.04569326281903886, + "grad_norm": 0.859375, + "learning_rate": 4.976683995790856e-05, + "loss": 0.8912, + "step": 621 + }, + { + "epoch": 0.04576684295240285, + "grad_norm": 1.09375, + "learning_rate": 4.97660501511506e-05, + "loss": 1.1648, + "step": 622 + }, + { + "epoch": 0.04584042308576684, + "grad_norm": 0.96484375, + "learning_rate": 4.97652590152455e-05, + "loss": 0.9922, + "step": 623 + }, + { + "epoch": 0.04591400321913083, + "grad_norm": 0.8828125, + "learning_rate": 4.976446655023572e-05, + "loss": 0.9775, + "step": 624 + }, + { + "epoch": 0.045987583352494824, + "grad_norm": 1.046875, + "learning_rate": 4.9763672756163804e-05, + "loss": 1.387, + "step": 625 + }, + { + "epoch": 0.04606116348585882, + "grad_norm": 0.90234375, + "learning_rate": 4.976287763307234e-05, + "loss": 0.9029, + "step": 626 + }, + { + "epoch": 0.04613474361922281, + "grad_norm": 0.76171875, + "learning_rate": 4.976208118100399e-05, + "loss": 0.7147, + "step": 627 + }, + { + "epoch": 0.0462083237525868, + "grad_norm": 1.140625, + "learning_rate": 4.976128340000153e-05, + "loss": 1.1444, + "step": 628 + }, + { + "epoch": 0.04628190388595079, + "grad_norm": 1.0390625, + "learning_rate": 4.976048429010775e-05, + "loss": 0.9068, + "step": 629 + }, + { + "epoch": 0.04635548401931479, + "grad_norm": 0.98046875, + "learning_rate": 4.9759683851365545e-05, + "loss": 0.7561, + "step": 630 + }, + { + "epoch": 0.04642906415267878, + "grad_norm": 0.859375, + "learning_rate": 4.975888208381787e-05, + "loss": 0.6867, + "step": 631 + }, + { + "epoch": 0.04650264428604277, + "grad_norm": 1.375, + "learning_rate": 4.975807898750776e-05, + "loss": 1.1213, + "step": 632 + }, + { + "epoch": 0.04657622441940676, + "grad_norm": 1.0390625, + "learning_rate": 4.975727456247831e-05, + "loss": 0.7334, + "step": 633 + }, + { + "epoch": 0.04664980455277075, + "grad_norm": 1.0625, + "learning_rate": 4.9756468808772696e-05, + "loss": 0.6628, + "step": 634 + }, + { + "epoch": 0.046723384686134746, + "grad_norm": 0.99609375, + "learning_rate": 4.975566172643415e-05, + "loss": 0.7996, + "step": 635 + }, + { + "epoch": 0.046796964819498736, + "grad_norm": 0.7734375, + "learning_rate": 4.975485331550601e-05, + "loss": 0.6378, + "step": 636 + }, + { + "epoch": 0.046870544952862726, + "grad_norm": 0.8515625, + "learning_rate": 4.9754043576031636e-05, + "loss": 0.9027, + "step": 637 + }, + { + "epoch": 0.046944125086226716, + "grad_norm": 1.2421875, + "learning_rate": 4.9753232508054506e-05, + "loss": 1.0751, + "step": 638 + }, + { + "epoch": 0.047017705219590714, + "grad_norm": 0.9140625, + "learning_rate": 4.9752420111618136e-05, + "loss": 0.7651, + "step": 639 + }, + { + "epoch": 0.047091285352954704, + "grad_norm": 0.9296875, + "learning_rate": 4.975160638676612e-05, + "loss": 1.1114, + "step": 640 + }, + { + "epoch": 0.047164865486318694, + "grad_norm": 1.0625, + "learning_rate": 4.9750791333542154e-05, + "loss": 1.0256, + "step": 641 + }, + { + "epoch": 0.047238445619682684, + "grad_norm": 0.828125, + "learning_rate": 4.974997495198996e-05, + "loss": 0.6675, + "step": 642 + }, + { + "epoch": 0.047312025753046674, + "grad_norm": 1.0390625, + "learning_rate": 4.9749157242153354e-05, + "loss": 1.1, + "step": 643 + }, + { + "epoch": 0.04738560588641067, + "grad_norm": 1.3828125, + "learning_rate": 4.974833820407622e-05, + "loss": 1.2962, + "step": 644 + }, + { + "epoch": 0.04745918601977466, + "grad_norm": 1.0859375, + "learning_rate": 4.974751783780253e-05, + "loss": 1.1137, + "step": 645 + }, + { + "epoch": 0.04753276615313865, + "grad_norm": 0.93359375, + "learning_rate": 4.974669614337628e-05, + "loss": 0.8745, + "step": 646 + }, + { + "epoch": 0.04760634628650264, + "grad_norm": 1.046875, + "learning_rate": 4.9745873120841603e-05, + "loss": 1.1793, + "step": 647 + }, + { + "epoch": 0.04767992641986664, + "grad_norm": 1.109375, + "learning_rate": 4.9745048770242645e-05, + "loss": 1.2452, + "step": 648 + }, + { + "epoch": 0.04775350655323063, + "grad_norm": 0.94921875, + "learning_rate": 4.974422309162366e-05, + "loss": 1.0531, + "step": 649 + }, + { + "epoch": 0.04782708668659462, + "grad_norm": 0.8203125, + "learning_rate": 4.974339608502896e-05, + "loss": 0.6883, + "step": 650 + }, + { + "epoch": 0.04790066681995861, + "grad_norm": 1.1484375, + "learning_rate": 4.974256775050292e-05, + "loss": 1.2653, + "step": 651 + }, + { + "epoch": 0.0479742469533226, + "grad_norm": 1.125, + "learning_rate": 4.9741738088090006e-05, + "loss": 1.1366, + "step": 652 + }, + { + "epoch": 0.0480478270866866, + "grad_norm": 1.078125, + "learning_rate": 4.974090709783474e-05, + "loss": 1.3409, + "step": 653 + }, + { + "epoch": 0.04812140722005059, + "grad_norm": 0.96875, + "learning_rate": 4.974007477978171e-05, + "loss": 1.1575, + "step": 654 + }, + { + "epoch": 0.04819498735341458, + "grad_norm": 1.234375, + "learning_rate": 4.97392411339756e-05, + "loss": 1.6028, + "step": 655 + }, + { + "epoch": 0.04826856748677857, + "grad_norm": 0.96875, + "learning_rate": 4.973840616046115e-05, + "loss": 0.7437, + "step": 656 + }, + { + "epoch": 0.048342147620142564, + "grad_norm": 1.0390625, + "learning_rate": 4.973756985928316e-05, + "loss": 0.8768, + "step": 657 + }, + { + "epoch": 0.048415727753506554, + "grad_norm": 0.9375, + "learning_rate": 4.973673223048652e-05, + "loss": 0.885, + "step": 658 + }, + { + "epoch": 0.048489307886870545, + "grad_norm": 1.234375, + "learning_rate": 4.973589327411617e-05, + "loss": 1.4515, + "step": 659 + }, + { + "epoch": 0.048562888020234535, + "grad_norm": 1.1796875, + "learning_rate": 4.9735052990217165e-05, + "loss": 1.393, + "step": 660 + }, + { + "epoch": 0.048636468153598525, + "grad_norm": 1.1640625, + "learning_rate": 4.973421137883458e-05, + "loss": 1.3093, + "step": 661 + }, + { + "epoch": 0.04871004828696252, + "grad_norm": 0.80859375, + "learning_rate": 4.9733368440013585e-05, + "loss": 0.8184, + "step": 662 + }, + { + "epoch": 0.04878362842032651, + "grad_norm": 0.8984375, + "learning_rate": 4.973252417379941e-05, + "loss": 0.8445, + "step": 663 + }, + { + "epoch": 0.0488572085536905, + "grad_norm": 0.77734375, + "learning_rate": 4.9731678580237386e-05, + "loss": 0.6878, + "step": 664 + }, + { + "epoch": 0.04893078868705449, + "grad_norm": 0.91015625, + "learning_rate": 4.9730831659372886e-05, + "loss": 0.7854, + "step": 665 + }, + { + "epoch": 0.04900436882041849, + "grad_norm": 0.85546875, + "learning_rate": 4.9729983411251355e-05, + "loss": 0.8962, + "step": 666 + }, + { + "epoch": 0.04907794895378248, + "grad_norm": 0.8984375, + "learning_rate": 4.972913383591833e-05, + "loss": 1.0992, + "step": 667 + }, + { + "epoch": 0.04915152908714647, + "grad_norm": 0.796875, + "learning_rate": 4.972828293341939e-05, + "loss": 0.7466, + "step": 668 + }, + { + "epoch": 0.04922510922051046, + "grad_norm": 1.2265625, + "learning_rate": 4.9727430703800214e-05, + "loss": 1.155, + "step": 669 + }, + { + "epoch": 0.04929868935387446, + "grad_norm": 0.96875, + "learning_rate": 4.972657714710653e-05, + "loss": 0.9673, + "step": 670 + }, + { + "epoch": 0.04937226948723845, + "grad_norm": 1.3046875, + "learning_rate": 4.972572226338416e-05, + "loss": 1.392, + "step": 671 + }, + { + "epoch": 0.04944584962060244, + "grad_norm": 1.015625, + "learning_rate": 4.9724866052678974e-05, + "loss": 1.004, + "step": 672 + }, + { + "epoch": 0.04951942975396643, + "grad_norm": 0.73828125, + "learning_rate": 4.972400851503693e-05, + "loss": 0.6966, + "step": 673 + }, + { + "epoch": 0.04959300988733042, + "grad_norm": 0.98046875, + "learning_rate": 4.972314965050404e-05, + "loss": 1.0663, + "step": 674 + }, + { + "epoch": 0.049666590020694415, + "grad_norm": 0.83203125, + "learning_rate": 4.97222894591264e-05, + "loss": 0.9235, + "step": 675 + }, + { + "epoch": 0.049740170154058405, + "grad_norm": 0.92578125, + "learning_rate": 4.972142794095019e-05, + "loss": 0.9954, + "step": 676 + }, + { + "epoch": 0.049813750287422395, + "grad_norm": 1.03125, + "learning_rate": 4.972056509602163e-05, + "loss": 1.1419, + "step": 677 + }, + { + "epoch": 0.049887330420786385, + "grad_norm": 1.125, + "learning_rate": 4.971970092438702e-05, + "loss": 0.7516, + "step": 678 + }, + { + "epoch": 0.04996091055415038, + "grad_norm": 0.9375, + "learning_rate": 4.9718835426092766e-05, + "loss": 1.015, + "step": 679 + }, + { + "epoch": 0.05003449068751437, + "grad_norm": 0.76171875, + "learning_rate": 4.97179686011853e-05, + "loss": 0.5791, + "step": 680 + }, + { + "epoch": 0.05010807082087836, + "grad_norm": 1.03125, + "learning_rate": 4.971710044971114e-05, + "loss": 1.0074, + "step": 681 + }, + { + "epoch": 0.05018165095424235, + "grad_norm": 1.265625, + "learning_rate": 4.971623097171688e-05, + "loss": 0.9841, + "step": 682 + }, + { + "epoch": 0.05025523108760634, + "grad_norm": 1.125, + "learning_rate": 4.971536016724919e-05, + "loss": 0.8017, + "step": 683 + }, + { + "epoch": 0.05032881122097034, + "grad_norm": 1.0703125, + "learning_rate": 4.9714488036354803e-05, + "loss": 1.0498, + "step": 684 + }, + { + "epoch": 0.05040239135433433, + "grad_norm": 1.046875, + "learning_rate": 4.971361457908053e-05, + "loss": 1.3636, + "step": 685 + }, + { + "epoch": 0.05047597148769832, + "grad_norm": 1.15625, + "learning_rate": 4.971273979547322e-05, + "loss": 0.8792, + "step": 686 + }, + { + "epoch": 0.05054955162106231, + "grad_norm": 1.0390625, + "learning_rate": 4.9711863685579855e-05, + "loss": 0.9422, + "step": 687 + }, + { + "epoch": 0.05062313175442631, + "grad_norm": 1.015625, + "learning_rate": 4.9710986249447436e-05, + "loss": 0.8948, + "step": 688 + }, + { + "epoch": 0.0506967118877903, + "grad_norm": 0.83203125, + "learning_rate": 4.9710107487123054e-05, + "loss": 0.8737, + "step": 689 + }, + { + "epoch": 0.05077029202115429, + "grad_norm": 1.21875, + "learning_rate": 4.970922739865388e-05, + "loss": 1.5617, + "step": 690 + }, + { + "epoch": 0.05084387215451828, + "grad_norm": 0.7734375, + "learning_rate": 4.9708345984087137e-05, + "loss": 0.7197, + "step": 691 + }, + { + "epoch": 0.05091745228788227, + "grad_norm": 0.94140625, + "learning_rate": 4.970746324347013e-05, + "loss": 1.0125, + "step": 692 + }, + { + "epoch": 0.050991032421246266, + "grad_norm": 0.84375, + "learning_rate": 4.9706579176850246e-05, + "loss": 0.742, + "step": 693 + }, + { + "epoch": 0.051064612554610256, + "grad_norm": 0.73046875, + "learning_rate": 4.970569378427491e-05, + "loss": 0.6562, + "step": 694 + }, + { + "epoch": 0.051138192687974246, + "grad_norm": 1.140625, + "learning_rate": 4.9704807065791656e-05, + "loss": 1.3955, + "step": 695 + }, + { + "epoch": 0.051211772821338236, + "grad_norm": 0.984375, + "learning_rate": 4.970391902144806e-05, + "loss": 0.8316, + "step": 696 + }, + { + "epoch": 0.05128535295470223, + "grad_norm": 0.828125, + "learning_rate": 4.9703029651291806e-05, + "loss": 0.9463, + "step": 697 + }, + { + "epoch": 0.05135893308806622, + "grad_norm": 1.03125, + "learning_rate": 4.97021389553706e-05, + "loss": 1.069, + "step": 698 + }, + { + "epoch": 0.051432513221430214, + "grad_norm": 0.9375, + "learning_rate": 4.970124693373225e-05, + "loss": 0.939, + "step": 699 + }, + { + "epoch": 0.051506093354794204, + "grad_norm": 0.921875, + "learning_rate": 4.9700353586424634e-05, + "loss": 0.7969, + "step": 700 + }, + { + "epoch": 0.051579673488158194, + "grad_norm": 1.25, + "learning_rate": 4.969945891349569e-05, + "loss": 1.1327, + "step": 701 + }, + { + "epoch": 0.05165325362152219, + "grad_norm": 1.1015625, + "learning_rate": 4.969856291499344e-05, + "loss": 1.6125, + "step": 702 + }, + { + "epoch": 0.05172683375488618, + "grad_norm": 0.94140625, + "learning_rate": 4.9697665590965964e-05, + "loss": 0.6791, + "step": 703 + }, + { + "epoch": 0.05180041388825017, + "grad_norm": 0.84765625, + "learning_rate": 4.969676694146143e-05, + "loss": 0.8961, + "step": 704 + }, + { + "epoch": 0.05187399402161416, + "grad_norm": 0.88671875, + "learning_rate": 4.9695866966528046e-05, + "loss": 0.89, + "step": 705 + }, + { + "epoch": 0.05194757415497816, + "grad_norm": 1.1328125, + "learning_rate": 4.9694965666214136e-05, + "loss": 1.1385, + "step": 706 + }, + { + "epoch": 0.05202115428834215, + "grad_norm": 0.89453125, + "learning_rate": 4.969406304056806e-05, + "loss": 0.8156, + "step": 707 + }, + { + "epoch": 0.05209473442170614, + "grad_norm": 0.98046875, + "learning_rate": 4.969315908963826e-05, + "loss": 0.7964, + "step": 708 + }, + { + "epoch": 0.05216831455507013, + "grad_norm": 1.0, + "learning_rate": 4.969225381347325e-05, + "loss": 1.0184, + "step": 709 + }, + { + "epoch": 0.052241894688434126, + "grad_norm": 1.015625, + "learning_rate": 4.969134721212161e-05, + "loss": 0.986, + "step": 710 + }, + { + "epoch": 0.052315474821798116, + "grad_norm": 1.1328125, + "learning_rate": 4.9690439285632006e-05, + "loss": 1.3072, + "step": 711 + }, + { + "epoch": 0.052389054955162107, + "grad_norm": 0.98046875, + "learning_rate": 4.9689530034053165e-05, + "loss": 1.2645, + "step": 712 + }, + { + "epoch": 0.0524626350885261, + "grad_norm": 1.203125, + "learning_rate": 4.968861945743387e-05, + "loss": 1.2966, + "step": 713 + }, + { + "epoch": 0.05253621522189009, + "grad_norm": 0.88671875, + "learning_rate": 4.968770755582301e-05, + "loss": 0.8053, + "step": 714 + }, + { + "epoch": 0.052609795355254084, + "grad_norm": 0.96875, + "learning_rate": 4.96867943292695e-05, + "loss": 0.9755, + "step": 715 + }, + { + "epoch": 0.052683375488618074, + "grad_norm": 0.9609375, + "learning_rate": 4.968587977782237e-05, + "loss": 0.8545, + "step": 716 + }, + { + "epoch": 0.052756955621982064, + "grad_norm": 0.93359375, + "learning_rate": 4.96849639015307e-05, + "loss": 0.998, + "step": 717 + }, + { + "epoch": 0.052830535755346054, + "grad_norm": 0.85546875, + "learning_rate": 4.9684046700443636e-05, + "loss": 1.0482, + "step": 718 + }, + { + "epoch": 0.05290411588871005, + "grad_norm": 1.0546875, + "learning_rate": 4.968312817461041e-05, + "loss": 1.1994, + "step": 719 + }, + { + "epoch": 0.05297769602207404, + "grad_norm": 0.93359375, + "learning_rate": 4.968220832408031e-05, + "loss": 0.9737, + "step": 720 + }, + { + "epoch": 0.05305127615543803, + "grad_norm": 1.15625, + "learning_rate": 4.968128714890272e-05, + "loss": 1.1198, + "step": 721 + }, + { + "epoch": 0.05312485628880202, + "grad_norm": 0.9296875, + "learning_rate": 4.9680364649127054e-05, + "loss": 0.7691, + "step": 722 + }, + { + "epoch": 0.05319843642216601, + "grad_norm": 1.1015625, + "learning_rate": 4.967944082480284e-05, + "loss": 1.0819, + "step": 723 + }, + { + "epoch": 0.05327201655553001, + "grad_norm": 1.421875, + "learning_rate": 4.967851567597964e-05, + "loss": 1.0186, + "step": 724 + }, + { + "epoch": 0.053345596688894, + "grad_norm": 0.99609375, + "learning_rate": 4.967758920270712e-05, + "loss": 1.0298, + "step": 725 + }, + { + "epoch": 0.05341917682225799, + "grad_norm": 1.03125, + "learning_rate": 4.9676661405035e-05, + "loss": 0.9255, + "step": 726 + }, + { + "epoch": 0.05349275695562198, + "grad_norm": 1.0234375, + "learning_rate": 4.9675732283013064e-05, + "loss": 1.2375, + "step": 727 + }, + { + "epoch": 0.05356633708898598, + "grad_norm": 0.95703125, + "learning_rate": 4.967480183669118e-05, + "loss": 1.0639, + "step": 728 + }, + { + "epoch": 0.05363991722234997, + "grad_norm": 1.1484375, + "learning_rate": 4.9673870066119294e-05, + "loss": 1.2414, + "step": 729 + }, + { + "epoch": 0.05371349735571396, + "grad_norm": 1.0625, + "learning_rate": 4.9672936971347394e-05, + "loss": 1.0286, + "step": 730 + }, + { + "epoch": 0.05378707748907795, + "grad_norm": 0.95703125, + "learning_rate": 4.967200255242558e-05, + "loss": 1.2123, + "step": 731 + }, + { + "epoch": 0.05386065762244194, + "grad_norm": 1.03125, + "learning_rate": 4.9671066809403976e-05, + "loss": 0.9672, + "step": 732 + }, + { + "epoch": 0.053934237755805935, + "grad_norm": 0.9921875, + "learning_rate": 4.967012974233282e-05, + "loss": 0.9823, + "step": 733 + }, + { + "epoch": 0.054007817889169925, + "grad_norm": 1.015625, + "learning_rate": 4.966919135126239e-05, + "loss": 0.7568, + "step": 734 + }, + { + "epoch": 0.054081398022533915, + "grad_norm": 1.03125, + "learning_rate": 4.9668251636243065e-05, + "loss": 1.4507, + "step": 735 + }, + { + "epoch": 0.054154978155897905, + "grad_norm": 1.0390625, + "learning_rate": 4.9667310597325255e-05, + "loss": 1.4083, + "step": 736 + }, + { + "epoch": 0.0542285582892619, + "grad_norm": 0.921875, + "learning_rate": 4.966636823455948e-05, + "loss": 0.9924, + "step": 737 + }, + { + "epoch": 0.05430213842262589, + "grad_norm": 0.96484375, + "learning_rate": 4.9665424547996306e-05, + "loss": 0.9729, + "step": 738 + }, + { + "epoch": 0.05437571855598988, + "grad_norm": 0.9140625, + "learning_rate": 4.966447953768639e-05, + "loss": 0.9148, + "step": 739 + }, + { + "epoch": 0.05444929868935387, + "grad_norm": 1.1484375, + "learning_rate": 4.966353320368044e-05, + "loss": 1.2248, + "step": 740 + }, + { + "epoch": 0.05452287882271786, + "grad_norm": 1.4375, + "learning_rate": 4.9662585546029246e-05, + "loss": 1.2476, + "step": 741 + }, + { + "epoch": 0.05459645895608186, + "grad_norm": 0.77734375, + "learning_rate": 4.9661636564783664e-05, + "loss": 0.703, + "step": 742 + }, + { + "epoch": 0.05467003908944585, + "grad_norm": 0.9140625, + "learning_rate": 4.966068625999463e-05, + "loss": 0.9597, + "step": 743 + }, + { + "epoch": 0.05474361922280984, + "grad_norm": 1.0703125, + "learning_rate": 4.965973463171314e-05, + "loss": 0.8979, + "step": 744 + }, + { + "epoch": 0.05481719935617383, + "grad_norm": 0.875, + "learning_rate": 4.9658781679990265e-05, + "loss": 0.8228, + "step": 745 + }, + { + "epoch": 0.05489077948953783, + "grad_norm": 0.81640625, + "learning_rate": 4.965782740487715e-05, + "loss": 0.7687, + "step": 746 + }, + { + "epoch": 0.05496435962290182, + "grad_norm": 1.0234375, + "learning_rate": 4.965687180642501e-05, + "loss": 0.897, + "step": 747 + }, + { + "epoch": 0.05503793975626581, + "grad_norm": 1.1953125, + "learning_rate": 4.965591488468514e-05, + "loss": 0.9762, + "step": 748 + }, + { + "epoch": 0.0551115198896298, + "grad_norm": 1.0234375, + "learning_rate": 4.9654956639708885e-05, + "loss": 1.0854, + "step": 749 + }, + { + "epoch": 0.05518510002299379, + "grad_norm": 1.078125, + "learning_rate": 4.965399707154766e-05, + "loss": 0.8447, + "step": 750 + }, + { + "epoch": 0.055258680156357785, + "grad_norm": 0.87890625, + "learning_rate": 4.965303618025299e-05, + "loss": 1.0321, + "step": 751 + }, + { + "epoch": 0.055332260289721776, + "grad_norm": 0.94921875, + "learning_rate": 4.9652073965876425e-05, + "loss": 0.9771, + "step": 752 + }, + { + "epoch": 0.055405840423085766, + "grad_norm": 0.8828125, + "learning_rate": 4.9651110428469614e-05, + "loss": 0.6425, + "step": 753 + }, + { + "epoch": 0.055479420556449756, + "grad_norm": 0.921875, + "learning_rate": 4.9650145568084266e-05, + "loss": 0.84, + "step": 754 + }, + { + "epoch": 0.05555300068981375, + "grad_norm": 1.0, + "learning_rate": 4.964917938477216e-05, + "loss": 1.1714, + "step": 755 + }, + { + "epoch": 0.05562658082317774, + "grad_norm": 0.91015625, + "learning_rate": 4.964821187858516e-05, + "loss": 0.7757, + "step": 756 + }, + { + "epoch": 0.05570016095654173, + "grad_norm": 1.015625, + "learning_rate": 4.964724304957518e-05, + "loss": 0.9814, + "step": 757 + }, + { + "epoch": 0.05577374108990572, + "grad_norm": 0.9765625, + "learning_rate": 4.964627289779421e-05, + "loss": 0.7757, + "step": 758 + }, + { + "epoch": 0.05584732122326972, + "grad_norm": 0.71875, + "learning_rate": 4.964530142329433e-05, + "loss": 0.6509, + "step": 759 + }, + { + "epoch": 0.05592090135663371, + "grad_norm": 0.92578125, + "learning_rate": 4.964432862612767e-05, + "loss": 0.8458, + "step": 760 + }, + { + "epoch": 0.0559944814899977, + "grad_norm": 1.171875, + "learning_rate": 4.964335450634644e-05, + "loss": 1.7253, + "step": 761 + }, + { + "epoch": 0.05606806162336169, + "grad_norm": 1.3203125, + "learning_rate": 4.964237906400291e-05, + "loss": 1.1215, + "step": 762 + }, + { + "epoch": 0.05614164175672568, + "grad_norm": 0.84765625, + "learning_rate": 4.964140229914944e-05, + "loss": 0.8251, + "step": 763 + }, + { + "epoch": 0.05621522189008968, + "grad_norm": 1.0703125, + "learning_rate": 4.964042421183846e-05, + "loss": 1.0398, + "step": 764 + }, + { + "epoch": 0.05628880202345367, + "grad_norm": 1.046875, + "learning_rate": 4.9639444802122445e-05, + "loss": 0.9634, + "step": 765 + }, + { + "epoch": 0.05636238215681766, + "grad_norm": 0.87109375, + "learning_rate": 4.963846407005397e-05, + "loss": 1.187, + "step": 766 + }, + { + "epoch": 0.05643596229018165, + "grad_norm": 0.8359375, + "learning_rate": 4.963748201568565e-05, + "loss": 0.861, + "step": 767 + }, + { + "epoch": 0.056509542423545646, + "grad_norm": 0.93359375, + "learning_rate": 4.963649863907022e-05, + "loss": 0.7786, + "step": 768 + }, + { + "epoch": 0.056583122556909636, + "grad_norm": 1.1015625, + "learning_rate": 4.9635513940260434e-05, + "loss": 1.0423, + "step": 769 + }, + { + "epoch": 0.056656702690273626, + "grad_norm": 0.97265625, + "learning_rate": 4.9634527919309145e-05, + "loss": 1.2678, + "step": 770 + }, + { + "epoch": 0.056730282823637616, + "grad_norm": 0.93359375, + "learning_rate": 4.9633540576269265e-05, + "loss": 0.9998, + "step": 771 + }, + { + "epoch": 0.05680386295700161, + "grad_norm": 0.69140625, + "learning_rate": 4.96325519111938e-05, + "loss": 0.6784, + "step": 772 + }, + { + "epoch": 0.056877443090365604, + "grad_norm": 0.98046875, + "learning_rate": 4.963156192413579e-05, + "loss": 0.9524, + "step": 773 + }, + { + "epoch": 0.056951023223729594, + "grad_norm": 0.90234375, + "learning_rate": 4.963057061514838e-05, + "loss": 1.2812, + "step": 774 + }, + { + "epoch": 0.057024603357093584, + "grad_norm": 0.98828125, + "learning_rate": 4.962957798428475e-05, + "loss": 0.8375, + "step": 775 + }, + { + "epoch": 0.057098183490457574, + "grad_norm": 0.74609375, + "learning_rate": 4.96285840315982e-05, + "loss": 0.6929, + "step": 776 + }, + { + "epoch": 0.05717176362382157, + "grad_norm": 1.15625, + "learning_rate": 4.962758875714206e-05, + "loss": 0.6131, + "step": 777 + }, + { + "epoch": 0.05724534375718556, + "grad_norm": 1.078125, + "learning_rate": 4.962659216096976e-05, + "loss": 1.079, + "step": 778 + }, + { + "epoch": 0.05731892389054955, + "grad_norm": 0.78515625, + "learning_rate": 4.9625594243134755e-05, + "loss": 0.8023, + "step": 779 + }, + { + "epoch": 0.05739250402391354, + "grad_norm": 0.76953125, + "learning_rate": 4.962459500369062e-05, + "loss": 0.6319, + "step": 780 + }, + { + "epoch": 0.05746608415727753, + "grad_norm": 1.1796875, + "learning_rate": 4.962359444269098e-05, + "loss": 0.9587, + "step": 781 + }, + { + "epoch": 0.05753966429064153, + "grad_norm": 1.1015625, + "learning_rate": 4.962259256018954e-05, + "loss": 1.3477, + "step": 782 + }, + { + "epoch": 0.05761324442400552, + "grad_norm": 1.03125, + "learning_rate": 4.962158935624006e-05, + "loss": 1.0746, + "step": 783 + }, + { + "epoch": 0.05768682455736951, + "grad_norm": 0.9765625, + "learning_rate": 4.962058483089638e-05, + "loss": 0.9957, + "step": 784 + }, + { + "epoch": 0.0577604046907335, + "grad_norm": 1.40625, + "learning_rate": 4.9619578984212415e-05, + "loss": 1.3374, + "step": 785 + }, + { + "epoch": 0.0578339848240975, + "grad_norm": 1.1640625, + "learning_rate": 4.961857181624214e-05, + "loss": 1.3233, + "step": 786 + }, + { + "epoch": 0.05790756495746149, + "grad_norm": 1.2421875, + "learning_rate": 4.961756332703962e-05, + "loss": 1.5237, + "step": 787 + }, + { + "epoch": 0.05798114509082548, + "grad_norm": 0.84375, + "learning_rate": 4.9616553516658974e-05, + "loss": 1.0261, + "step": 788 + }, + { + "epoch": 0.05805472522418947, + "grad_norm": 0.8359375, + "learning_rate": 4.961554238515439e-05, + "loss": 0.7153, + "step": 789 + }, + { + "epoch": 0.05812830535755346, + "grad_norm": 1.046875, + "learning_rate": 4.961452993258015e-05, + "loss": 1.025, + "step": 790 + }, + { + "epoch": 0.058201885490917454, + "grad_norm": 1.109375, + "learning_rate": 4.961351615899057e-05, + "loss": 1.0908, + "step": 791 + }, + { + "epoch": 0.058275465624281444, + "grad_norm": 1.046875, + "learning_rate": 4.961250106444007e-05, + "loss": 1.0561, + "step": 792 + }, + { + "epoch": 0.058349045757645435, + "grad_norm": 1.109375, + "learning_rate": 4.9611484648983106e-05, + "loss": 1.1467, + "step": 793 + }, + { + "epoch": 0.058422625891009425, + "grad_norm": 1.109375, + "learning_rate": 4.961046691267427e-05, + "loss": 1.1308, + "step": 794 + }, + { + "epoch": 0.05849620602437342, + "grad_norm": 0.94921875, + "learning_rate": 4.960944785556814e-05, + "loss": 1.2358, + "step": 795 + }, + { + "epoch": 0.05856978615773741, + "grad_norm": 1.1328125, + "learning_rate": 4.960842747771943e-05, + "loss": 0.8898, + "step": 796 + }, + { + "epoch": 0.0586433662911014, + "grad_norm": 0.83984375, + "learning_rate": 4.96074057791829e-05, + "loss": 1.0806, + "step": 797 + }, + { + "epoch": 0.05871694642446539, + "grad_norm": 1.4375, + "learning_rate": 4.9606382760013374e-05, + "loss": 0.9684, + "step": 798 + }, + { + "epoch": 0.05879052655782939, + "grad_norm": 1.2109375, + "learning_rate": 4.960535842026577e-05, + "loss": 1.5033, + "step": 799 + }, + { + "epoch": 0.05886410669119338, + "grad_norm": 0.87109375, + "learning_rate": 4.960433275999504e-05, + "loss": 0.816, + "step": 800 + }, + { + "epoch": 0.05893768682455737, + "grad_norm": 1.0390625, + "learning_rate": 4.960330577925625e-05, + "loss": 1.0746, + "step": 801 + }, + { + "epoch": 0.05901126695792136, + "grad_norm": 2.765625, + "learning_rate": 4.9602277478104496e-05, + "loss": 1.2213, + "step": 802 + }, + { + "epoch": 0.05908484709128535, + "grad_norm": 1.0, + "learning_rate": 4.960124785659499e-05, + "loss": 1.298, + "step": 803 + }, + { + "epoch": 0.05915842722464935, + "grad_norm": 0.828125, + "learning_rate": 4.960021691478297e-05, + "loss": 0.7752, + "step": 804 + }, + { + "epoch": 0.05923200735801334, + "grad_norm": 0.89453125, + "learning_rate": 4.959918465272377e-05, + "loss": 0.9438, + "step": 805 + }, + { + "epoch": 0.05930558749137733, + "grad_norm": 0.9140625, + "learning_rate": 4.95981510704728e-05, + "loss": 0.7657, + "step": 806 + }, + { + "epoch": 0.05937916762474132, + "grad_norm": 1.0078125, + "learning_rate": 4.959711616808551e-05, + "loss": 1.2213, + "step": 807 + }, + { + "epoch": 0.059452747758105315, + "grad_norm": 1.1796875, + "learning_rate": 4.959607994561746e-05, + "loss": 1.065, + "step": 808 + }, + { + "epoch": 0.059526327891469305, + "grad_norm": 0.84765625, + "learning_rate": 4.9595042403124254e-05, + "loss": 0.8997, + "step": 809 + }, + { + "epoch": 0.059599908024833295, + "grad_norm": 0.85546875, + "learning_rate": 4.959400354066157e-05, + "loss": 0.8904, + "step": 810 + }, + { + "epoch": 0.059673488158197285, + "grad_norm": 0.83984375, + "learning_rate": 4.959296335828517e-05, + "loss": 0.7214, + "step": 811 + }, + { + "epoch": 0.059747068291561276, + "grad_norm": 0.94140625, + "learning_rate": 4.959192185605088e-05, + "loss": 0.8603, + "step": 812 + }, + { + "epoch": 0.05982064842492527, + "grad_norm": 0.94921875, + "learning_rate": 4.959087903401459e-05, + "loss": 1.0214, + "step": 813 + }, + { + "epoch": 0.05989422855828926, + "grad_norm": 1.0390625, + "learning_rate": 4.958983489223227e-05, + "loss": 1.1437, + "step": 814 + }, + { + "epoch": 0.05996780869165325, + "grad_norm": 1.0546875, + "learning_rate": 4.958878943075995e-05, + "loss": 1.3009, + "step": 815 + }, + { + "epoch": 0.06004138882501724, + "grad_norm": 0.9609375, + "learning_rate": 4.9587742649653746e-05, + "loss": 1.1063, + "step": 816 + }, + { + "epoch": 0.06011496895838124, + "grad_norm": 1.8125, + "learning_rate": 4.958669454896983e-05, + "loss": 0.7659, + "step": 817 + }, + { + "epoch": 0.06018854909174523, + "grad_norm": 0.92578125, + "learning_rate": 4.9585645128764456e-05, + "loss": 1.0277, + "step": 818 + }, + { + "epoch": 0.06026212922510922, + "grad_norm": 0.7890625, + "learning_rate": 4.958459438909395e-05, + "loss": 0.6037, + "step": 819 + }, + { + "epoch": 0.06033570935847321, + "grad_norm": 8.4375, + "learning_rate": 4.9583542330014694e-05, + "loss": 0.9885, + "step": 820 + }, + { + "epoch": 0.0604092894918372, + "grad_norm": 1.09375, + "learning_rate": 4.958248895158315e-05, + "loss": 1.1207, + "step": 821 + }, + { + "epoch": 0.0604828696252012, + "grad_norm": 0.97265625, + "learning_rate": 4.9581434253855855e-05, + "loss": 0.9179, + "step": 822 + }, + { + "epoch": 0.06055644975856519, + "grad_norm": 0.90234375, + "learning_rate": 4.958037823688941e-05, + "loss": 1.0005, + "step": 823 + }, + { + "epoch": 0.06063002989192918, + "grad_norm": 1.1171875, + "learning_rate": 4.95793209007405e-05, + "loss": 1.3325, + "step": 824 + }, + { + "epoch": 0.06070361002529317, + "grad_norm": 0.92578125, + "learning_rate": 4.957826224546585e-05, + "loss": 1.1585, + "step": 825 + }, + { + "epoch": 0.060777190158657166, + "grad_norm": 0.7734375, + "learning_rate": 4.9577202271122286e-05, + "loss": 0.8979, + "step": 826 + }, + { + "epoch": 0.060850770292021156, + "grad_norm": 0.98828125, + "learning_rate": 4.95761409777667e-05, + "loss": 0.8202, + "step": 827 + }, + { + "epoch": 0.060924350425385146, + "grad_norm": 0.98828125, + "learning_rate": 4.9575078365456043e-05, + "loss": 1.1022, + "step": 828 + }, + { + "epoch": 0.060997930558749136, + "grad_norm": 0.90234375, + "learning_rate": 4.957401443424735e-05, + "loss": 0.763, + "step": 829 + }, + { + "epoch": 0.061071510692113126, + "grad_norm": 1.21875, + "learning_rate": 4.957294918419772e-05, + "loss": 1.3115, + "step": 830 + }, + { + "epoch": 0.06114509082547712, + "grad_norm": 0.90625, + "learning_rate": 4.957188261536431e-05, + "loss": 0.6945, + "step": 831 + }, + { + "epoch": 0.061218670958841113, + "grad_norm": 0.85546875, + "learning_rate": 4.9570814727804374e-05, + "loss": 1.0005, + "step": 832 + }, + { + "epoch": 0.061292251092205104, + "grad_norm": 0.8515625, + "learning_rate": 4.956974552157522e-05, + "loss": 0.7747, + "step": 833 + }, + { + "epoch": 0.061365831225569094, + "grad_norm": 1.03125, + "learning_rate": 4.9568674996734224e-05, + "loss": 0.9603, + "step": 834 + }, + { + "epoch": 0.06143941135893309, + "grad_norm": 1.109375, + "learning_rate": 4.9567603153338847e-05, + "loss": 1.3527, + "step": 835 + }, + { + "epoch": 0.06151299149229708, + "grad_norm": 0.80859375, + "learning_rate": 4.956652999144661e-05, + "loss": 0.7923, + "step": 836 + }, + { + "epoch": 0.06158657162566107, + "grad_norm": 1.0, + "learning_rate": 4.956545551111511e-05, + "loss": 1.0774, + "step": 837 + }, + { + "epoch": 0.06166015175902506, + "grad_norm": 0.8671875, + "learning_rate": 4.956437971240201e-05, + "loss": 0.9355, + "step": 838 + }, + { + "epoch": 0.06173373189238905, + "grad_norm": 0.87109375, + "learning_rate": 4.9563302595365045e-05, + "loss": 0.8825, + "step": 839 + }, + { + "epoch": 0.06180731202575305, + "grad_norm": 0.7734375, + "learning_rate": 4.9562224160062024e-05, + "loss": 0.7766, + "step": 840 + }, + { + "epoch": 0.06188089215911704, + "grad_norm": 0.8828125, + "learning_rate": 4.956114440655083e-05, + "loss": 0.9358, + "step": 841 + }, + { + "epoch": 0.06195447229248103, + "grad_norm": 0.84765625, + "learning_rate": 4.9560063334889396e-05, + "loss": 0.8375, + "step": 842 + }, + { + "epoch": 0.06202805242584502, + "grad_norm": 1.0625, + "learning_rate": 4.955898094513576e-05, + "loss": 1.0116, + "step": 843 + }, + { + "epoch": 0.062101632559209016, + "grad_norm": 0.93359375, + "learning_rate": 4.955789723734799e-05, + "loss": 0.9917, + "step": 844 + }, + { + "epoch": 0.062175212692573006, + "grad_norm": 1.0234375, + "learning_rate": 4.955681221158426e-05, + "loss": 1.3232, + "step": 845 + }, + { + "epoch": 0.062248792825937, + "grad_norm": 0.9609375, + "learning_rate": 4.955572586790281e-05, + "loss": 0.8649, + "step": 846 + }, + { + "epoch": 0.06232237295930099, + "grad_norm": 0.82421875, + "learning_rate": 4.9554638206361924e-05, + "loss": 0.8878, + "step": 847 + }, + { + "epoch": 0.062395953092664984, + "grad_norm": 1.0546875, + "learning_rate": 4.955354922701998e-05, + "loss": 1.0983, + "step": 848 + }, + { + "epoch": 0.062469533226028974, + "grad_norm": 0.9453125, + "learning_rate": 4.955245892993543e-05, + "loss": 1.2533, + "step": 849 + }, + { + "epoch": 0.06254311335939296, + "grad_norm": 0.98046875, + "learning_rate": 4.955136731516679e-05, + "loss": 0.9845, + "step": 850 + }, + { + "epoch": 0.06261669349275696, + "grad_norm": 1.28125, + "learning_rate": 4.955027438277262e-05, + "loss": 1.934, + "step": 851 + }, + { + "epoch": 0.06269027362612095, + "grad_norm": 0.91015625, + "learning_rate": 4.9549180132811603e-05, + "loss": 0.8889, + "step": 852 + }, + { + "epoch": 0.06276385375948494, + "grad_norm": 0.8828125, + "learning_rate": 4.954808456534245e-05, + "loss": 0.9598, + "step": 853 + }, + { + "epoch": 0.06283743389284893, + "grad_norm": 1.0078125, + "learning_rate": 4.954698768042396e-05, + "loss": 1.1668, + "step": 854 + }, + { + "epoch": 0.06291101402621292, + "grad_norm": 0.84765625, + "learning_rate": 4.954588947811502e-05, + "loss": 0.716, + "step": 855 + }, + { + "epoch": 0.06298459415957691, + "grad_norm": 0.98046875, + "learning_rate": 4.9544789958474535e-05, + "loss": 0.8584, + "step": 856 + }, + { + "epoch": 0.0630581742929409, + "grad_norm": 0.78515625, + "learning_rate": 4.954368912156154e-05, + "loss": 0.6127, + "step": 857 + }, + { + "epoch": 0.06313175442630489, + "grad_norm": 1.046875, + "learning_rate": 4.954258696743511e-05, + "loss": 1.0402, + "step": 858 + }, + { + "epoch": 0.06320533455966888, + "grad_norm": 0.90625, + "learning_rate": 4.954148349615439e-05, + "loss": 0.9507, + "step": 859 + }, + { + "epoch": 0.06327891469303289, + "grad_norm": 1.1328125, + "learning_rate": 4.954037870777859e-05, + "loss": 1.3577, + "step": 860 + }, + { + "epoch": 0.06335249482639688, + "grad_norm": 1.2578125, + "learning_rate": 4.9539272602367025e-05, + "loss": 1.1217, + "step": 861 + }, + { + "epoch": 0.06342607495976087, + "grad_norm": 0.9765625, + "learning_rate": 4.953816517997905e-05, + "loss": 0.8124, + "step": 862 + }, + { + "epoch": 0.06349965509312486, + "grad_norm": 0.890625, + "learning_rate": 4.953705644067409e-05, + "loss": 0.9313, + "step": 863 + }, + { + "epoch": 0.06357323522648885, + "grad_norm": 1.25, + "learning_rate": 4.953594638451166e-05, + "loss": 0.9996, + "step": 864 + }, + { + "epoch": 0.06364681535985284, + "grad_norm": 3.390625, + "learning_rate": 4.953483501155133e-05, + "loss": 0.6392, + "step": 865 + }, + { + "epoch": 0.06372039549321683, + "grad_norm": 1.203125, + "learning_rate": 4.9533722321852737e-05, + "loss": 1.9765, + "step": 866 + }, + { + "epoch": 0.06379397562658082, + "grad_norm": 1.1875, + "learning_rate": 4.953260831547561e-05, + "loss": 1.0975, + "step": 867 + }, + { + "epoch": 0.06386755575994482, + "grad_norm": 1.0078125, + "learning_rate": 4.953149299247973e-05, + "loss": 0.8411, + "step": 868 + }, + { + "epoch": 0.06394113589330881, + "grad_norm": 0.84765625, + "learning_rate": 4.953037635292495e-05, + "loss": 0.6156, + "step": 869 + }, + { + "epoch": 0.0640147160266728, + "grad_norm": 0.83984375, + "learning_rate": 4.952925839687121e-05, + "loss": 0.6623, + "step": 870 + }, + { + "epoch": 0.06408829616003679, + "grad_norm": 1.109375, + "learning_rate": 4.952813912437849e-05, + "loss": 1.2113, + "step": 871 + }, + { + "epoch": 0.06416187629340078, + "grad_norm": 1.3046875, + "learning_rate": 4.9527018535506885e-05, + "loss": 0.7784, + "step": 872 + }, + { + "epoch": 0.06423545642676477, + "grad_norm": 1.0546875, + "learning_rate": 4.952589663031651e-05, + "loss": 0.9496, + "step": 873 + }, + { + "epoch": 0.06430903656012876, + "grad_norm": 0.83984375, + "learning_rate": 4.9524773408867584e-05, + "loss": 0.7851, + "step": 874 + }, + { + "epoch": 0.06438261669349275, + "grad_norm": 0.78125, + "learning_rate": 4.95236488712204e-05, + "loss": 0.578, + "step": 875 + }, + { + "epoch": 0.06445619682685674, + "grad_norm": 0.89453125, + "learning_rate": 4.9522523017435294e-05, + "loss": 1.0817, + "step": 876 + }, + { + "epoch": 0.06452977696022075, + "grad_norm": 1.140625, + "learning_rate": 4.952139584757269e-05, + "loss": 1.3044, + "step": 877 + }, + { + "epoch": 0.06460335709358474, + "grad_norm": 1.046875, + "learning_rate": 4.9520267361693095e-05, + "loss": 1.0066, + "step": 878 + }, + { + "epoch": 0.06467693722694873, + "grad_norm": 1.109375, + "learning_rate": 4.951913755985705e-05, + "loss": 1.0606, + "step": 879 + }, + { + "epoch": 0.06475051736031272, + "grad_norm": 1.125, + "learning_rate": 4.951800644212522e-05, + "loss": 1.3807, + "step": 880 + }, + { + "epoch": 0.06482409749367671, + "grad_norm": 0.88671875, + "learning_rate": 4.951687400855828e-05, + "loss": 0.8976, + "step": 881 + }, + { + "epoch": 0.0648976776270407, + "grad_norm": 0.953125, + "learning_rate": 4.951574025921701e-05, + "loss": 1.037, + "step": 882 + }, + { + "epoch": 0.06497125776040469, + "grad_norm": 0.83203125, + "learning_rate": 4.9514605194162275e-05, + "loss": 0.7599, + "step": 883 + }, + { + "epoch": 0.06504483789376868, + "grad_norm": 0.78125, + "learning_rate": 4.951346881345498e-05, + "loss": 0.6583, + "step": 884 + }, + { + "epoch": 0.06511841802713267, + "grad_norm": 0.9453125, + "learning_rate": 4.951233111715611e-05, + "loss": 0.9101, + "step": 885 + }, + { + "epoch": 0.06519199816049667, + "grad_norm": 0.85546875, + "learning_rate": 4.951119210532673e-05, + "loss": 0.8401, + "step": 886 + }, + { + "epoch": 0.06526557829386066, + "grad_norm": 0.80078125, + "learning_rate": 4.951005177802796e-05, + "loss": 0.8865, + "step": 887 + }, + { + "epoch": 0.06533915842722465, + "grad_norm": 1.109375, + "learning_rate": 4.950891013532101e-05, + "loss": 0.9345, + "step": 888 + }, + { + "epoch": 0.06541273856058864, + "grad_norm": 1.09375, + "learning_rate": 4.950776717726715e-05, + "loss": 1.1009, + "step": 889 + }, + { + "epoch": 0.06548631869395263, + "grad_norm": 1.0703125, + "learning_rate": 4.95066229039277e-05, + "loss": 1.0859, + "step": 890 + }, + { + "epoch": 0.06555989882731662, + "grad_norm": 0.99609375, + "learning_rate": 4.950547731536409e-05, + "loss": 0.9128, + "step": 891 + }, + { + "epoch": 0.06563347896068061, + "grad_norm": 0.96875, + "learning_rate": 4.9504330411637794e-05, + "loss": 0.8637, + "step": 892 + }, + { + "epoch": 0.0657070590940446, + "grad_norm": 0.87109375, + "learning_rate": 4.950318219281037e-05, + "loss": 1.0806, + "step": 893 + }, + { + "epoch": 0.0657806392274086, + "grad_norm": 0.90234375, + "learning_rate": 4.950203265894344e-05, + "loss": 0.6772, + "step": 894 + }, + { + "epoch": 0.0658542193607726, + "grad_norm": 0.7578125, + "learning_rate": 4.950088181009869e-05, + "loss": 0.7715, + "step": 895 + }, + { + "epoch": 0.06592779949413659, + "grad_norm": 1.0390625, + "learning_rate": 4.949972964633789e-05, + "loss": 1.4542, + "step": 896 + }, + { + "epoch": 0.06600137962750058, + "grad_norm": 1.234375, + "learning_rate": 4.949857616772288e-05, + "loss": 1.1663, + "step": 897 + }, + { + "epoch": 0.06607495976086457, + "grad_norm": 0.95703125, + "learning_rate": 4.949742137431555e-05, + "loss": 0.7811, + "step": 898 + }, + { + "epoch": 0.06614853989422856, + "grad_norm": 1.078125, + "learning_rate": 4.949626526617789e-05, + "loss": 0.9057, + "step": 899 + }, + { + "epoch": 0.06622212002759255, + "grad_norm": 0.984375, + "learning_rate": 4.9495107843371934e-05, + "loss": 0.9346, + "step": 900 + }, + { + "epoch": 0.06629570016095654, + "grad_norm": 0.98828125, + "learning_rate": 4.949394910595981e-05, + "loss": 0.9646, + "step": 901 + }, + { + "epoch": 0.06636928029432053, + "grad_norm": 0.953125, + "learning_rate": 4.9492789054003695e-05, + "loss": 0.945, + "step": 902 + }, + { + "epoch": 0.06644286042768452, + "grad_norm": 0.78125, + "learning_rate": 4.9491627687565855e-05, + "loss": 0.7129, + "step": 903 + }, + { + "epoch": 0.06651644056104852, + "grad_norm": 0.89453125, + "learning_rate": 4.949046500670862e-05, + "loss": 0.8891, + "step": 904 + }, + { + "epoch": 0.06659002069441251, + "grad_norm": 0.98046875, + "learning_rate": 4.948930101149437e-05, + "loss": 0.7404, + "step": 905 + }, + { + "epoch": 0.0666636008277765, + "grad_norm": 0.9375, + "learning_rate": 4.94881357019856e-05, + "loss": 1.1495, + "step": 906 + }, + { + "epoch": 0.0667371809611405, + "grad_norm": 1.234375, + "learning_rate": 4.9486969078244835e-05, + "loss": 1.0328, + "step": 907 + }, + { + "epoch": 0.06681076109450448, + "grad_norm": 1.0078125, + "learning_rate": 4.9485801140334687e-05, + "loss": 0.9204, + "step": 908 + }, + { + "epoch": 0.06688434122786847, + "grad_norm": 0.93359375, + "learning_rate": 4.948463188831784e-05, + "loss": 0.9084, + "step": 909 + }, + { + "epoch": 0.06695792136123246, + "grad_norm": 1.15625, + "learning_rate": 4.9483461322257054e-05, + "loss": 1.5526, + "step": 910 + }, + { + "epoch": 0.06703150149459645, + "grad_norm": 0.90625, + "learning_rate": 4.948228944221513e-05, + "loss": 0.904, + "step": 911 + }, + { + "epoch": 0.06710508162796044, + "grad_norm": 1.296875, + "learning_rate": 4.9481116248254976e-05, + "loss": 1.0075, + "step": 912 + }, + { + "epoch": 0.06717866176132445, + "grad_norm": 0.87109375, + "learning_rate": 4.947994174043955e-05, + "loss": 0.7878, + "step": 913 + }, + { + "epoch": 0.06725224189468844, + "grad_norm": 1.1328125, + "learning_rate": 4.947876591883189e-05, + "loss": 1.3461, + "step": 914 + }, + { + "epoch": 0.06732582202805243, + "grad_norm": 0.8515625, + "learning_rate": 4.947758878349509e-05, + "loss": 0.9798, + "step": 915 + }, + { + "epoch": 0.06739940216141642, + "grad_norm": 0.82421875, + "learning_rate": 4.947641033449234e-05, + "loss": 0.7173, + "step": 916 + }, + { + "epoch": 0.06747298229478041, + "grad_norm": 0.890625, + "learning_rate": 4.947523057188686e-05, + "loss": 0.8946, + "step": 917 + }, + { + "epoch": 0.0675465624281444, + "grad_norm": 1.1484375, + "learning_rate": 4.9474049495742006e-05, + "loss": 1.2353, + "step": 918 + }, + { + "epoch": 0.06762014256150839, + "grad_norm": 0.875, + "learning_rate": 4.947286710612112e-05, + "loss": 0.956, + "step": 919 + }, + { + "epoch": 0.06769372269487238, + "grad_norm": 1.171875, + "learning_rate": 4.9471683403087695e-05, + "loss": 1.2927, + "step": 920 + }, + { + "epoch": 0.06776730282823637, + "grad_norm": 1.0390625, + "learning_rate": 4.9470498386705236e-05, + "loss": 1.1632, + "step": 921 + }, + { + "epoch": 0.06784088296160037, + "grad_norm": 0.9609375, + "learning_rate": 4.9469312057037346e-05, + "loss": 1.243, + "step": 922 + }, + { + "epoch": 0.06791446309496436, + "grad_norm": 1.078125, + "learning_rate": 4.94681244141477e-05, + "loss": 0.7535, + "step": 923 + }, + { + "epoch": 0.06798804322832835, + "grad_norm": 1.0, + "learning_rate": 4.946693545810002e-05, + "loss": 1.3773, + "step": 924 + }, + { + "epoch": 0.06806162336169234, + "grad_norm": 1.1328125, + "learning_rate": 4.946574518895813e-05, + "loss": 1.0478, + "step": 925 + }, + { + "epoch": 0.06813520349505633, + "grad_norm": 1.390625, + "learning_rate": 4.94645536067859e-05, + "loss": 2.1322, + "step": 926 + }, + { + "epoch": 0.06820878362842032, + "grad_norm": 1.2890625, + "learning_rate": 4.94633607116473e-05, + "loss": 1.1893, + "step": 927 + }, + { + "epoch": 0.06828236376178431, + "grad_norm": 1.171875, + "learning_rate": 4.946216650360632e-05, + "loss": 1.2635, + "step": 928 + }, + { + "epoch": 0.0683559438951483, + "grad_norm": 1.03125, + "learning_rate": 4.9460970982727074e-05, + "loss": 1.1346, + "step": 929 + }, + { + "epoch": 0.0684295240285123, + "grad_norm": 0.8359375, + "learning_rate": 4.945977414907371e-05, + "loss": 1.0327, + "step": 930 + }, + { + "epoch": 0.0685031041618763, + "grad_norm": 1.28125, + "learning_rate": 4.9458576002710474e-05, + "loss": 0.8676, + "step": 931 + }, + { + "epoch": 0.06857668429524029, + "grad_norm": 0.828125, + "learning_rate": 4.945737654370165e-05, + "loss": 0.7823, + "step": 932 + }, + { + "epoch": 0.06865026442860428, + "grad_norm": 1.0625, + "learning_rate": 4.9456175772111624e-05, + "loss": 1.0325, + "step": 933 + }, + { + "epoch": 0.06872384456196827, + "grad_norm": 1.0546875, + "learning_rate": 4.9454973688004835e-05, + "loss": 1.1964, + "step": 934 + }, + { + "epoch": 0.06879742469533226, + "grad_norm": 1.0, + "learning_rate": 4.94537702914458e-05, + "loss": 0.9395, + "step": 935 + }, + { + "epoch": 0.06887100482869625, + "grad_norm": 0.84375, + "learning_rate": 4.94525655824991e-05, + "loss": 1.1721, + "step": 936 + }, + { + "epoch": 0.06894458496206024, + "grad_norm": 0.84765625, + "learning_rate": 4.945135956122938e-05, + "loss": 0.8203, + "step": 937 + }, + { + "epoch": 0.06901816509542423, + "grad_norm": 1.0390625, + "learning_rate": 4.945015222770139e-05, + "loss": 0.9418, + "step": 938 + }, + { + "epoch": 0.06909174522878822, + "grad_norm": 0.9140625, + "learning_rate": 4.94489435819799e-05, + "loss": 0.9211, + "step": 939 + }, + { + "epoch": 0.06916532536215222, + "grad_norm": 0.9609375, + "learning_rate": 4.944773362412979e-05, + "loss": 0.974, + "step": 940 + }, + { + "epoch": 0.06923890549551621, + "grad_norm": 0.890625, + "learning_rate": 4.944652235421599e-05, + "loss": 1.1142, + "step": 941 + }, + { + "epoch": 0.0693124856288802, + "grad_norm": 0.9921875, + "learning_rate": 4.94453097723035e-05, + "loss": 1.0195, + "step": 942 + }, + { + "epoch": 0.0693860657622442, + "grad_norm": 0.95703125, + "learning_rate": 4.944409587845741e-05, + "loss": 1.0767, + "step": 943 + }, + { + "epoch": 0.06945964589560819, + "grad_norm": 0.88671875, + "learning_rate": 4.9442880672742866e-05, + "loss": 1.1185, + "step": 944 + }, + { + "epoch": 0.06953322602897218, + "grad_norm": 0.95703125, + "learning_rate": 4.944166415522509e-05, + "loss": 1.3641, + "step": 945 + }, + { + "epoch": 0.06960680616233617, + "grad_norm": 1.0078125, + "learning_rate": 4.944044632596935e-05, + "loss": 0.9978, + "step": 946 + }, + { + "epoch": 0.06968038629570016, + "grad_norm": 1.03125, + "learning_rate": 4.9439227185041015e-05, + "loss": 1.0883, + "step": 947 + }, + { + "epoch": 0.06975396642906415, + "grad_norm": 1.2890625, + "learning_rate": 4.943800673250553e-05, + "loss": 1.2315, + "step": 948 + }, + { + "epoch": 0.06982754656242815, + "grad_norm": 0.90625, + "learning_rate": 4.943678496842837e-05, + "loss": 0.934, + "step": 949 + }, + { + "epoch": 0.06990112669579214, + "grad_norm": 0.6484375, + "learning_rate": 4.9435561892875114e-05, + "loss": 0.6817, + "step": 950 + }, + { + "epoch": 0.06997470682915613, + "grad_norm": 0.90625, + "learning_rate": 4.943433750591141e-05, + "loss": 0.9523, + "step": 951 + }, + { + "epoch": 0.07004828696252012, + "grad_norm": 0.73046875, + "learning_rate": 4.9433111807602957e-05, + "loss": 0.6998, + "step": 952 + }, + { + "epoch": 0.07012186709588411, + "grad_norm": 0.94140625, + "learning_rate": 4.943188479801554e-05, + "loss": 1.1778, + "step": 953 + }, + { + "epoch": 0.0701954472292481, + "grad_norm": 0.96484375, + "learning_rate": 4.9430656477215016e-05, + "loss": 0.8764, + "step": 954 + }, + { + "epoch": 0.07026902736261209, + "grad_norm": 0.99609375, + "learning_rate": 4.94294268452673e-05, + "loss": 1.3627, + "step": 955 + }, + { + "epoch": 0.07034260749597608, + "grad_norm": 1.1875, + "learning_rate": 4.942819590223838e-05, + "loss": 0.9452, + "step": 956 + }, + { + "epoch": 0.07041618762934007, + "grad_norm": 1.2109375, + "learning_rate": 4.942696364819433e-05, + "loss": 1.3631, + "step": 957 + }, + { + "epoch": 0.07048976776270408, + "grad_norm": 1.0859375, + "learning_rate": 4.942573008320128e-05, + "loss": 1.4654, + "step": 958 + }, + { + "epoch": 0.07056334789606807, + "grad_norm": 0.91015625, + "learning_rate": 4.942449520732543e-05, + "loss": 0.801, + "step": 959 + }, + { + "epoch": 0.07063692802943206, + "grad_norm": 0.8203125, + "learning_rate": 4.942325902063305e-05, + "loss": 0.868, + "step": 960 + }, + { + "epoch": 0.07071050816279605, + "grad_norm": 1.0859375, + "learning_rate": 4.9422021523190486e-05, + "loss": 0.9592, + "step": 961 + }, + { + "epoch": 0.07078408829616004, + "grad_norm": 0.96875, + "learning_rate": 4.9420782715064154e-05, + "loss": 0.941, + "step": 962 + }, + { + "epoch": 0.07085766842952403, + "grad_norm": 1.15625, + "learning_rate": 4.9419542596320544e-05, + "loss": 1.2685, + "step": 963 + }, + { + "epoch": 0.07093124856288802, + "grad_norm": 0.8828125, + "learning_rate": 4.94183011670262e-05, + "loss": 0.8669, + "step": 964 + }, + { + "epoch": 0.071004828696252, + "grad_norm": 0.86328125, + "learning_rate": 4.941705842724775e-05, + "loss": 0.8033, + "step": 965 + }, + { + "epoch": 0.07107840882961601, + "grad_norm": 0.859375, + "learning_rate": 4.941581437705189e-05, + "loss": 0.7802, + "step": 966 + }, + { + "epoch": 0.07115198896298, + "grad_norm": 0.83203125, + "learning_rate": 4.9414569016505394e-05, + "loss": 0.8719, + "step": 967 + }, + { + "epoch": 0.07122556909634399, + "grad_norm": 1.109375, + "learning_rate": 4.941332234567509e-05, + "loss": 1.311, + "step": 968 + }, + { + "epoch": 0.07129914922970798, + "grad_norm": 0.875, + "learning_rate": 4.941207436462788e-05, + "loss": 1.0031, + "step": 969 + }, + { + "epoch": 0.07137272936307197, + "grad_norm": 0.90234375, + "learning_rate": 4.9410825073430756e-05, + "loss": 1.2397, + "step": 970 + }, + { + "epoch": 0.07144630949643596, + "grad_norm": 1.0390625, + "learning_rate": 4.940957447215075e-05, + "loss": 1.4965, + "step": 971 + }, + { + "epoch": 0.07151988962979995, + "grad_norm": 1.046875, + "learning_rate": 4.9408322560854985e-05, + "loss": 1.0148, + "step": 972 + }, + { + "epoch": 0.07159346976316394, + "grad_norm": 0.8125, + "learning_rate": 4.940706933961065e-05, + "loss": 0.6665, + "step": 973 + }, + { + "epoch": 0.07166704989652793, + "grad_norm": 0.90234375, + "learning_rate": 4.9405814808485e-05, + "loss": 0.7528, + "step": 974 + }, + { + "epoch": 0.07174063002989194, + "grad_norm": 1.0859375, + "learning_rate": 4.940455896754537e-05, + "loss": 0.9434, + "step": 975 + }, + { + "epoch": 0.07181421016325593, + "grad_norm": 0.89453125, + "learning_rate": 4.940330181685915e-05, + "loss": 0.7931, + "step": 976 + }, + { + "epoch": 0.07188779029661992, + "grad_norm": 0.9765625, + "learning_rate": 4.9402043356493816e-05, + "loss": 1.1085, + "step": 977 + }, + { + "epoch": 0.0719613704299839, + "grad_norm": 0.8359375, + "learning_rate": 4.9400783586516895e-05, + "loss": 0.6421, + "step": 978 + }, + { + "epoch": 0.0720349505633479, + "grad_norm": 0.8515625, + "learning_rate": 4.939952250699602e-05, + "loss": 0.7732, + "step": 979 + }, + { + "epoch": 0.07210853069671189, + "grad_norm": 0.703125, + "learning_rate": 4.939826011799885e-05, + "loss": 0.713, + "step": 980 + }, + { + "epoch": 0.07218211083007588, + "grad_norm": 0.78125, + "learning_rate": 4.9396996419593136e-05, + "loss": 0.6547, + "step": 981 + }, + { + "epoch": 0.07225569096343987, + "grad_norm": 1.0234375, + "learning_rate": 4.939573141184671e-05, + "loss": 0.7736, + "step": 982 + }, + { + "epoch": 0.07232927109680386, + "grad_norm": 1.171875, + "learning_rate": 4.9394465094827446e-05, + "loss": 1.1406, + "step": 983 + }, + { + "epoch": 0.07240285123016786, + "grad_norm": 1.296875, + "learning_rate": 4.9393197468603324e-05, + "loss": 1.4124, + "step": 984 + }, + { + "epoch": 0.07247643136353185, + "grad_norm": 1.203125, + "learning_rate": 4.939192853324236e-05, + "loss": 1.4157, + "step": 985 + }, + { + "epoch": 0.07255001149689584, + "grad_norm": 0.9453125, + "learning_rate": 4.9390658288812675e-05, + "loss": 0.9396, + "step": 986 + }, + { + "epoch": 0.07262359163025983, + "grad_norm": 1.046875, + "learning_rate": 4.938938673538242e-05, + "loss": 0.6932, + "step": 987 + }, + { + "epoch": 0.07269717176362382, + "grad_norm": 1.0078125, + "learning_rate": 4.9388113873019835e-05, + "loss": 1.0153, + "step": 988 + }, + { + "epoch": 0.07277075189698781, + "grad_norm": 0.91015625, + "learning_rate": 4.938683970179325e-05, + "loss": 1.0883, + "step": 989 + }, + { + "epoch": 0.0728443320303518, + "grad_norm": 0.92578125, + "learning_rate": 4.938556422177104e-05, + "loss": 0.9006, + "step": 990 + }, + { + "epoch": 0.07291791216371579, + "grad_norm": 0.828125, + "learning_rate": 4.9384287433021646e-05, + "loss": 0.8341, + "step": 991 + }, + { + "epoch": 0.07299149229707978, + "grad_norm": 1.1171875, + "learning_rate": 4.9383009335613606e-05, + "loss": 1.3068, + "step": 992 + }, + { + "epoch": 0.07306507243044379, + "grad_norm": 1.0078125, + "learning_rate": 4.938172992961551e-05, + "loss": 1.0953, + "step": 993 + }, + { + "epoch": 0.07313865256380778, + "grad_norm": 0.92578125, + "learning_rate": 4.938044921509602e-05, + "loss": 0.9872, + "step": 994 + }, + { + "epoch": 0.07321223269717177, + "grad_norm": 0.953125, + "learning_rate": 4.9379167192123864e-05, + "loss": 1.1377, + "step": 995 + }, + { + "epoch": 0.07328581283053576, + "grad_norm": 0.8203125, + "learning_rate": 4.937788386076786e-05, + "loss": 0.9147, + "step": 996 + }, + { + "epoch": 0.07335939296389975, + "grad_norm": 1.3828125, + "learning_rate": 4.9376599221096865e-05, + "loss": 1.6023, + "step": 997 + }, + { + "epoch": 0.07343297309726374, + "grad_norm": 0.8671875, + "learning_rate": 4.937531327317983e-05, + "loss": 1.0771, + "step": 998 + }, + { + "epoch": 0.07350655323062773, + "grad_norm": 0.890625, + "learning_rate": 4.9374026017085776e-05, + "loss": 0.7791, + "step": 999 + }, + { + "epoch": 0.07358013336399172, + "grad_norm": 0.84375, + "learning_rate": 4.9372737452883774e-05, + "loss": 1.045, + "step": 1000 + }, + { + "epoch": 0.07365371349735571, + "grad_norm": 0.8515625, + "learning_rate": 4.937144758064299e-05, + "loss": 0.5552, + "step": 1001 + }, + { + "epoch": 0.07372729363071971, + "grad_norm": 0.79296875, + "learning_rate": 4.937015640043264e-05, + "loss": 0.6764, + "step": 1002 + }, + { + "epoch": 0.0738008737640837, + "grad_norm": 0.98046875, + "learning_rate": 4.9368863912322036e-05, + "loss": 1.0907, + "step": 1003 + }, + { + "epoch": 0.07387445389744769, + "grad_norm": 0.83984375, + "learning_rate": 4.936757011638052e-05, + "loss": 0.7189, + "step": 1004 + }, + { + "epoch": 0.07394803403081168, + "grad_norm": 0.98828125, + "learning_rate": 4.936627501267755e-05, + "loss": 0.6947, + "step": 1005 + }, + { + "epoch": 0.07402161416417567, + "grad_norm": 0.890625, + "learning_rate": 4.936497860128262e-05, + "loss": 0.9126, + "step": 1006 + }, + { + "epoch": 0.07409519429753966, + "grad_norm": 0.75, + "learning_rate": 4.93636808822653e-05, + "loss": 0.8022, + "step": 1007 + }, + { + "epoch": 0.07416877443090365, + "grad_norm": 0.9296875, + "learning_rate": 4.936238185569525e-05, + "loss": 0.8789, + "step": 1008 + }, + { + "epoch": 0.07424235456426764, + "grad_norm": 0.8671875, + "learning_rate": 4.9361081521642176e-05, + "loss": 1.1579, + "step": 1009 + }, + { + "epoch": 0.07431593469763163, + "grad_norm": 0.9453125, + "learning_rate": 4.935977988017587e-05, + "loss": 1.1439, + "step": 1010 + }, + { + "epoch": 0.07438951483099564, + "grad_norm": 0.95703125, + "learning_rate": 4.935847693136619e-05, + "loss": 1.4934, + "step": 1011 + }, + { + "epoch": 0.07446309496435963, + "grad_norm": 0.765625, + "learning_rate": 4.935717267528305e-05, + "loss": 0.7336, + "step": 1012 + }, + { + "epoch": 0.07453667509772362, + "grad_norm": 0.69140625, + "learning_rate": 4.935586711199647e-05, + "loss": 0.7628, + "step": 1013 + }, + { + "epoch": 0.07461025523108761, + "grad_norm": 1.0625, + "learning_rate": 4.9354560241576494e-05, + "loss": 0.9268, + "step": 1014 + }, + { + "epoch": 0.0746838353644516, + "grad_norm": 0.81640625, + "learning_rate": 4.935325206409327e-05, + "loss": 0.8797, + "step": 1015 + }, + { + "epoch": 0.07475741549781559, + "grad_norm": 0.7421875, + "learning_rate": 4.935194257961701e-05, + "loss": 0.6649, + "step": 1016 + }, + { + "epoch": 0.07483099563117958, + "grad_norm": 0.8984375, + "learning_rate": 4.935063178821798e-05, + "loss": 1.018, + "step": 1017 + }, + { + "epoch": 0.07490457576454357, + "grad_norm": 0.91015625, + "learning_rate": 4.934931968996654e-05, + "loss": 1.0185, + "step": 1018 + }, + { + "epoch": 0.07497815589790756, + "grad_norm": 1.1171875, + "learning_rate": 4.9348006284933094e-05, + "loss": 1.1034, + "step": 1019 + }, + { + "epoch": 0.07505173603127156, + "grad_norm": 1.0546875, + "learning_rate": 4.934669157318814e-05, + "loss": 1.3453, + "step": 1020 + }, + { + "epoch": 0.07512531616463555, + "grad_norm": 0.98046875, + "learning_rate": 4.9345375554802246e-05, + "loss": 1.0863, + "step": 1021 + }, + { + "epoch": 0.07519889629799954, + "grad_norm": 1.328125, + "learning_rate": 4.9344058229846015e-05, + "loss": 1.431, + "step": 1022 + }, + { + "epoch": 0.07527247643136353, + "grad_norm": 0.82421875, + "learning_rate": 4.934273959839015e-05, + "loss": 1.0453, + "step": 1023 + }, + { + "epoch": 0.07534605656472752, + "grad_norm": 1.1015625, + "learning_rate": 4.9341419660505444e-05, + "loss": 1.4968, + "step": 1024 + }, + { + "epoch": 0.07541963669809151, + "grad_norm": 1.1953125, + "learning_rate": 4.9340098416262714e-05, + "loss": 1.0828, + "step": 1025 + }, + { + "epoch": 0.0754932168314555, + "grad_norm": 0.875, + "learning_rate": 4.9338775865732874e-05, + "loss": 0.8269, + "step": 1026 + }, + { + "epoch": 0.0755667969648195, + "grad_norm": 1.171875, + "learning_rate": 4.93374520089869e-05, + "loss": 1.1307, + "step": 1027 + }, + { + "epoch": 0.07564037709818348, + "grad_norm": 0.91796875, + "learning_rate": 4.9336126846095846e-05, + "loss": 0.7401, + "step": 1028 + }, + { + "epoch": 0.07571395723154749, + "grad_norm": 1.0625, + "learning_rate": 4.933480037713083e-05, + "loss": 1.185, + "step": 1029 + }, + { + "epoch": 0.07578753736491148, + "grad_norm": 0.76171875, + "learning_rate": 4.9333472602163035e-05, + "loss": 0.6656, + "step": 1030 + }, + { + "epoch": 0.07586111749827547, + "grad_norm": 0.94921875, + "learning_rate": 4.933214352126373e-05, + "loss": 1.0686, + "step": 1031 + }, + { + "epoch": 0.07593469763163946, + "grad_norm": 0.8359375, + "learning_rate": 4.933081313450423e-05, + "loss": 0.6938, + "step": 1032 + }, + { + "epoch": 0.07600827776500345, + "grad_norm": 0.8984375, + "learning_rate": 4.932948144195596e-05, + "loss": 0.8716, + "step": 1033 + }, + { + "epoch": 0.07608185789836744, + "grad_norm": 1.0234375, + "learning_rate": 4.9328148443690356e-05, + "loss": 1.202, + "step": 1034 + }, + { + "epoch": 0.07615543803173143, + "grad_norm": 1.0078125, + "learning_rate": 4.9326814139778985e-05, + "loss": 1.1178, + "step": 1035 + }, + { + "epoch": 0.07622901816509542, + "grad_norm": 0.79296875, + "learning_rate": 4.932547853029344e-05, + "loss": 0.6975, + "step": 1036 + }, + { + "epoch": 0.07630259829845941, + "grad_norm": 0.90234375, + "learning_rate": 4.932414161530541e-05, + "loss": 0.7133, + "step": 1037 + }, + { + "epoch": 0.07637617843182341, + "grad_norm": 1.203125, + "learning_rate": 4.932280339488664e-05, + "loss": 1.4606, + "step": 1038 + }, + { + "epoch": 0.0764497585651874, + "grad_norm": 0.83984375, + "learning_rate": 4.9321463869108954e-05, + "loss": 0.9715, + "step": 1039 + }, + { + "epoch": 0.0765233386985514, + "grad_norm": 0.90625, + "learning_rate": 4.932012303804423e-05, + "loss": 1.0028, + "step": 1040 + }, + { + "epoch": 0.07659691883191538, + "grad_norm": 0.875, + "learning_rate": 4.931878090176445e-05, + "loss": 0.9707, + "step": 1041 + }, + { + "epoch": 0.07667049896527937, + "grad_norm": 0.9140625, + "learning_rate": 4.931743746034162e-05, + "loss": 0.8805, + "step": 1042 + }, + { + "epoch": 0.07674407909864336, + "grad_norm": 0.7578125, + "learning_rate": 4.931609271384785e-05, + "loss": 0.7408, + "step": 1043 + }, + { + "epoch": 0.07681765923200735, + "grad_norm": 1.046875, + "learning_rate": 4.931474666235531e-05, + "loss": 1.489, + "step": 1044 + }, + { + "epoch": 0.07689123936537134, + "grad_norm": 0.78515625, + "learning_rate": 4.931339930593625e-05, + "loss": 0.636, + "step": 1045 + }, + { + "epoch": 0.07696481949873533, + "grad_norm": 0.91796875, + "learning_rate": 4.931205064466297e-05, + "loss": 1.081, + "step": 1046 + }, + { + "epoch": 0.07703839963209934, + "grad_norm": 0.890625, + "learning_rate": 4.931070067860785e-05, + "loss": 0.7789, + "step": 1047 + }, + { + "epoch": 0.07711197976546333, + "grad_norm": 0.82421875, + "learning_rate": 4.930934940784333e-05, + "loss": 0.7309, + "step": 1048 + }, + { + "epoch": 0.07718555989882732, + "grad_norm": 1.0078125, + "learning_rate": 4.930799683244195e-05, + "loss": 1.0883, + "step": 1049 + }, + { + "epoch": 0.07725914003219131, + "grad_norm": 1.1171875, + "learning_rate": 4.930664295247629e-05, + "loss": 1.576, + "step": 1050 + }, + { + "epoch": 0.0773327201655553, + "grad_norm": 1.0625, + "learning_rate": 4.930528776801901e-05, + "loss": 0.8766, + "step": 1051 + }, + { + "epoch": 0.07740630029891929, + "grad_norm": 0.6796875, + "learning_rate": 4.9303931279142844e-05, + "loss": 0.705, + "step": 1052 + }, + { + "epoch": 0.07747988043228328, + "grad_norm": 1.6953125, + "learning_rate": 4.9302573485920587e-05, + "loss": 0.6259, + "step": 1053 + }, + { + "epoch": 0.07755346056564727, + "grad_norm": 1.171875, + "learning_rate": 4.930121438842512e-05, + "loss": 1.1955, + "step": 1054 + }, + { + "epoch": 0.07762704069901127, + "grad_norm": 0.83984375, + "learning_rate": 4.9299853986729365e-05, + "loss": 0.6949, + "step": 1055 + }, + { + "epoch": 0.07770062083237526, + "grad_norm": 0.9609375, + "learning_rate": 4.929849228090635e-05, + "loss": 0.8881, + "step": 1056 + }, + { + "epoch": 0.07777420096573925, + "grad_norm": 0.89453125, + "learning_rate": 4.929712927102914e-05, + "loss": 0.8625, + "step": 1057 + }, + { + "epoch": 0.07784778109910324, + "grad_norm": 0.9296875, + "learning_rate": 4.92957649571709e-05, + "loss": 1.0278, + "step": 1058 + }, + { + "epoch": 0.07792136123246723, + "grad_norm": 0.83984375, + "learning_rate": 4.929439933940484e-05, + "loss": 0.8735, + "step": 1059 + }, + { + "epoch": 0.07799494136583122, + "grad_norm": 0.875, + "learning_rate": 4.929303241780425e-05, + "loss": 0.8721, + "step": 1060 + }, + { + "epoch": 0.07806852149919521, + "grad_norm": 0.80078125, + "learning_rate": 4.92916641924425e-05, + "loss": 0.7614, + "step": 1061 + }, + { + "epoch": 0.0781421016325592, + "grad_norm": 1.0078125, + "learning_rate": 4.929029466339301e-05, + "loss": 1.2894, + "step": 1062 + }, + { + "epoch": 0.0782156817659232, + "grad_norm": 1.0078125, + "learning_rate": 4.928892383072928e-05, + "loss": 0.9365, + "step": 1063 + }, + { + "epoch": 0.0782892618992872, + "grad_norm": 1.0703125, + "learning_rate": 4.9287551694524894e-05, + "loss": 1.2815, + "step": 1064 + }, + { + "epoch": 0.07836284203265119, + "grad_norm": 1.0078125, + "learning_rate": 4.928617825485347e-05, + "loss": 0.7414, + "step": 1065 + }, + { + "epoch": 0.07843642216601518, + "grad_norm": 1.2578125, + "learning_rate": 4.928480351178873e-05, + "loss": 0.8891, + "step": 1066 + }, + { + "epoch": 0.07851000229937917, + "grad_norm": 0.8515625, + "learning_rate": 4.928342746540446e-05, + "loss": 0.8198, + "step": 1067 + }, + { + "epoch": 0.07858358243274316, + "grad_norm": 0.7578125, + "learning_rate": 4.92820501157745e-05, + "loss": 0.7424, + "step": 1068 + }, + { + "epoch": 0.07865716256610715, + "grad_norm": 1.0625, + "learning_rate": 4.928067146297277e-05, + "loss": 1.1136, + "step": 1069 + }, + { + "epoch": 0.07873074269947114, + "grad_norm": 0.98046875, + "learning_rate": 4.927929150707326e-05, + "loss": 1.0194, + "step": 1070 + }, + { + "epoch": 0.07880432283283513, + "grad_norm": 0.87109375, + "learning_rate": 4.927791024815004e-05, + "loss": 0.828, + "step": 1071 + }, + { + "epoch": 0.07887790296619912, + "grad_norm": 0.9296875, + "learning_rate": 4.927652768627722e-05, + "loss": 1.1475, + "step": 1072 + }, + { + "epoch": 0.07895148309956312, + "grad_norm": 1.0703125, + "learning_rate": 4.9275143821529025e-05, + "loss": 1.2907, + "step": 1073 + }, + { + "epoch": 0.07902506323292711, + "grad_norm": 0.87109375, + "learning_rate": 4.927375865397969e-05, + "loss": 0.7493, + "step": 1074 + }, + { + "epoch": 0.0790986433662911, + "grad_norm": 0.90234375, + "learning_rate": 4.9272372183703594e-05, + "loss": 0.8228, + "step": 1075 + }, + { + "epoch": 0.0791722234996551, + "grad_norm": 0.7890625, + "learning_rate": 4.9270984410775125e-05, + "loss": 0.7071, + "step": 1076 + }, + { + "epoch": 0.07924580363301909, + "grad_norm": 0.87109375, + "learning_rate": 4.926959533526876e-05, + "loss": 0.98, + "step": 1077 + }, + { + "epoch": 0.07931938376638308, + "grad_norm": 0.91796875, + "learning_rate": 4.926820495725905e-05, + "loss": 0.968, + "step": 1078 + }, + { + "epoch": 0.07939296389974707, + "grad_norm": 1.09375, + "learning_rate": 4.926681327682061e-05, + "loss": 1.0755, + "step": 1079 + }, + { + "epoch": 0.07946654403311106, + "grad_norm": 0.94140625, + "learning_rate": 4.926542029402815e-05, + "loss": 1.4966, + "step": 1080 + }, + { + "epoch": 0.07954012416647505, + "grad_norm": 1.0625, + "learning_rate": 4.9264026008956403e-05, + "loss": 0.8471, + "step": 1081 + }, + { + "epoch": 0.07961370429983905, + "grad_norm": 0.6875, + "learning_rate": 4.926263042168021e-05, + "loss": 0.5744, + "step": 1082 + }, + { + "epoch": 0.07968728443320304, + "grad_norm": 0.92578125, + "learning_rate": 4.926123353227447e-05, + "loss": 1.1622, + "step": 1083 + }, + { + "epoch": 0.07976086456656703, + "grad_norm": 0.84765625, + "learning_rate": 4.925983534081416e-05, + "loss": 0.9022, + "step": 1084 + }, + { + "epoch": 0.07983444469993102, + "grad_norm": 0.90625, + "learning_rate": 4.9258435847374286e-05, + "loss": 0.7833, + "step": 1085 + }, + { + "epoch": 0.07990802483329501, + "grad_norm": 1.140625, + "learning_rate": 4.925703505202999e-05, + "loss": 0.8629, + "step": 1086 + }, + { + "epoch": 0.079981604966659, + "grad_norm": 1.2421875, + "learning_rate": 4.925563295485644e-05, + "loss": 1.7629, + "step": 1087 + }, + { + "epoch": 0.08005518510002299, + "grad_norm": 1.046875, + "learning_rate": 4.925422955592887e-05, + "loss": 1.1017, + "step": 1088 + }, + { + "epoch": 0.08012876523338698, + "grad_norm": 1.109375, + "learning_rate": 4.9252824855322624e-05, + "loss": 1.2518, + "step": 1089 + }, + { + "epoch": 0.08020234536675097, + "grad_norm": 0.84765625, + "learning_rate": 4.925141885311306e-05, + "loss": 1.0385, + "step": 1090 + }, + { + "epoch": 0.08027592550011498, + "grad_norm": 0.94921875, + "learning_rate": 4.9250011549375664e-05, + "loss": 0.9763, + "step": 1091 + }, + { + "epoch": 0.08034950563347897, + "grad_norm": 0.94921875, + "learning_rate": 4.924860294418594e-05, + "loss": 1.0588, + "step": 1092 + }, + { + "epoch": 0.08042308576684296, + "grad_norm": 0.92578125, + "learning_rate": 4.92471930376195e-05, + "loss": 0.6802, + "step": 1093 + }, + { + "epoch": 0.08049666590020695, + "grad_norm": 1.078125, + "learning_rate": 4.924578182975201e-05, + "loss": 0.8913, + "step": 1094 + }, + { + "epoch": 0.08057024603357094, + "grad_norm": 2.5, + "learning_rate": 4.9244369320659204e-05, + "loss": 1.4806, + "step": 1095 + }, + { + "epoch": 0.08064382616693493, + "grad_norm": 1.2578125, + "learning_rate": 4.9242955510416877e-05, + "loss": 1.084, + "step": 1096 + }, + { + "epoch": 0.08071740630029892, + "grad_norm": 1.1328125, + "learning_rate": 4.924154039910092e-05, + "loss": 0.9423, + "step": 1097 + }, + { + "epoch": 0.0807909864336629, + "grad_norm": 1.078125, + "learning_rate": 4.924012398678728e-05, + "loss": 1.2512, + "step": 1098 + }, + { + "epoch": 0.0808645665670269, + "grad_norm": 0.8515625, + "learning_rate": 4.923870627355196e-05, + "loss": 0.8654, + "step": 1099 + }, + { + "epoch": 0.0809381467003909, + "grad_norm": 0.8984375, + "learning_rate": 4.923728725947106e-05, + "loss": 0.7415, + "step": 1100 + }, + { + "epoch": 0.08101172683375489, + "grad_norm": 0.88671875, + "learning_rate": 4.923586694462073e-05, + "loss": 0.9244, + "step": 1101 + }, + { + "epoch": 0.08108530696711888, + "grad_norm": 0.99609375, + "learning_rate": 4.9234445329077207e-05, + "loss": 0.7599, + "step": 1102 + }, + { + "epoch": 0.08115888710048287, + "grad_norm": 0.88671875, + "learning_rate": 4.9233022412916766e-05, + "loss": 0.9252, + "step": 1103 + }, + { + "epoch": 0.08123246723384686, + "grad_norm": 0.8828125, + "learning_rate": 4.923159819621578e-05, + "loss": 0.7523, + "step": 1104 + }, + { + "epoch": 0.08130604736721085, + "grad_norm": 0.875, + "learning_rate": 4.9230172679050686e-05, + "loss": 1.0046, + "step": 1105 + }, + { + "epoch": 0.08137962750057484, + "grad_norm": 0.90234375, + "learning_rate": 4.922874586149799e-05, + "loss": 0.7878, + "step": 1106 + }, + { + "epoch": 0.08145320763393883, + "grad_norm": 0.8828125, + "learning_rate": 4.922731774363426e-05, + "loss": 0.9076, + "step": 1107 + }, + { + "epoch": 0.08152678776730282, + "grad_norm": 1.203125, + "learning_rate": 4.9225888325536154e-05, + "loss": 1.5259, + "step": 1108 + }, + { + "epoch": 0.08160036790066683, + "grad_norm": 0.75, + "learning_rate": 4.922445760728037e-05, + "loss": 0.7926, + "step": 1109 + }, + { + "epoch": 0.08167394803403082, + "grad_norm": 1.0, + "learning_rate": 4.9223025588943704e-05, + "loss": 0.9363, + "step": 1110 + }, + { + "epoch": 0.0817475281673948, + "grad_norm": 0.91796875, + "learning_rate": 4.922159227060301e-05, + "loss": 0.8192, + "step": 1111 + }, + { + "epoch": 0.0818211083007588, + "grad_norm": 0.99609375, + "learning_rate": 4.92201576523352e-05, + "loss": 1.3752, + "step": 1112 + }, + { + "epoch": 0.08189468843412279, + "grad_norm": 0.8671875, + "learning_rate": 4.9218721734217274e-05, + "loss": 0.7666, + "step": 1113 + }, + { + "epoch": 0.08196826856748678, + "grad_norm": 1.0546875, + "learning_rate": 4.9217284516326304e-05, + "loss": 1.1508, + "step": 1114 + }, + { + "epoch": 0.08204184870085077, + "grad_norm": 3.3125, + "learning_rate": 4.921584599873941e-05, + "loss": 0.8317, + "step": 1115 + }, + { + "epoch": 0.08211542883421476, + "grad_norm": 1.03125, + "learning_rate": 4.9214406181533795e-05, + "loss": 1.1477, + "step": 1116 + }, + { + "epoch": 0.08218900896757875, + "grad_norm": 0.859375, + "learning_rate": 4.921296506478674e-05, + "loss": 0.7427, + "step": 1117 + }, + { + "epoch": 0.08226258910094275, + "grad_norm": 0.8828125, + "learning_rate": 4.921152264857557e-05, + "loss": 0.7214, + "step": 1118 + }, + { + "epoch": 0.08233616923430674, + "grad_norm": 0.890625, + "learning_rate": 4.921007893297772e-05, + "loss": 0.8582, + "step": 1119 + }, + { + "epoch": 0.08240974936767073, + "grad_norm": 0.74609375, + "learning_rate": 4.920863391807066e-05, + "loss": 0.7496, + "step": 1120 + }, + { + "epoch": 0.08248332950103472, + "grad_norm": 0.7578125, + "learning_rate": 4.920718760393194e-05, + "loss": 0.6417, + "step": 1121 + }, + { + "epoch": 0.08255690963439871, + "grad_norm": 1.203125, + "learning_rate": 4.920573999063918e-05, + "loss": 1.1525, + "step": 1122 + }, + { + "epoch": 0.0826304897677627, + "grad_norm": 1.28125, + "learning_rate": 4.920429107827007e-05, + "loss": 1.1709, + "step": 1123 + }, + { + "epoch": 0.08270406990112669, + "grad_norm": 1.015625, + "learning_rate": 4.9202840866902374e-05, + "loss": 0.9924, + "step": 1124 + }, + { + "epoch": 0.08277765003449068, + "grad_norm": 0.95703125, + "learning_rate": 4.9201389356613925e-05, + "loss": 1.2457, + "step": 1125 + }, + { + "epoch": 0.08285123016785467, + "grad_norm": 0.87109375, + "learning_rate": 4.919993654748262e-05, + "loss": 0.7907, + "step": 1126 + }, + { + "epoch": 0.08292481030121868, + "grad_norm": 0.9296875, + "learning_rate": 4.919848243958642e-05, + "loss": 0.8108, + "step": 1127 + }, + { + "epoch": 0.08299839043458267, + "grad_norm": 1.109375, + "learning_rate": 4.9197027033003376e-05, + "loss": 0.9923, + "step": 1128 + }, + { + "epoch": 0.08307197056794666, + "grad_norm": 0.85546875, + "learning_rate": 4.9195570327811595e-05, + "loss": 0.747, + "step": 1129 + }, + { + "epoch": 0.08314555070131065, + "grad_norm": 0.73828125, + "learning_rate": 4.9194112324089256e-05, + "loss": 0.7146, + "step": 1130 + }, + { + "epoch": 0.08321913083467464, + "grad_norm": 1.5546875, + "learning_rate": 4.91926530219146e-05, + "loss": 0.8503, + "step": 1131 + }, + { + "epoch": 0.08329271096803863, + "grad_norm": 0.921875, + "learning_rate": 4.919119242136595e-05, + "loss": 0.8781, + "step": 1132 + }, + { + "epoch": 0.08336629110140262, + "grad_norm": 0.93359375, + "learning_rate": 4.918973052252169e-05, + "loss": 0.7988, + "step": 1133 + }, + { + "epoch": 0.08343987123476661, + "grad_norm": 0.8203125, + "learning_rate": 4.918826732546029e-05, + "loss": 0.923, + "step": 1134 + }, + { + "epoch": 0.0835134513681306, + "grad_norm": 0.7421875, + "learning_rate": 4.918680283026026e-05, + "loss": 0.6523, + "step": 1135 + }, + { + "epoch": 0.0835870315014946, + "grad_norm": 0.99609375, + "learning_rate": 4.9185337037000215e-05, + "loss": 1.0275, + "step": 1136 + }, + { + "epoch": 0.08366061163485859, + "grad_norm": 1.25, + "learning_rate": 4.9183869945758795e-05, + "loss": 1.8992, + "step": 1137 + }, + { + "epoch": 0.08373419176822258, + "grad_norm": 0.80859375, + "learning_rate": 4.918240155661477e-05, + "loss": 1.0567, + "step": 1138 + }, + { + "epoch": 0.08380777190158657, + "grad_norm": 0.984375, + "learning_rate": 4.9180931869646904e-05, + "loss": 0.8504, + "step": 1139 + }, + { + "epoch": 0.08388135203495056, + "grad_norm": 1.296875, + "learning_rate": 4.917946088493412e-05, + "loss": 1.3163, + "step": 1140 + }, + { + "epoch": 0.08395493216831455, + "grad_norm": 0.875, + "learning_rate": 4.917798860255533e-05, + "loss": 0.9608, + "step": 1141 + }, + { + "epoch": 0.08402851230167854, + "grad_norm": 0.82421875, + "learning_rate": 4.917651502258955e-05, + "loss": 0.8979, + "step": 1142 + }, + { + "epoch": 0.08410209243504253, + "grad_norm": 0.8671875, + "learning_rate": 4.917504014511587e-05, + "loss": 0.7785, + "step": 1143 + }, + { + "epoch": 0.08417567256840654, + "grad_norm": 0.91796875, + "learning_rate": 4.917356397021346e-05, + "loss": 0.8761, + "step": 1144 + }, + { + "epoch": 0.08424925270177053, + "grad_norm": 0.99609375, + "learning_rate": 4.917208649796152e-05, + "loss": 0.8724, + "step": 1145 + }, + { + "epoch": 0.08432283283513452, + "grad_norm": 0.984375, + "learning_rate": 4.9170607728439355e-05, + "loss": 0.8965, + "step": 1146 + }, + { + "epoch": 0.08439641296849851, + "grad_norm": 0.875, + "learning_rate": 4.916912766172632e-05, + "loss": 0.82, + "step": 1147 + }, + { + "epoch": 0.0844699931018625, + "grad_norm": 0.83203125, + "learning_rate": 4.916764629790186e-05, + "loss": 0.9645, + "step": 1148 + }, + { + "epoch": 0.08454357323522649, + "grad_norm": 0.984375, + "learning_rate": 4.916616363704547e-05, + "loss": 0.9855, + "step": 1149 + }, + { + "epoch": 0.08461715336859048, + "grad_norm": 0.921875, + "learning_rate": 4.916467967923671e-05, + "loss": 0.7346, + "step": 1150 + }, + { + "epoch": 0.08469073350195447, + "grad_norm": 0.80859375, + "learning_rate": 4.916319442455524e-05, + "loss": 0.8516, + "step": 1151 + }, + { + "epoch": 0.08476431363531846, + "grad_norm": 1.015625, + "learning_rate": 4.916170787308076e-05, + "loss": 0.9306, + "step": 1152 + }, + { + "epoch": 0.08483789376868246, + "grad_norm": 0.91796875, + "learning_rate": 4.9160220024893064e-05, + "loss": 1.1613, + "step": 1153 + }, + { + "epoch": 0.08491147390204645, + "grad_norm": 0.76171875, + "learning_rate": 4.915873088007198e-05, + "loss": 0.7963, + "step": 1154 + }, + { + "epoch": 0.08498505403541044, + "grad_norm": 1.1171875, + "learning_rate": 4.9157240438697446e-05, + "loss": 0.9718, + "step": 1155 + }, + { + "epoch": 0.08505863416877443, + "grad_norm": 1.0078125, + "learning_rate": 4.915574870084944e-05, + "loss": 1.3683, + "step": 1156 + }, + { + "epoch": 0.08513221430213842, + "grad_norm": 0.85546875, + "learning_rate": 4.9154255666608026e-05, + "loss": 0.5786, + "step": 1157 + }, + { + "epoch": 0.08520579443550241, + "grad_norm": 1.1875, + "learning_rate": 4.915276133605333e-05, + "loss": 0.793, + "step": 1158 + }, + { + "epoch": 0.0852793745688664, + "grad_norm": 1.0703125, + "learning_rate": 4.915126570926555e-05, + "loss": 1.0918, + "step": 1159 + }, + { + "epoch": 0.0853529547022304, + "grad_norm": 3.828125, + "learning_rate": 4.914976878632496e-05, + "loss": 1.3251, + "step": 1160 + }, + { + "epoch": 0.08542653483559438, + "grad_norm": 0.98828125, + "learning_rate": 4.914827056731188e-05, + "loss": 0.9405, + "step": 1161 + }, + { + "epoch": 0.08550011496895839, + "grad_norm": 0.73828125, + "learning_rate": 4.914677105230674e-05, + "loss": 0.6488, + "step": 1162 + }, + { + "epoch": 0.08557369510232238, + "grad_norm": 1.1171875, + "learning_rate": 4.914527024139e-05, + "loss": 1.388, + "step": 1163 + }, + { + "epoch": 0.08564727523568637, + "grad_norm": 0.98828125, + "learning_rate": 4.914376813464221e-05, + "loss": 0.9075, + "step": 1164 + }, + { + "epoch": 0.08572085536905036, + "grad_norm": 0.75, + "learning_rate": 4.914226473214399e-05, + "loss": 0.6648, + "step": 1165 + }, + { + "epoch": 0.08579443550241435, + "grad_norm": 1.1640625, + "learning_rate": 4.9140760033976016e-05, + "loss": 1.1211, + "step": 1166 + }, + { + "epoch": 0.08586801563577834, + "grad_norm": 0.765625, + "learning_rate": 4.913925404021905e-05, + "loss": 0.6689, + "step": 1167 + }, + { + "epoch": 0.08594159576914233, + "grad_norm": 0.84765625, + "learning_rate": 4.9137746750953907e-05, + "loss": 0.8045, + "step": 1168 + }, + { + "epoch": 0.08601517590250632, + "grad_norm": 0.984375, + "learning_rate": 4.9136238166261485e-05, + "loss": 0.9566, + "step": 1169 + }, + { + "epoch": 0.08608875603587031, + "grad_norm": 0.9609375, + "learning_rate": 4.913472828622275e-05, + "loss": 1.16, + "step": 1170 + }, + { + "epoch": 0.08616233616923431, + "grad_norm": 1.046875, + "learning_rate": 4.9133217110918726e-05, + "loss": 1.3695, + "step": 1171 + }, + { + "epoch": 0.0862359163025983, + "grad_norm": 0.92578125, + "learning_rate": 4.913170464043053e-05, + "loss": 0.8043, + "step": 1172 + }, + { + "epoch": 0.0863094964359623, + "grad_norm": 1.1484375, + "learning_rate": 4.913019087483932e-05, + "loss": 1.3014, + "step": 1173 + }, + { + "epoch": 0.08638307656932628, + "grad_norm": 1.0234375, + "learning_rate": 4.9128675814226346e-05, + "loss": 1.0793, + "step": 1174 + }, + { + "epoch": 0.08645665670269027, + "grad_norm": 0.90234375, + "learning_rate": 4.912715945867291e-05, + "loss": 1.1457, + "step": 1175 + }, + { + "epoch": 0.08653023683605426, + "grad_norm": 0.80078125, + "learning_rate": 4.9125641808260395e-05, + "loss": 0.7501, + "step": 1176 + }, + { + "epoch": 0.08660381696941825, + "grad_norm": 1.0390625, + "learning_rate": 4.9124122863070255e-05, + "loss": 0.9969, + "step": 1177 + }, + { + "epoch": 0.08667739710278224, + "grad_norm": 0.97265625, + "learning_rate": 4.9122602623184e-05, + "loss": 0.8683, + "step": 1178 + }, + { + "epoch": 0.08675097723614623, + "grad_norm": 0.828125, + "learning_rate": 4.912108108868322e-05, + "loss": 0.7663, + "step": 1179 + }, + { + "epoch": 0.08682455736951024, + "grad_norm": 0.96484375, + "learning_rate": 4.911955825964958e-05, + "loss": 0.8899, + "step": 1180 + }, + { + "epoch": 0.08689813750287423, + "grad_norm": 0.71484375, + "learning_rate": 4.91180341361648e-05, + "loss": 0.7079, + "step": 1181 + }, + { + "epoch": 0.08697171763623822, + "grad_norm": 0.95703125, + "learning_rate": 4.91165087183107e-05, + "loss": 0.8011, + "step": 1182 + }, + { + "epoch": 0.08704529776960221, + "grad_norm": 1.0546875, + "learning_rate": 4.911498200616911e-05, + "loss": 1.0859, + "step": 1183 + }, + { + "epoch": 0.0871188779029662, + "grad_norm": 0.9609375, + "learning_rate": 4.911345399982198e-05, + "loss": 1.1429, + "step": 1184 + }, + { + "epoch": 0.08719245803633019, + "grad_norm": 0.8671875, + "learning_rate": 4.911192469935132e-05, + "loss": 0.8551, + "step": 1185 + }, + { + "epoch": 0.08726603816969418, + "grad_norm": 0.78515625, + "learning_rate": 4.9110394104839206e-05, + "loss": 0.9391, + "step": 1186 + }, + { + "epoch": 0.08733961830305817, + "grad_norm": 1.046875, + "learning_rate": 4.910886221636777e-05, + "loss": 1.4532, + "step": 1187 + }, + { + "epoch": 0.08741319843642216, + "grad_norm": 0.94140625, + "learning_rate": 4.9107329034019245e-05, + "loss": 0.7465, + "step": 1188 + }, + { + "epoch": 0.08748677856978616, + "grad_norm": 1.015625, + "learning_rate": 4.910579455787589e-05, + "loss": 1.4572, + "step": 1189 + }, + { + "epoch": 0.08756035870315015, + "grad_norm": 1.0078125, + "learning_rate": 4.910425878802008e-05, + "loss": 0.8336, + "step": 1190 + }, + { + "epoch": 0.08763393883651414, + "grad_norm": 1.1015625, + "learning_rate": 4.9102721724534233e-05, + "loss": 0.9518, + "step": 1191 + }, + { + "epoch": 0.08770751896987813, + "grad_norm": 0.8515625, + "learning_rate": 4.910118336750083e-05, + "loss": 1.0194, + "step": 1192 + }, + { + "epoch": 0.08778109910324212, + "grad_norm": 1.1328125, + "learning_rate": 4.909964371700243e-05, + "loss": 1.04, + "step": 1193 + }, + { + "epoch": 0.08785467923660611, + "grad_norm": 1.3125, + "learning_rate": 4.909810277312168e-05, + "loss": 1.8463, + "step": 1194 + }, + { + "epoch": 0.0879282593699701, + "grad_norm": 0.76953125, + "learning_rate": 4.9096560535941264e-05, + "loss": 0.7999, + "step": 1195 + }, + { + "epoch": 0.0880018395033341, + "grad_norm": 1.171875, + "learning_rate": 4.9095017005543956e-05, + "loss": 1.28, + "step": 1196 + }, + { + "epoch": 0.08807541963669809, + "grad_norm": 1.109375, + "learning_rate": 4.909347218201259e-05, + "loss": 1.0784, + "step": 1197 + }, + { + "epoch": 0.08814899977006209, + "grad_norm": 0.921875, + "learning_rate": 4.9091926065430084e-05, + "loss": 0.7425, + "step": 1198 + }, + { + "epoch": 0.08822257990342608, + "grad_norm": 0.91796875, + "learning_rate": 4.90903786558794e-05, + "loss": 0.8728, + "step": 1199 + }, + { + "epoch": 0.08829616003679007, + "grad_norm": 1.109375, + "learning_rate": 4.9088829953443605e-05, + "loss": 1.4336, + "step": 1200 + }, + { + "epoch": 0.08836974017015406, + "grad_norm": 0.921875, + "learning_rate": 4.9087279958205794e-05, + "loss": 0.9031, + "step": 1201 + }, + { + "epoch": 0.08844332030351805, + "grad_norm": 0.75, + "learning_rate": 4.908572867024917e-05, + "loss": 0.7032, + "step": 1202 + }, + { + "epoch": 0.08851690043688204, + "grad_norm": 0.95703125, + "learning_rate": 4.9084176089656975e-05, + "loss": 0.9688, + "step": 1203 + }, + { + "epoch": 0.08859048057024603, + "grad_norm": 1.09375, + "learning_rate": 4.908262221651254e-05, + "loss": 1.2545, + "step": 1204 + }, + { + "epoch": 0.08866406070361002, + "grad_norm": 1.0, + "learning_rate": 4.9081067050899257e-05, + "loss": 1.2364, + "step": 1205 + }, + { + "epoch": 0.08873764083697401, + "grad_norm": 1.1640625, + "learning_rate": 4.9079510592900583e-05, + "loss": 1.2256, + "step": 1206 + }, + { + "epoch": 0.08881122097033801, + "grad_norm": 1.0078125, + "learning_rate": 4.907795284260006e-05, + "loss": 1.3518, + "step": 1207 + }, + { + "epoch": 0.088884801103702, + "grad_norm": 1.0234375, + "learning_rate": 4.907639380008129e-05, + "loss": 1.0418, + "step": 1208 + }, + { + "epoch": 0.088958381237066, + "grad_norm": 1.0703125, + "learning_rate": 4.9074833465427925e-05, + "loss": 0.8984, + "step": 1209 + }, + { + "epoch": 0.08903196137042999, + "grad_norm": 0.87890625, + "learning_rate": 4.9073271838723734e-05, + "loss": 0.8314, + "step": 1210 + }, + { + "epoch": 0.08910554150379398, + "grad_norm": 1.0859375, + "learning_rate": 4.90717089200525e-05, + "loss": 0.9828, + "step": 1211 + }, + { + "epoch": 0.08917912163715797, + "grad_norm": 0.921875, + "learning_rate": 4.907014470949812e-05, + "loss": 0.8529, + "step": 1212 + }, + { + "epoch": 0.08925270177052196, + "grad_norm": 1.0546875, + "learning_rate": 4.9068579207144525e-05, + "loss": 1.2085, + "step": 1213 + }, + { + "epoch": 0.08932628190388595, + "grad_norm": 1.0625, + "learning_rate": 4.906701241307575e-05, + "loss": 1.207, + "step": 1214 + }, + { + "epoch": 0.08939986203724994, + "grad_norm": 0.96484375, + "learning_rate": 4.9065444327375876e-05, + "loss": 1.1959, + "step": 1215 + }, + { + "epoch": 0.08947344217061394, + "grad_norm": 0.9140625, + "learning_rate": 4.9063874950129066e-05, + "loss": 1.0856, + "step": 1216 + }, + { + "epoch": 0.08954702230397793, + "grad_norm": 0.8828125, + "learning_rate": 4.9062304281419525e-05, + "loss": 0.9216, + "step": 1217 + }, + { + "epoch": 0.08962060243734192, + "grad_norm": 1.078125, + "learning_rate": 4.906073232133157e-05, + "loss": 1.2706, + "step": 1218 + }, + { + "epoch": 0.08969418257070591, + "grad_norm": 0.91015625, + "learning_rate": 4.905915906994955e-05, + "loss": 0.9168, + "step": 1219 + }, + { + "epoch": 0.0897677627040699, + "grad_norm": 1.1328125, + "learning_rate": 4.9057584527357894e-05, + "loss": 1.1207, + "step": 1220 + }, + { + "epoch": 0.08984134283743389, + "grad_norm": 0.7890625, + "learning_rate": 4.905600869364113e-05, + "loss": 0.8928, + "step": 1221 + }, + { + "epoch": 0.08991492297079788, + "grad_norm": 1.0546875, + "learning_rate": 4.905443156888381e-05, + "loss": 1.1832, + "step": 1222 + }, + { + "epoch": 0.08998850310416187, + "grad_norm": 0.91796875, + "learning_rate": 4.905285315317058e-05, + "loss": 0.9636, + "step": 1223 + }, + { + "epoch": 0.09006208323752586, + "grad_norm": 0.81640625, + "learning_rate": 4.905127344658615e-05, + "loss": 0.7163, + "step": 1224 + }, + { + "epoch": 0.09013566337088987, + "grad_norm": 1.3046875, + "learning_rate": 4.90496924492153e-05, + "loss": 1.7359, + "step": 1225 + }, + { + "epoch": 0.09020924350425386, + "grad_norm": 0.984375, + "learning_rate": 4.904811016114288e-05, + "loss": 1.1708, + "step": 1226 + }, + { + "epoch": 0.09028282363761785, + "grad_norm": 1.0859375, + "learning_rate": 4.9046526582453814e-05, + "loss": 0.8912, + "step": 1227 + }, + { + "epoch": 0.09035640377098184, + "grad_norm": 0.85546875, + "learning_rate": 4.904494171323307e-05, + "loss": 0.7695, + "step": 1228 + }, + { + "epoch": 0.09042998390434583, + "grad_norm": 1.0390625, + "learning_rate": 4.904335555356573e-05, + "loss": 0.9219, + "step": 1229 + }, + { + "epoch": 0.09050356403770982, + "grad_norm": 0.8984375, + "learning_rate": 4.9041768103536904e-05, + "loss": 0.8511, + "step": 1230 + }, + { + "epoch": 0.0905771441710738, + "grad_norm": 0.92578125, + "learning_rate": 4.90401793632318e-05, + "loss": 0.8241, + "step": 1231 + }, + { + "epoch": 0.0906507243044378, + "grad_norm": 0.99609375, + "learning_rate": 4.903858933273566e-05, + "loss": 1.0765, + "step": 1232 + }, + { + "epoch": 0.0907243044378018, + "grad_norm": 1.0234375, + "learning_rate": 4.903699801213385e-05, + "loss": 1.3338, + "step": 1233 + }, + { + "epoch": 0.09079788457116579, + "grad_norm": 0.89453125, + "learning_rate": 4.9035405401511745e-05, + "loss": 1.0775, + "step": 1234 + }, + { + "epoch": 0.09087146470452978, + "grad_norm": 1.0703125, + "learning_rate": 4.9033811500954836e-05, + "loss": 0.9106, + "step": 1235 + }, + { + "epoch": 0.09094504483789377, + "grad_norm": 0.7109375, + "learning_rate": 4.903221631054865e-05, + "loss": 0.6733, + "step": 1236 + }, + { + "epoch": 0.09101862497125776, + "grad_norm": 0.87109375, + "learning_rate": 4.903061983037881e-05, + "loss": 0.9866, + "step": 1237 + }, + { + "epoch": 0.09109220510462175, + "grad_norm": 0.9140625, + "learning_rate": 4.902902206053099e-05, + "loss": 1.0376, + "step": 1238 + }, + { + "epoch": 0.09116578523798574, + "grad_norm": 0.859375, + "learning_rate": 4.902742300109094e-05, + "loss": 0.9408, + "step": 1239 + }, + { + "epoch": 0.09123936537134973, + "grad_norm": 1.0234375, + "learning_rate": 4.902582265214448e-05, + "loss": 1.5036, + "step": 1240 + }, + { + "epoch": 0.09131294550471372, + "grad_norm": 0.87109375, + "learning_rate": 4.9024221013777494e-05, + "loss": 1.1578, + "step": 1241 + }, + { + "epoch": 0.09138652563807773, + "grad_norm": 0.859375, + "learning_rate": 4.902261808607594e-05, + "loss": 0.8581, + "step": 1242 + }, + { + "epoch": 0.09146010577144172, + "grad_norm": 0.87109375, + "learning_rate": 4.902101386912585e-05, + "loss": 0.941, + "step": 1243 + }, + { + "epoch": 0.0915336859048057, + "grad_norm": 0.9921875, + "learning_rate": 4.9019408363013305e-05, + "loss": 0.9933, + "step": 1244 + }, + { + "epoch": 0.0916072660381697, + "grad_norm": 0.90625, + "learning_rate": 4.9017801567824494e-05, + "loss": 0.9652, + "step": 1245 + }, + { + "epoch": 0.09168084617153369, + "grad_norm": 1.0, + "learning_rate": 4.901619348364563e-05, + "loss": 1.0716, + "step": 1246 + }, + { + "epoch": 0.09175442630489768, + "grad_norm": 1.1171875, + "learning_rate": 4.901458411056302e-05, + "loss": 1.1948, + "step": 1247 + }, + { + "epoch": 0.09182800643826167, + "grad_norm": 0.9453125, + "learning_rate": 4.901297344866304e-05, + "loss": 0.9243, + "step": 1248 + }, + { + "epoch": 0.09190158657162566, + "grad_norm": 1.015625, + "learning_rate": 4.901136149803213e-05, + "loss": 1.1386, + "step": 1249 + }, + { + "epoch": 0.09197516670498965, + "grad_norm": 1.0390625, + "learning_rate": 4.900974825875679e-05, + "loss": 1.0933, + "step": 1250 + }, + { + "epoch": 0.09204874683835365, + "grad_norm": 0.71484375, + "learning_rate": 4.900813373092362e-05, + "loss": 0.8543, + "step": 1251 + }, + { + "epoch": 0.09212232697171764, + "grad_norm": 0.97265625, + "learning_rate": 4.900651791461926e-05, + "loss": 0.9306, + "step": 1252 + }, + { + "epoch": 0.09219590710508163, + "grad_norm": 0.82421875, + "learning_rate": 4.900490080993042e-05, + "loss": 0.8872, + "step": 1253 + }, + { + "epoch": 0.09226948723844562, + "grad_norm": 1.0546875, + "learning_rate": 4.900328241694389e-05, + "loss": 1.4837, + "step": 1254 + }, + { + "epoch": 0.09234306737180961, + "grad_norm": 0.75, + "learning_rate": 4.900166273574653e-05, + "loss": 0.508, + "step": 1255 + }, + { + "epoch": 0.0924166475051736, + "grad_norm": 0.87890625, + "learning_rate": 4.9000041766425264e-05, + "loss": 0.8884, + "step": 1256 + }, + { + "epoch": 0.09249022763853759, + "grad_norm": 0.80859375, + "learning_rate": 4.8998419509067086e-05, + "loss": 0.776, + "step": 1257 + }, + { + "epoch": 0.09256380777190158, + "grad_norm": 1.0078125, + "learning_rate": 4.899679596375906e-05, + "loss": 0.9784, + "step": 1258 + }, + { + "epoch": 0.09263738790526557, + "grad_norm": 1.1015625, + "learning_rate": 4.899517113058831e-05, + "loss": 1.8613, + "step": 1259 + }, + { + "epoch": 0.09271096803862958, + "grad_norm": 1.109375, + "learning_rate": 4.899354500964206e-05, + "loss": 1.2688, + "step": 1260 + }, + { + "epoch": 0.09278454817199357, + "grad_norm": 0.78515625, + "learning_rate": 4.8991917601007566e-05, + "loss": 0.7221, + "step": 1261 + }, + { + "epoch": 0.09285812830535756, + "grad_norm": 1.9296875, + "learning_rate": 4.899028890477216e-05, + "loss": 0.9086, + "step": 1262 + }, + { + "epoch": 0.09293170843872155, + "grad_norm": 0.83203125, + "learning_rate": 4.898865892102326e-05, + "loss": 0.8697, + "step": 1263 + }, + { + "epoch": 0.09300528857208554, + "grad_norm": 1.0625, + "learning_rate": 4.898702764984835e-05, + "loss": 1.2328, + "step": 1264 + }, + { + "epoch": 0.09307886870544953, + "grad_norm": 0.9375, + "learning_rate": 4.898539509133496e-05, + "loss": 1.3106, + "step": 1265 + }, + { + "epoch": 0.09315244883881352, + "grad_norm": 0.828125, + "learning_rate": 4.898376124557073e-05, + "loss": 0.9223, + "step": 1266 + }, + { + "epoch": 0.09322602897217751, + "grad_norm": 1.0625, + "learning_rate": 4.898212611264333e-05, + "loss": 1.0953, + "step": 1267 + }, + { + "epoch": 0.0932996091055415, + "grad_norm": 0.80859375, + "learning_rate": 4.898048969264051e-05, + "loss": 0.8959, + "step": 1268 + }, + { + "epoch": 0.0933731892389055, + "grad_norm": 0.9296875, + "learning_rate": 4.897885198565011e-05, + "loss": 0.7999, + "step": 1269 + }, + { + "epoch": 0.09344676937226949, + "grad_norm": 0.96875, + "learning_rate": 4.8977212991760014e-05, + "loss": 0.9548, + "step": 1270 + }, + { + "epoch": 0.09352034950563348, + "grad_norm": 0.671875, + "learning_rate": 4.897557271105817e-05, + "loss": 0.6872, + "step": 1271 + }, + { + "epoch": 0.09359392963899747, + "grad_norm": 0.9140625, + "learning_rate": 4.897393114363264e-05, + "loss": 0.8207, + "step": 1272 + }, + { + "epoch": 0.09366750977236146, + "grad_norm": 1.2265625, + "learning_rate": 4.8972288289571494e-05, + "loss": 1.1634, + "step": 1273 + }, + { + "epoch": 0.09374108990572545, + "grad_norm": 0.78515625, + "learning_rate": 4.8970644148962916e-05, + "loss": 0.9048, + "step": 1274 + }, + { + "epoch": 0.09381467003908944, + "grad_norm": 0.8046875, + "learning_rate": 4.8968998721895145e-05, + "loss": 1.0288, + "step": 1275 + }, + { + "epoch": 0.09388825017245343, + "grad_norm": 0.83984375, + "learning_rate": 4.896735200845647e-05, + "loss": 0.8646, + "step": 1276 + }, + { + "epoch": 0.09396183030581742, + "grad_norm": 0.94140625, + "learning_rate": 4.896570400873529e-05, + "loss": 0.87, + "step": 1277 + }, + { + "epoch": 0.09403541043918143, + "grad_norm": 0.83203125, + "learning_rate": 4.896405472282004e-05, + "loss": 1.0347, + "step": 1278 + }, + { + "epoch": 0.09410899057254542, + "grad_norm": 0.7109375, + "learning_rate": 4.8962404150799236e-05, + "loss": 0.841, + "step": 1279 + }, + { + "epoch": 0.09418257070590941, + "grad_norm": 0.73828125, + "learning_rate": 4.896075229276146e-05, + "loss": 0.8673, + "step": 1280 + }, + { + "epoch": 0.0942561508392734, + "grad_norm": 0.9921875, + "learning_rate": 4.8959099148795365e-05, + "loss": 0.9358, + "step": 1281 + }, + { + "epoch": 0.09432973097263739, + "grad_norm": 0.84765625, + "learning_rate": 4.895744471898967e-05, + "loss": 0.6897, + "step": 1282 + }, + { + "epoch": 0.09440331110600138, + "grad_norm": 0.88671875, + "learning_rate": 4.895578900343316e-05, + "loss": 0.806, + "step": 1283 + }, + { + "epoch": 0.09447689123936537, + "grad_norm": 1.203125, + "learning_rate": 4.89541320022147e-05, + "loss": 0.9246, + "step": 1284 + }, + { + "epoch": 0.09455047137272936, + "grad_norm": 1.140625, + "learning_rate": 4.895247371542323e-05, + "loss": 1.5952, + "step": 1285 + }, + { + "epoch": 0.09462405150609335, + "grad_norm": 1.09375, + "learning_rate": 4.8950814143147725e-05, + "loss": 1.1463, + "step": 1286 + }, + { + "epoch": 0.09469763163945735, + "grad_norm": 0.765625, + "learning_rate": 4.894915328547727e-05, + "loss": 0.7581, + "step": 1287 + }, + { + "epoch": 0.09477121177282134, + "grad_norm": 0.9296875, + "learning_rate": 4.894749114250098e-05, + "loss": 0.9762, + "step": 1288 + }, + { + "epoch": 0.09484479190618533, + "grad_norm": 0.89453125, + "learning_rate": 4.894582771430808e-05, + "loss": 0.6978, + "step": 1289 + }, + { + "epoch": 0.09491837203954932, + "grad_norm": 0.9375, + "learning_rate": 4.8944163000987834e-05, + "loss": 1.0474, + "step": 1290 + }, + { + "epoch": 0.09499195217291331, + "grad_norm": 0.94140625, + "learning_rate": 4.894249700262958e-05, + "loss": 0.8466, + "step": 1291 + }, + { + "epoch": 0.0950655323062773, + "grad_norm": 1.1015625, + "learning_rate": 4.894082971932274e-05, + "loss": 1.1338, + "step": 1292 + }, + { + "epoch": 0.0951391124396413, + "grad_norm": 1.0703125, + "learning_rate": 4.893916115115678e-05, + "loss": 1.1831, + "step": 1293 + }, + { + "epoch": 0.09521269257300528, + "grad_norm": 0.8125, + "learning_rate": 4.893749129822125e-05, + "loss": 1.0496, + "step": 1294 + }, + { + "epoch": 0.09528627270636927, + "grad_norm": 0.7578125, + "learning_rate": 4.893582016060578e-05, + "loss": 0.7504, + "step": 1295 + }, + { + "epoch": 0.09535985283973328, + "grad_norm": 0.75, + "learning_rate": 4.893414773840005e-05, + "loss": 0.673, + "step": 1296 + }, + { + "epoch": 0.09543343297309727, + "grad_norm": 1.0078125, + "learning_rate": 4.893247403169382e-05, + "loss": 1.3115, + "step": 1297 + }, + { + "epoch": 0.09550701310646126, + "grad_norm": 1.015625, + "learning_rate": 4.89307990405769e-05, + "loss": 0.915, + "step": 1298 + }, + { + "epoch": 0.09558059323982525, + "grad_norm": 0.83984375, + "learning_rate": 4.8929122765139206e-05, + "loss": 0.7835, + "step": 1299 + }, + { + "epoch": 0.09565417337318924, + "grad_norm": 1.0390625, + "learning_rate": 4.892744520547069e-05, + "loss": 1.058, + "step": 1300 + }, + { + "epoch": 0.09572775350655323, + "grad_norm": 0.86328125, + "learning_rate": 4.8925766361661376e-05, + "loss": 1.0108, + "step": 1301 + }, + { + "epoch": 0.09580133363991722, + "grad_norm": 1.09375, + "learning_rate": 4.892408623380137e-05, + "loss": 0.8173, + "step": 1302 + }, + { + "epoch": 0.09587491377328121, + "grad_norm": 1.09375, + "learning_rate": 4.892240482198084e-05, + "loss": 0.7766, + "step": 1303 + }, + { + "epoch": 0.0959484939066452, + "grad_norm": 0.7265625, + "learning_rate": 4.892072212629003e-05, + "loss": 0.6871, + "step": 1304 + }, + { + "epoch": 0.0960220740400092, + "grad_norm": 0.8359375, + "learning_rate": 4.8919038146819244e-05, + "loss": 1.2853, + "step": 1305 + }, + { + "epoch": 0.0960956541733732, + "grad_norm": 1.09375, + "learning_rate": 4.891735288365886e-05, + "loss": 1.4199, + "step": 1306 + }, + { + "epoch": 0.09616923430673718, + "grad_norm": 1.0546875, + "learning_rate": 4.8915666336899315e-05, + "loss": 1.5016, + "step": 1307 + }, + { + "epoch": 0.09624281444010117, + "grad_norm": 0.86328125, + "learning_rate": 4.8913978506631134e-05, + "loss": 0.9186, + "step": 1308 + }, + { + "epoch": 0.09631639457346516, + "grad_norm": 0.85546875, + "learning_rate": 4.8912289392944885e-05, + "loss": 0.9134, + "step": 1309 + }, + { + "epoch": 0.09638997470682915, + "grad_norm": 0.91015625, + "learning_rate": 4.8910598995931236e-05, + "loss": 0.7541, + "step": 1310 + }, + { + "epoch": 0.09646355484019314, + "grad_norm": 0.96484375, + "learning_rate": 4.890890731568089e-05, + "loss": 1.0797, + "step": 1311 + }, + { + "epoch": 0.09653713497355713, + "grad_norm": 0.94140625, + "learning_rate": 4.890721435228465e-05, + "loss": 1.0187, + "step": 1312 + }, + { + "epoch": 0.09661071510692112, + "grad_norm": 1.09375, + "learning_rate": 4.8905520105833375e-05, + "loss": 1.4667, + "step": 1313 + }, + { + "epoch": 0.09668429524028513, + "grad_norm": 0.9765625, + "learning_rate": 4.8903824576417986e-05, + "loss": 0.9393, + "step": 1314 + }, + { + "epoch": 0.09675787537364912, + "grad_norm": 1.171875, + "learning_rate": 4.890212776412948e-05, + "loss": 1.2802, + "step": 1315 + }, + { + "epoch": 0.09683145550701311, + "grad_norm": 0.93359375, + "learning_rate": 4.8900429669058914e-05, + "loss": 0.9362, + "step": 1316 + }, + { + "epoch": 0.0969050356403771, + "grad_norm": 1.171875, + "learning_rate": 4.889873029129743e-05, + "loss": 1.23, + "step": 1317 + }, + { + "epoch": 0.09697861577374109, + "grad_norm": 0.890625, + "learning_rate": 4.8897029630936244e-05, + "loss": 0.7691, + "step": 1318 + }, + { + "epoch": 0.09705219590710508, + "grad_norm": 0.890625, + "learning_rate": 4.88953276880666e-05, + "loss": 0.739, + "step": 1319 + }, + { + "epoch": 0.09712577604046907, + "grad_norm": 1.015625, + "learning_rate": 4.8893624462779854e-05, + "loss": 0.9665, + "step": 1320 + }, + { + "epoch": 0.09719935617383306, + "grad_norm": 1.0234375, + "learning_rate": 4.889191995516741e-05, + "loss": 1.2012, + "step": 1321 + }, + { + "epoch": 0.09727293630719705, + "grad_norm": 0.9765625, + "learning_rate": 4.8890214165320746e-05, + "loss": 0.9751, + "step": 1322 + }, + { + "epoch": 0.09734651644056105, + "grad_norm": 1.0859375, + "learning_rate": 4.888850709333141e-05, + "loss": 1.0713, + "step": 1323 + }, + { + "epoch": 0.09742009657392504, + "grad_norm": 1.015625, + "learning_rate": 4.888679873929103e-05, + "loss": 0.8512, + "step": 1324 + }, + { + "epoch": 0.09749367670728903, + "grad_norm": 1.015625, + "learning_rate": 4.888508910329126e-05, + "loss": 1.0594, + "step": 1325 + }, + { + "epoch": 0.09756725684065302, + "grad_norm": 1.2265625, + "learning_rate": 4.888337818542388e-05, + "loss": 1.2818, + "step": 1326 + }, + { + "epoch": 0.09764083697401701, + "grad_norm": 1.078125, + "learning_rate": 4.88816659857807e-05, + "loss": 1.1116, + "step": 1327 + }, + { + "epoch": 0.097714417107381, + "grad_norm": 0.71484375, + "learning_rate": 4.887995250445361e-05, + "loss": 0.6879, + "step": 1328 + }, + { + "epoch": 0.097787997240745, + "grad_norm": 1.09375, + "learning_rate": 4.8878237741534584e-05, + "loss": 1.156, + "step": 1329 + }, + { + "epoch": 0.09786157737410899, + "grad_norm": 0.9140625, + "learning_rate": 4.8876521697115627e-05, + "loss": 0.971, + "step": 1330 + }, + { + "epoch": 0.09793515750747299, + "grad_norm": 0.78515625, + "learning_rate": 4.887480437128885e-05, + "loss": 0.8463, + "step": 1331 + }, + { + "epoch": 0.09800873764083698, + "grad_norm": 1.0078125, + "learning_rate": 4.8873085764146406e-05, + "loss": 1.2374, + "step": 1332 + }, + { + "epoch": 0.09808231777420097, + "grad_norm": 1.015625, + "learning_rate": 4.887136587578055e-05, + "loss": 1.104, + "step": 1333 + }, + { + "epoch": 0.09815589790756496, + "grad_norm": 1.0, + "learning_rate": 4.886964470628357e-05, + "loss": 0.9857, + "step": 1334 + }, + { + "epoch": 0.09822947804092895, + "grad_norm": 0.84765625, + "learning_rate": 4.886792225574784e-05, + "loss": 0.6217, + "step": 1335 + }, + { + "epoch": 0.09830305817429294, + "grad_norm": 0.8046875, + "learning_rate": 4.88661985242658e-05, + "loss": 0.7217, + "step": 1336 + }, + { + "epoch": 0.09837663830765693, + "grad_norm": 1.2421875, + "learning_rate": 4.8864473511929975e-05, + "loss": 1.8299, + "step": 1337 + }, + { + "epoch": 0.09845021844102092, + "grad_norm": 0.88671875, + "learning_rate": 4.8862747218832924e-05, + "loss": 0.8521, + "step": 1338 + }, + { + "epoch": 0.09852379857438491, + "grad_norm": 0.98046875, + "learning_rate": 4.8861019645067296e-05, + "loss": 0.7501, + "step": 1339 + }, + { + "epoch": 0.09859737870774891, + "grad_norm": 0.94140625, + "learning_rate": 4.8859290790725816e-05, + "loss": 0.9327, + "step": 1340 + }, + { + "epoch": 0.0986709588411129, + "grad_norm": 1.171875, + "learning_rate": 4.885756065590126e-05, + "loss": 1.3556, + "step": 1341 + }, + { + "epoch": 0.0987445389744769, + "grad_norm": 0.91015625, + "learning_rate": 4.8855829240686486e-05, + "loss": 0.8993, + "step": 1342 + }, + { + "epoch": 0.09881811910784088, + "grad_norm": 1.0, + "learning_rate": 4.885409654517441e-05, + "loss": 0.963, + "step": 1343 + }, + { + "epoch": 0.09889169924120488, + "grad_norm": 1.2734375, + "learning_rate": 4.885236256945803e-05, + "loss": 1.1411, + "step": 1344 + }, + { + "epoch": 0.09896527937456887, + "grad_norm": 0.9296875, + "learning_rate": 4.8850627313630405e-05, + "loss": 0.8609, + "step": 1345 + }, + { + "epoch": 0.09903885950793286, + "grad_norm": 1.03125, + "learning_rate": 4.884889077778465e-05, + "loss": 0.8643, + "step": 1346 + }, + { + "epoch": 0.09911243964129685, + "grad_norm": 1.0078125, + "learning_rate": 4.8847152962013974e-05, + "loss": 0.8406, + "step": 1347 + }, + { + "epoch": 0.09918601977466084, + "grad_norm": 1.0859375, + "learning_rate": 4.884541386641165e-05, + "loss": 0.9565, + "step": 1348 + }, + { + "epoch": 0.09925959990802484, + "grad_norm": 0.9140625, + "learning_rate": 4.8843673491070984e-05, + "loss": 0.8559, + "step": 1349 + }, + { + "epoch": 0.09933318004138883, + "grad_norm": 0.90234375, + "learning_rate": 4.884193183608541e-05, + "loss": 1.0358, + "step": 1350 + }, + { + "epoch": 0.09940676017475282, + "grad_norm": 1.0234375, + "learning_rate": 4.8840188901548375e-05, + "loss": 1.3706, + "step": 1351 + }, + { + "epoch": 0.09948034030811681, + "grad_norm": 1.0390625, + "learning_rate": 4.883844468755344e-05, + "loss": 0.9408, + "step": 1352 + }, + { + "epoch": 0.0995539204414808, + "grad_norm": 0.7109375, + "learning_rate": 4.88366991941942e-05, + "loss": 0.655, + "step": 1353 + }, + { + "epoch": 0.09962750057484479, + "grad_norm": 0.984375, + "learning_rate": 4.883495242156433e-05, + "loss": 0.9964, + "step": 1354 + }, + { + "epoch": 0.09970108070820878, + "grad_norm": 1.6640625, + "learning_rate": 4.8833204369757586e-05, + "loss": 1.0287, + "step": 1355 + }, + { + "epoch": 0.09977466084157277, + "grad_norm": 0.8515625, + "learning_rate": 4.883145503886778e-05, + "loss": 1.0197, + "step": 1356 + }, + { + "epoch": 0.09984824097493676, + "grad_norm": 1.0859375, + "learning_rate": 4.882970442898879e-05, + "loss": 1.3703, + "step": 1357 + }, + { + "epoch": 0.09992182110830077, + "grad_norm": 0.85546875, + "learning_rate": 4.8827952540214564e-05, + "loss": 0.9072, + "step": 1358 + }, + { + "epoch": 0.09999540124166476, + "grad_norm": 0.67578125, + "learning_rate": 4.8826199372639136e-05, + "loss": 0.6855, + "step": 1359 + }, + { + "epoch": 0.10006898137502875, + "grad_norm": 0.9609375, + "learning_rate": 4.8824444926356593e-05, + "loss": 0.9797, + "step": 1360 + }, + { + "epoch": 0.10014256150839274, + "grad_norm": 0.66015625, + "learning_rate": 4.882268920146109e-05, + "loss": 0.7094, + "step": 1361 + }, + { + "epoch": 0.10021614164175673, + "grad_norm": 1.15625, + "learning_rate": 4.882093219804684e-05, + "loss": 1.0087, + "step": 1362 + }, + { + "epoch": 0.10028972177512072, + "grad_norm": 0.83984375, + "learning_rate": 4.881917391620816e-05, + "loss": 0.6034, + "step": 1363 + }, + { + "epoch": 0.1003633019084847, + "grad_norm": 1.0859375, + "learning_rate": 4.8817414356039406e-05, + "loss": 0.937, + "step": 1364 + }, + { + "epoch": 0.1004368820418487, + "grad_norm": 0.86328125, + "learning_rate": 4.8815653517635e-05, + "loss": 0.7939, + "step": 1365 + }, + { + "epoch": 0.10051046217521269, + "grad_norm": 1.09375, + "learning_rate": 4.881389140108946e-05, + "loss": 0.91, + "step": 1366 + }, + { + "epoch": 0.10058404230857669, + "grad_norm": 0.8203125, + "learning_rate": 4.8812128006497335e-05, + "loss": 0.9494, + "step": 1367 + }, + { + "epoch": 0.10065762244194068, + "grad_norm": 0.96484375, + "learning_rate": 4.881036333395329e-05, + "loss": 1.3114, + "step": 1368 + }, + { + "epoch": 0.10073120257530467, + "grad_norm": 0.83984375, + "learning_rate": 4.8808597383552e-05, + "loss": 0.831, + "step": 1369 + }, + { + "epoch": 0.10080478270866866, + "grad_norm": 1.078125, + "learning_rate": 4.880683015538827e-05, + "loss": 1.3189, + "step": 1370 + }, + { + "epoch": 0.10087836284203265, + "grad_norm": 1.3125, + "learning_rate": 4.8805061649556924e-05, + "loss": 1.0953, + "step": 1371 + }, + { + "epoch": 0.10095194297539664, + "grad_norm": 1.078125, + "learning_rate": 4.8803291866152876e-05, + "loss": 1.2109, + "step": 1372 + }, + { + "epoch": 0.10102552310876063, + "grad_norm": 0.859375, + "learning_rate": 4.880152080527112e-05, + "loss": 0.7837, + "step": 1373 + }, + { + "epoch": 0.10109910324212462, + "grad_norm": 0.96484375, + "learning_rate": 4.8799748467006694e-05, + "loss": 1.2422, + "step": 1374 + }, + { + "epoch": 0.10117268337548861, + "grad_norm": 1.3203125, + "learning_rate": 4.879797485145472e-05, + "loss": 1.377, + "step": 1375 + }, + { + "epoch": 0.10124626350885262, + "grad_norm": 1.0703125, + "learning_rate": 4.879619995871038e-05, + "loss": 1.1502, + "step": 1376 + }, + { + "epoch": 0.1013198436422166, + "grad_norm": 0.9453125, + "learning_rate": 4.8794423788868934e-05, + "loss": 1.0014, + "step": 1377 + }, + { + "epoch": 0.1013934237755806, + "grad_norm": 0.7578125, + "learning_rate": 4.879264634202571e-05, + "loss": 0.7864, + "step": 1378 + }, + { + "epoch": 0.10146700390894459, + "grad_norm": 1.046875, + "learning_rate": 4.879086761827609e-05, + "loss": 0.9298, + "step": 1379 + }, + { + "epoch": 0.10154058404230858, + "grad_norm": 1.0234375, + "learning_rate": 4.878908761771554e-05, + "loss": 0.9901, + "step": 1380 + }, + { + "epoch": 0.10161416417567257, + "grad_norm": 1.140625, + "learning_rate": 4.8787306340439587e-05, + "loss": 1.1022, + "step": 1381 + }, + { + "epoch": 0.10168774430903656, + "grad_norm": 1.0859375, + "learning_rate": 4.8785523786543836e-05, + "loss": 1.1433, + "step": 1382 + }, + { + "epoch": 0.10176132444240055, + "grad_norm": 0.8203125, + "learning_rate": 4.878373995612394e-05, + "loss": 0.9623, + "step": 1383 + }, + { + "epoch": 0.10183490457576454, + "grad_norm": 1.015625, + "learning_rate": 4.878195484927565e-05, + "loss": 0.8162, + "step": 1384 + }, + { + "epoch": 0.10190848470912854, + "grad_norm": 0.92578125, + "learning_rate": 4.8780168466094757e-05, + "loss": 1.0115, + "step": 1385 + }, + { + "epoch": 0.10198206484249253, + "grad_norm": 1.5390625, + "learning_rate": 4.877838080667714e-05, + "loss": 0.7371, + "step": 1386 + }, + { + "epoch": 0.10205564497585652, + "grad_norm": 0.7109375, + "learning_rate": 4.877659187111873e-05, + "loss": 0.659, + "step": 1387 + }, + { + "epoch": 0.10212922510922051, + "grad_norm": 0.96875, + "learning_rate": 4.877480165951555e-05, + "loss": 1.0697, + "step": 1388 + }, + { + "epoch": 0.1022028052425845, + "grad_norm": 0.81640625, + "learning_rate": 4.877301017196366e-05, + "loss": 0.9541, + "step": 1389 + }, + { + "epoch": 0.10227638537594849, + "grad_norm": 0.7734375, + "learning_rate": 4.877121740855922e-05, + "loss": 0.7799, + "step": 1390 + }, + { + "epoch": 0.10234996550931248, + "grad_norm": 0.890625, + "learning_rate": 4.876942336939844e-05, + "loss": 0.7024, + "step": 1391 + }, + { + "epoch": 0.10242354564267647, + "grad_norm": 0.7578125, + "learning_rate": 4.87676280545776e-05, + "loss": 0.9268, + "step": 1392 + }, + { + "epoch": 0.10249712577604046, + "grad_norm": 0.88671875, + "learning_rate": 4.876583146419305e-05, + "loss": 0.7755, + "step": 1393 + }, + { + "epoch": 0.10257070590940447, + "grad_norm": 0.72265625, + "learning_rate": 4.8764033598341214e-05, + "loss": 0.6314, + "step": 1394 + }, + { + "epoch": 0.10264428604276846, + "grad_norm": 1.09375, + "learning_rate": 4.876223445711858e-05, + "loss": 1.1173, + "step": 1395 + }, + { + "epoch": 0.10271786617613245, + "grad_norm": 0.75390625, + "learning_rate": 4.87604340406217e-05, + "loss": 0.6412, + "step": 1396 + }, + { + "epoch": 0.10279144630949644, + "grad_norm": 0.87890625, + "learning_rate": 4.875863234894721e-05, + "loss": 0.816, + "step": 1397 + }, + { + "epoch": 0.10286502644286043, + "grad_norm": 0.77734375, + "learning_rate": 4.8756829382191796e-05, + "loss": 0.7737, + "step": 1398 + }, + { + "epoch": 0.10293860657622442, + "grad_norm": 0.92578125, + "learning_rate": 4.87550251404522e-05, + "loss": 0.8287, + "step": 1399 + }, + { + "epoch": 0.10301218670958841, + "grad_norm": 0.81640625, + "learning_rate": 4.8753219623825296e-05, + "loss": 0.8514, + "step": 1400 + }, + { + "epoch": 0.1030857668429524, + "grad_norm": 1.2265625, + "learning_rate": 4.875141283240794e-05, + "loss": 1.509, + "step": 1401 + }, + { + "epoch": 0.10315934697631639, + "grad_norm": 0.921875, + "learning_rate": 4.874960476629713e-05, + "loss": 1.1237, + "step": 1402 + }, + { + "epoch": 0.10323292710968039, + "grad_norm": 0.9375, + "learning_rate": 4.874779542558988e-05, + "loss": 0.9198, + "step": 1403 + }, + { + "epoch": 0.10330650724304438, + "grad_norm": 0.9375, + "learning_rate": 4.8745984810383316e-05, + "loss": 0.9819, + "step": 1404 + }, + { + "epoch": 0.10338008737640837, + "grad_norm": 0.8515625, + "learning_rate": 4.874417292077458e-05, + "loss": 0.886, + "step": 1405 + }, + { + "epoch": 0.10345366750977236, + "grad_norm": 0.90625, + "learning_rate": 4.874235975686095e-05, + "loss": 1.3282, + "step": 1406 + }, + { + "epoch": 0.10352724764313635, + "grad_norm": 0.86328125, + "learning_rate": 4.874054531873969e-05, + "loss": 1.0641, + "step": 1407 + }, + { + "epoch": 0.10360082777650034, + "grad_norm": 0.88671875, + "learning_rate": 4.873872960650822e-05, + "loss": 0.8754, + "step": 1408 + }, + { + "epoch": 0.10367440790986433, + "grad_norm": 0.90234375, + "learning_rate": 4.873691262026396e-05, + "loss": 1.068, + "step": 1409 + }, + { + "epoch": 0.10374798804322832, + "grad_norm": 0.79296875, + "learning_rate": 4.873509436010444e-05, + "loss": 1.0116, + "step": 1410 + }, + { + "epoch": 0.10382156817659231, + "grad_norm": 1.0234375, + "learning_rate": 4.873327482612723e-05, + "loss": 0.9322, + "step": 1411 + }, + { + "epoch": 0.10389514830995632, + "grad_norm": 0.953125, + "learning_rate": 4.873145401842999e-05, + "loss": 1.0501, + "step": 1412 + }, + { + "epoch": 0.10396872844332031, + "grad_norm": 0.79296875, + "learning_rate": 4.872963193711042e-05, + "loss": 0.9781, + "step": 1413 + }, + { + "epoch": 0.1040423085766843, + "grad_norm": 0.91796875, + "learning_rate": 4.872780858226634e-05, + "loss": 0.9769, + "step": 1414 + }, + { + "epoch": 0.10411588871004829, + "grad_norm": 0.97265625, + "learning_rate": 4.872598395399558e-05, + "loss": 0.9307, + "step": 1415 + }, + { + "epoch": 0.10418946884341228, + "grad_norm": 1.1015625, + "learning_rate": 4.872415805239607e-05, + "loss": 0.9257, + "step": 1416 + }, + { + "epoch": 0.10426304897677627, + "grad_norm": 1.1015625, + "learning_rate": 4.872233087756582e-05, + "loss": 1.0647, + "step": 1417 + }, + { + "epoch": 0.10433662911014026, + "grad_norm": 0.921875, + "learning_rate": 4.872050242960287e-05, + "loss": 1.3918, + "step": 1418 + }, + { + "epoch": 0.10441020924350425, + "grad_norm": 1.0234375, + "learning_rate": 4.871867270860535e-05, + "loss": 0.8001, + "step": 1419 + }, + { + "epoch": 0.10448378937686825, + "grad_norm": 0.71875, + "learning_rate": 4.8716841714671465e-05, + "loss": 0.587, + "step": 1420 + }, + { + "epoch": 0.10455736951023224, + "grad_norm": 0.77734375, + "learning_rate": 4.871500944789949e-05, + "loss": 0.7017, + "step": 1421 + }, + { + "epoch": 0.10463094964359623, + "grad_norm": 0.859375, + "learning_rate": 4.871317590838774e-05, + "loss": 0.853, + "step": 1422 + }, + { + "epoch": 0.10470452977696022, + "grad_norm": 1.03125, + "learning_rate": 4.871134109623463e-05, + "loss": 1.0274, + "step": 1423 + }, + { + "epoch": 0.10477810991032421, + "grad_norm": 1.078125, + "learning_rate": 4.8709505011538634e-05, + "loss": 0.9413, + "step": 1424 + }, + { + "epoch": 0.1048516900436882, + "grad_norm": 0.7109375, + "learning_rate": 4.870766765439827e-05, + "loss": 0.7312, + "step": 1425 + }, + { + "epoch": 0.1049252701770522, + "grad_norm": 1.0234375, + "learning_rate": 4.870582902491218e-05, + "loss": 0.8642, + "step": 1426 + }, + { + "epoch": 0.10499885031041618, + "grad_norm": 0.734375, + "learning_rate": 4.8703989123179004e-05, + "loss": 0.7064, + "step": 1427 + }, + { + "epoch": 0.10507243044378017, + "grad_norm": 0.9453125, + "learning_rate": 4.87021479492975e-05, + "loss": 1.2552, + "step": 1428 + }, + { + "epoch": 0.10514601057714418, + "grad_norm": 1.0625, + "learning_rate": 4.87003055033665e-05, + "loss": 1.1853, + "step": 1429 + }, + { + "epoch": 0.10521959071050817, + "grad_norm": 0.953125, + "learning_rate": 4.869846178548486e-05, + "loss": 1.0046, + "step": 1430 + }, + { + "epoch": 0.10529317084387216, + "grad_norm": 1.0625, + "learning_rate": 4.869661679575153e-05, + "loss": 0.7922, + "step": 1431 + }, + { + "epoch": 0.10536675097723615, + "grad_norm": 0.80078125, + "learning_rate": 4.8694770534265535e-05, + "loss": 0.9383, + "step": 1432 + }, + { + "epoch": 0.10544033111060014, + "grad_norm": 0.9140625, + "learning_rate": 4.8692923001125956e-05, + "loss": 1.0083, + "step": 1433 + }, + { + "epoch": 0.10551391124396413, + "grad_norm": 1.0546875, + "learning_rate": 4.8691074196431956e-05, + "loss": 1.2046, + "step": 1434 + }, + { + "epoch": 0.10558749137732812, + "grad_norm": 0.9140625, + "learning_rate": 4.868922412028275e-05, + "loss": 0.8876, + "step": 1435 + }, + { + "epoch": 0.10566107151069211, + "grad_norm": 0.91796875, + "learning_rate": 4.868737277277762e-05, + "loss": 0.9315, + "step": 1436 + }, + { + "epoch": 0.1057346516440561, + "grad_norm": 0.91015625, + "learning_rate": 4.868552015401594e-05, + "loss": 0.8065, + "step": 1437 + }, + { + "epoch": 0.1058082317774201, + "grad_norm": 0.8046875, + "learning_rate": 4.868366626409713e-05, + "loss": 0.7333, + "step": 1438 + }, + { + "epoch": 0.1058818119107841, + "grad_norm": 1.015625, + "learning_rate": 4.868181110312068e-05, + "loss": 1.2215, + "step": 1439 + }, + { + "epoch": 0.10595539204414808, + "grad_norm": 0.90625, + "learning_rate": 4.867995467118616e-05, + "loss": 1.3123, + "step": 1440 + }, + { + "epoch": 0.10602897217751207, + "grad_norm": 1.0703125, + "learning_rate": 4.867809696839319e-05, + "loss": 1.0686, + "step": 1441 + }, + { + "epoch": 0.10610255231087606, + "grad_norm": 0.984375, + "learning_rate": 4.867623799484148e-05, + "loss": 1.177, + "step": 1442 + }, + { + "epoch": 0.10617613244424005, + "grad_norm": 0.76171875, + "learning_rate": 4.867437775063079e-05, + "loss": 0.6957, + "step": 1443 + }, + { + "epoch": 0.10624971257760404, + "grad_norm": 0.8046875, + "learning_rate": 4.8672516235860975e-05, + "loss": 0.8737, + "step": 1444 + }, + { + "epoch": 0.10632329271096803, + "grad_norm": 0.91796875, + "learning_rate": 4.867065345063192e-05, + "loss": 1.1496, + "step": 1445 + }, + { + "epoch": 0.10639687284433202, + "grad_norm": 0.79296875, + "learning_rate": 4.86687893950436e-05, + "loss": 1.2304, + "step": 1446 + }, + { + "epoch": 0.10647045297769603, + "grad_norm": 1.0546875, + "learning_rate": 4.866692406919605e-05, + "loss": 1.3171, + "step": 1447 + }, + { + "epoch": 0.10654403311106002, + "grad_norm": 0.8984375, + "learning_rate": 4.866505747318939e-05, + "loss": 1.1928, + "step": 1448 + }, + { + "epoch": 0.10661761324442401, + "grad_norm": 0.8671875, + "learning_rate": 4.8663189607123796e-05, + "loss": 1.0115, + "step": 1449 + }, + { + "epoch": 0.106691193377788, + "grad_norm": 0.90234375, + "learning_rate": 4.866132047109951e-05, + "loss": 0.7637, + "step": 1450 + }, + { + "epoch": 0.10676477351115199, + "grad_norm": 0.88671875, + "learning_rate": 4.865945006521684e-05, + "loss": 0.7481, + "step": 1451 + }, + { + "epoch": 0.10683835364451598, + "grad_norm": 0.78515625, + "learning_rate": 4.8657578389576175e-05, + "loss": 0.8649, + "step": 1452 + }, + { + "epoch": 0.10691193377787997, + "grad_norm": 0.9609375, + "learning_rate": 4.865570544427795e-05, + "loss": 1.1992, + "step": 1453 + }, + { + "epoch": 0.10698551391124396, + "grad_norm": 0.828125, + "learning_rate": 4.8653831229422705e-05, + "loss": 0.7719, + "step": 1454 + }, + { + "epoch": 0.10705909404460795, + "grad_norm": 1.0703125, + "learning_rate": 4.865195574511101e-05, + "loss": 1.3225, + "step": 1455 + }, + { + "epoch": 0.10713267417797195, + "grad_norm": 1.0234375, + "learning_rate": 4.8650078991443523e-05, + "loss": 0.9803, + "step": 1456 + }, + { + "epoch": 0.10720625431133594, + "grad_norm": 1.046875, + "learning_rate": 4.8648200968520965e-05, + "loss": 1.5307, + "step": 1457 + }, + { + "epoch": 0.10727983444469993, + "grad_norm": 1.125, + "learning_rate": 4.864632167644413e-05, + "loss": 1.355, + "step": 1458 + }, + { + "epoch": 0.10735341457806392, + "grad_norm": 0.96875, + "learning_rate": 4.864444111531386e-05, + "loss": 1.348, + "step": 1459 + }, + { + "epoch": 0.10742699471142791, + "grad_norm": 0.97265625, + "learning_rate": 4.86425592852311e-05, + "loss": 1.061, + "step": 1460 + }, + { + "epoch": 0.1075005748447919, + "grad_norm": 0.92578125, + "learning_rate": 4.8640676186296844e-05, + "loss": 1.1889, + "step": 1461 + }, + { + "epoch": 0.1075741549781559, + "grad_norm": 0.921875, + "learning_rate": 4.863879181861215e-05, + "loss": 1.0199, + "step": 1462 + }, + { + "epoch": 0.10764773511151988, + "grad_norm": 1.0859375, + "learning_rate": 4.8636906182278134e-05, + "loss": 1.0298, + "step": 1463 + }, + { + "epoch": 0.10772131524488388, + "grad_norm": 1.203125, + "learning_rate": 4.863501927739601e-05, + "loss": 1.0112, + "step": 1464 + }, + { + "epoch": 0.10779489537824788, + "grad_norm": 1.0390625, + "learning_rate": 4.863313110406704e-05, + "loss": 1.2276, + "step": 1465 + }, + { + "epoch": 0.10786847551161187, + "grad_norm": 0.8046875, + "learning_rate": 4.863124166239257e-05, + "loss": 0.9624, + "step": 1466 + }, + { + "epoch": 0.10794205564497586, + "grad_norm": 1.0546875, + "learning_rate": 4.862935095247398e-05, + "loss": 1.678, + "step": 1467 + }, + { + "epoch": 0.10801563577833985, + "grad_norm": 0.80078125, + "learning_rate": 4.862745897441277e-05, + "loss": 0.8602, + "step": 1468 + }, + { + "epoch": 0.10808921591170384, + "grad_norm": 0.8515625, + "learning_rate": 4.862556572831045e-05, + "loss": 0.8751, + "step": 1469 + }, + { + "epoch": 0.10816279604506783, + "grad_norm": 0.9375, + "learning_rate": 4.862367121426865e-05, + "loss": 0.8047, + "step": 1470 + }, + { + "epoch": 0.10823637617843182, + "grad_norm": 0.9296875, + "learning_rate": 4.862177543238903e-05, + "loss": 0.9325, + "step": 1471 + }, + { + "epoch": 0.10830995631179581, + "grad_norm": 1.1953125, + "learning_rate": 4.861987838277333e-05, + "loss": 1.3588, + "step": 1472 + }, + { + "epoch": 0.1083835364451598, + "grad_norm": 1.109375, + "learning_rate": 4.861798006552338e-05, + "loss": 1.3279, + "step": 1473 + }, + { + "epoch": 0.1084571165785238, + "grad_norm": 0.94140625, + "learning_rate": 4.861608048074104e-05, + "loss": 0.7374, + "step": 1474 + }, + { + "epoch": 0.1085306967118878, + "grad_norm": 1.34375, + "learning_rate": 4.8614179628528265e-05, + "loss": 1.157, + "step": 1475 + }, + { + "epoch": 0.10860427684525178, + "grad_norm": 0.859375, + "learning_rate": 4.861227750898708e-05, + "loss": 0.8502, + "step": 1476 + }, + { + "epoch": 0.10867785697861577, + "grad_norm": 0.88671875, + "learning_rate": 4.861037412221955e-05, + "loss": 0.9861, + "step": 1477 + }, + { + "epoch": 0.10875143711197977, + "grad_norm": 0.7578125, + "learning_rate": 4.860846946832783e-05, + "loss": 0.9201, + "step": 1478 + }, + { + "epoch": 0.10882501724534376, + "grad_norm": 0.97265625, + "learning_rate": 4.860656354741415e-05, + "loss": 1.1797, + "step": 1479 + }, + { + "epoch": 0.10889859737870775, + "grad_norm": 1.1953125, + "learning_rate": 4.860465635958079e-05, + "loss": 1.3831, + "step": 1480 + }, + { + "epoch": 0.10897217751207174, + "grad_norm": 1.296875, + "learning_rate": 4.860274790493011e-05, + "loss": 1.5445, + "step": 1481 + }, + { + "epoch": 0.10904575764543573, + "grad_norm": 1.046875, + "learning_rate": 4.860083818356452e-05, + "loss": 0.8569, + "step": 1482 + }, + { + "epoch": 0.10911933777879973, + "grad_norm": 0.83203125, + "learning_rate": 4.859892719558653e-05, + "loss": 1.044, + "step": 1483 + }, + { + "epoch": 0.10919291791216372, + "grad_norm": 0.6953125, + "learning_rate": 4.859701494109868e-05, + "loss": 0.5431, + "step": 1484 + }, + { + "epoch": 0.10926649804552771, + "grad_norm": 0.7265625, + "learning_rate": 4.859510142020362e-05, + "loss": 0.6719, + "step": 1485 + }, + { + "epoch": 0.1093400781788917, + "grad_norm": 0.76953125, + "learning_rate": 4.859318663300402e-05, + "loss": 0.8481, + "step": 1486 + }, + { + "epoch": 0.10941365831225569, + "grad_norm": 0.75, + "learning_rate": 4.859127057960266e-05, + "loss": 0.7634, + "step": 1487 + }, + { + "epoch": 0.10948723844561968, + "grad_norm": 1.171875, + "learning_rate": 4.858935326010237e-05, + "loss": 1.5947, + "step": 1488 + }, + { + "epoch": 0.10956081857898367, + "grad_norm": 0.84765625, + "learning_rate": 4.8587434674606036e-05, + "loss": 0.7334, + "step": 1489 + }, + { + "epoch": 0.10963439871234766, + "grad_norm": 0.9765625, + "learning_rate": 4.858551482321663e-05, + "loss": 1.0087, + "step": 1490 + }, + { + "epoch": 0.10970797884571165, + "grad_norm": 0.96875, + "learning_rate": 4.85835937060372e-05, + "loss": 1.0311, + "step": 1491 + }, + { + "epoch": 0.10978155897907566, + "grad_norm": 0.99609375, + "learning_rate": 4.858167132317083e-05, + "loss": 0.7996, + "step": 1492 + }, + { + "epoch": 0.10985513911243965, + "grad_norm": 0.7734375, + "learning_rate": 4.857974767472071e-05, + "loss": 0.6179, + "step": 1493 + }, + { + "epoch": 0.10992871924580364, + "grad_norm": 0.99609375, + "learning_rate": 4.857782276079006e-05, + "loss": 1.0871, + "step": 1494 + }, + { + "epoch": 0.11000229937916763, + "grad_norm": 0.91796875, + "learning_rate": 4.8575896581482195e-05, + "loss": 0.9696, + "step": 1495 + }, + { + "epoch": 0.11007587951253162, + "grad_norm": 0.82421875, + "learning_rate": 4.8573969136900495e-05, + "loss": 0.7804, + "step": 1496 + }, + { + "epoch": 0.1101494596458956, + "grad_norm": 0.87890625, + "learning_rate": 4.857204042714839e-05, + "loss": 0.8615, + "step": 1497 + }, + { + "epoch": 0.1102230397792596, + "grad_norm": 0.8203125, + "learning_rate": 4.8570110452329395e-05, + "loss": 1.0255, + "step": 1498 + }, + { + "epoch": 0.11029661991262359, + "grad_norm": 0.8203125, + "learning_rate": 4.85681792125471e-05, + "loss": 0.7892, + "step": 1499 + }, + { + "epoch": 0.11037020004598758, + "grad_norm": 0.72265625, + "learning_rate": 4.856624670790513e-05, + "loss": 0.7113, + "step": 1500 + }, + { + "epoch": 0.11044378017935158, + "grad_norm": 0.875, + "learning_rate": 4.856431293850722e-05, + "loss": 0.9847, + "step": 1501 + }, + { + "epoch": 0.11051736031271557, + "grad_norm": 0.8359375, + "learning_rate": 4.856237790445714e-05, + "loss": 0.6973, + "step": 1502 + }, + { + "epoch": 0.11059094044607956, + "grad_norm": 0.84375, + "learning_rate": 4.856044160585872e-05, + "loss": 0.7668, + "step": 1503 + }, + { + "epoch": 0.11066452057944355, + "grad_norm": 0.90234375, + "learning_rate": 4.855850404281592e-05, + "loss": 0.6934, + "step": 1504 + }, + { + "epoch": 0.11073810071280754, + "grad_norm": 0.765625, + "learning_rate": 4.85565652154327e-05, + "loss": 0.8594, + "step": 1505 + }, + { + "epoch": 0.11081168084617153, + "grad_norm": 0.76171875, + "learning_rate": 4.8554625123813116e-05, + "loss": 0.8429, + "step": 1506 + }, + { + "epoch": 0.11088526097953552, + "grad_norm": 0.890625, + "learning_rate": 4.855268376806129e-05, + "loss": 0.9555, + "step": 1507 + }, + { + "epoch": 0.11095884111289951, + "grad_norm": 1.046875, + "learning_rate": 4.8550741148281406e-05, + "loss": 1.3677, + "step": 1508 + }, + { + "epoch": 0.11103242124626352, + "grad_norm": 0.91796875, + "learning_rate": 4.854879726457773e-05, + "loss": 1.0277, + "step": 1509 + }, + { + "epoch": 0.1111060013796275, + "grad_norm": 0.90625, + "learning_rate": 4.854685211705458e-05, + "loss": 0.9829, + "step": 1510 + }, + { + "epoch": 0.1111795815129915, + "grad_norm": 1.1171875, + "learning_rate": 4.854490570581635e-05, + "loss": 1.2195, + "step": 1511 + }, + { + "epoch": 0.11125316164635549, + "grad_norm": 0.8125, + "learning_rate": 4.854295803096751e-05, + "loss": 0.6009, + "step": 1512 + }, + { + "epoch": 0.11132674177971948, + "grad_norm": 0.9453125, + "learning_rate": 4.854100909261256e-05, + "loss": 1.1862, + "step": 1513 + }, + { + "epoch": 0.11140032191308347, + "grad_norm": 0.97265625, + "learning_rate": 4.853905889085613e-05, + "loss": 1.0422, + "step": 1514 + }, + { + "epoch": 0.11147390204644746, + "grad_norm": 0.73046875, + "learning_rate": 4.8537107425802865e-05, + "loss": 0.7051, + "step": 1515 + }, + { + "epoch": 0.11154748217981145, + "grad_norm": 0.73828125, + "learning_rate": 4.85351546975575e-05, + "loss": 0.6874, + "step": 1516 + }, + { + "epoch": 0.11162106231317544, + "grad_norm": 0.8046875, + "learning_rate": 4.853320070622484e-05, + "loss": 1.0476, + "step": 1517 + }, + { + "epoch": 0.11169464244653944, + "grad_norm": 0.74609375, + "learning_rate": 4.853124545190973e-05, + "loss": 0.7274, + "step": 1518 + }, + { + "epoch": 0.11176822257990343, + "grad_norm": 0.83203125, + "learning_rate": 4.852928893471713e-05, + "loss": 0.8882, + "step": 1519 + }, + { + "epoch": 0.11184180271326742, + "grad_norm": 0.9296875, + "learning_rate": 4.8527331154752035e-05, + "loss": 1.0539, + "step": 1520 + }, + { + "epoch": 0.11191538284663141, + "grad_norm": 0.8984375, + "learning_rate": 4.852537211211951e-05, + "loss": 0.9637, + "step": 1521 + }, + { + "epoch": 0.1119889629799954, + "grad_norm": 1.015625, + "learning_rate": 4.8523411806924704e-05, + "loss": 1.4332, + "step": 1522 + }, + { + "epoch": 0.11206254311335939, + "grad_norm": 0.84375, + "learning_rate": 4.852145023927281e-05, + "loss": 0.7953, + "step": 1523 + }, + { + "epoch": 0.11213612324672338, + "grad_norm": 0.81640625, + "learning_rate": 4.85194874092691e-05, + "loss": 0.8669, + "step": 1524 + }, + { + "epoch": 0.11220970338008737, + "grad_norm": 0.84765625, + "learning_rate": 4.851752331701894e-05, + "loss": 0.9211, + "step": 1525 + }, + { + "epoch": 0.11228328351345136, + "grad_norm": 1.1015625, + "learning_rate": 4.851555796262771e-05, + "loss": 1.3201, + "step": 1526 + }, + { + "epoch": 0.11235686364681537, + "grad_norm": 0.86328125, + "learning_rate": 4.85135913462009e-05, + "loss": 0.8635, + "step": 1527 + }, + { + "epoch": 0.11243044378017936, + "grad_norm": 1.09375, + "learning_rate": 4.851162346784406e-05, + "loss": 0.9325, + "step": 1528 + }, + { + "epoch": 0.11250402391354335, + "grad_norm": 0.8828125, + "learning_rate": 4.850965432766279e-05, + "loss": 1.066, + "step": 1529 + }, + { + "epoch": 0.11257760404690734, + "grad_norm": 1.1953125, + "learning_rate": 4.850768392576277e-05, + "loss": 1.2734, + "step": 1530 + }, + { + "epoch": 0.11265118418027133, + "grad_norm": 0.89453125, + "learning_rate": 4.850571226224976e-05, + "loss": 0.8032, + "step": 1531 + }, + { + "epoch": 0.11272476431363532, + "grad_norm": 0.80078125, + "learning_rate": 4.850373933722957e-05, + "loss": 0.7884, + "step": 1532 + }, + { + "epoch": 0.11279834444699931, + "grad_norm": 0.921875, + "learning_rate": 4.8501765150808085e-05, + "loss": 0.8609, + "step": 1533 + }, + { + "epoch": 0.1128719245803633, + "grad_norm": 0.75390625, + "learning_rate": 4.849978970309125e-05, + "loss": 0.841, + "step": 1534 + }, + { + "epoch": 0.11294550471372729, + "grad_norm": 0.77734375, + "learning_rate": 4.8497812994185075e-05, + "loss": 0.8201, + "step": 1535 + }, + { + "epoch": 0.11301908484709129, + "grad_norm": 0.7421875, + "learning_rate": 4.8495835024195665e-05, + "loss": 0.8334, + "step": 1536 + }, + { + "epoch": 0.11309266498045528, + "grad_norm": 0.86328125, + "learning_rate": 4.8493855793229174e-05, + "loss": 1.2785, + "step": 1537 + }, + { + "epoch": 0.11316624511381927, + "grad_norm": 0.80859375, + "learning_rate": 4.8491875301391806e-05, + "loss": 0.6987, + "step": 1538 + }, + { + "epoch": 0.11323982524718326, + "grad_norm": 0.98046875, + "learning_rate": 4.8489893548789874e-05, + "loss": 1.2081, + "step": 1539 + }, + { + "epoch": 0.11331340538054725, + "grad_norm": 0.8359375, + "learning_rate": 4.848791053552971e-05, + "loss": 0.9546, + "step": 1540 + }, + { + "epoch": 0.11338698551391124, + "grad_norm": 0.80859375, + "learning_rate": 4.848592626171775e-05, + "loss": 0.7666, + "step": 1541 + }, + { + "epoch": 0.11346056564727523, + "grad_norm": 0.9453125, + "learning_rate": 4.84839407274605e-05, + "loss": 0.8671, + "step": 1542 + }, + { + "epoch": 0.11353414578063922, + "grad_norm": 1.015625, + "learning_rate": 4.848195393286449e-05, + "loss": 1.3558, + "step": 1543 + }, + { + "epoch": 0.11360772591400321, + "grad_norm": 0.84375, + "learning_rate": 4.8479965878036373e-05, + "loss": 0.8777, + "step": 1544 + }, + { + "epoch": 0.11368130604736722, + "grad_norm": 1.0546875, + "learning_rate": 4.8477976563082824e-05, + "loss": 0.9407, + "step": 1545 + }, + { + "epoch": 0.11375488618073121, + "grad_norm": 0.9609375, + "learning_rate": 4.847598598811062e-05, + "loss": 1.3234, + "step": 1546 + }, + { + "epoch": 0.1138284663140952, + "grad_norm": 0.953125, + "learning_rate": 4.8473994153226594e-05, + "loss": 1.0208, + "step": 1547 + }, + { + "epoch": 0.11390204644745919, + "grad_norm": 0.84375, + "learning_rate": 4.847200105853763e-05, + "loss": 1.0776, + "step": 1548 + }, + { + "epoch": 0.11397562658082318, + "grad_norm": 0.9375, + "learning_rate": 4.8470006704150705e-05, + "loss": 1.1107, + "step": 1549 + }, + { + "epoch": 0.11404920671418717, + "grad_norm": 1.0546875, + "learning_rate": 4.846801109017285e-05, + "loss": 0.8734, + "step": 1550 + }, + { + "epoch": 0.11412278684755116, + "grad_norm": 1.046875, + "learning_rate": 4.846601421671116e-05, + "loss": 1.2276, + "step": 1551 + }, + { + "epoch": 0.11419636698091515, + "grad_norm": 0.85546875, + "learning_rate": 4.846401608387282e-05, + "loss": 1.2157, + "step": 1552 + }, + { + "epoch": 0.11426994711427914, + "grad_norm": 0.8984375, + "learning_rate": 4.846201669176505e-05, + "loss": 0.8714, + "step": 1553 + }, + { + "epoch": 0.11434352724764314, + "grad_norm": 1.3828125, + "learning_rate": 4.846001604049515e-05, + "loss": 1.3024, + "step": 1554 + }, + { + "epoch": 0.11441710738100713, + "grad_norm": 1.5546875, + "learning_rate": 4.8458014130170506e-05, + "loss": 0.8927, + "step": 1555 + }, + { + "epoch": 0.11449068751437112, + "grad_norm": 0.91015625, + "learning_rate": 4.845601096089855e-05, + "loss": 1.1201, + "step": 1556 + }, + { + "epoch": 0.11456426764773511, + "grad_norm": 0.9140625, + "learning_rate": 4.845400653278678e-05, + "loss": 1.013, + "step": 1557 + }, + { + "epoch": 0.1146378477810991, + "grad_norm": 0.8046875, + "learning_rate": 4.845200084594278e-05, + "loss": 0.7813, + "step": 1558 + }, + { + "epoch": 0.1147114279144631, + "grad_norm": 1.1328125, + "learning_rate": 4.8449993900474187e-05, + "loss": 1.1068, + "step": 1559 + }, + { + "epoch": 0.11478500804782708, + "grad_norm": 1.0, + "learning_rate": 4.844798569648872e-05, + "loss": 1.1226, + "step": 1560 + }, + { + "epoch": 0.11485858818119107, + "grad_norm": 0.9296875, + "learning_rate": 4.844597623409414e-05, + "loss": 0.816, + "step": 1561 + }, + { + "epoch": 0.11493216831455506, + "grad_norm": 0.9609375, + "learning_rate": 4.84439655133983e-05, + "loss": 0.908, + "step": 1562 + }, + { + "epoch": 0.11500574844791907, + "grad_norm": 0.91015625, + "learning_rate": 4.8441953534509116e-05, + "loss": 1.1246, + "step": 1563 + }, + { + "epoch": 0.11507932858128306, + "grad_norm": 1.140625, + "learning_rate": 4.843994029753456e-05, + "loss": 1.2163, + "step": 1564 + }, + { + "epoch": 0.11515290871464705, + "grad_norm": 0.85546875, + "learning_rate": 4.843792580258267e-05, + "loss": 1.0191, + "step": 1565 + }, + { + "epoch": 0.11522648884801104, + "grad_norm": 1.0078125, + "learning_rate": 4.843591004976158e-05, + "loss": 0.8801, + "step": 1566 + }, + { + "epoch": 0.11530006898137503, + "grad_norm": 1.1875, + "learning_rate": 4.843389303917946e-05, + "loss": 1.2238, + "step": 1567 + }, + { + "epoch": 0.11537364911473902, + "grad_norm": 0.921875, + "learning_rate": 4.843187477094456e-05, + "loss": 0.8479, + "step": 1568 + }, + { + "epoch": 0.11544722924810301, + "grad_norm": 0.94921875, + "learning_rate": 4.84298552451652e-05, + "loss": 1.1632, + "step": 1569 + }, + { + "epoch": 0.115520809381467, + "grad_norm": 1.0703125, + "learning_rate": 4.8427834461949764e-05, + "loss": 1.2675, + "step": 1570 + }, + { + "epoch": 0.11559438951483099, + "grad_norm": 0.70703125, + "learning_rate": 4.842581242140669e-05, + "loss": 0.6874, + "step": 1571 + }, + { + "epoch": 0.115667969648195, + "grad_norm": 1.0234375, + "learning_rate": 4.842378912364452e-05, + "loss": 1.2296, + "step": 1572 + }, + { + "epoch": 0.11574154978155898, + "grad_norm": 1.03125, + "learning_rate": 4.842176456877182e-05, + "loss": 1.1198, + "step": 1573 + }, + { + "epoch": 0.11581512991492297, + "grad_norm": 1.5, + "learning_rate": 4.841973875689726e-05, + "loss": 0.8049, + "step": 1574 + }, + { + "epoch": 0.11588871004828696, + "grad_norm": 1.0390625, + "learning_rate": 4.841771168812955e-05, + "loss": 0.8588, + "step": 1575 + }, + { + "epoch": 0.11596229018165095, + "grad_norm": 0.85546875, + "learning_rate": 4.841568336257749e-05, + "loss": 0.7734, + "step": 1576 + }, + { + "epoch": 0.11603587031501494, + "grad_norm": 1.0859375, + "learning_rate": 4.8413653780349925e-05, + "loss": 0.997, + "step": 1577 + }, + { + "epoch": 0.11610945044837893, + "grad_norm": 0.875, + "learning_rate": 4.841162294155578e-05, + "loss": 0.8062, + "step": 1578 + }, + { + "epoch": 0.11618303058174292, + "grad_norm": 1.015625, + "learning_rate": 4.840959084630406e-05, + "loss": 0.8249, + "step": 1579 + }, + { + "epoch": 0.11625661071510691, + "grad_norm": 0.80078125, + "learning_rate": 4.84075574947038e-05, + "loss": 0.7494, + "step": 1580 + }, + { + "epoch": 0.11633019084847092, + "grad_norm": 0.703125, + "learning_rate": 4.840552288686415e-05, + "loss": 0.6936, + "step": 1581 + }, + { + "epoch": 0.11640377098183491, + "grad_norm": 0.80078125, + "learning_rate": 4.8403487022894294e-05, + "loss": 0.7006, + "step": 1582 + }, + { + "epoch": 0.1164773511151989, + "grad_norm": 0.79296875, + "learning_rate": 4.840144990290349e-05, + "loss": 0.9448, + "step": 1583 + }, + { + "epoch": 0.11655093124856289, + "grad_norm": 0.765625, + "learning_rate": 4.839941152700107e-05, + "loss": 1.0342, + "step": 1584 + }, + { + "epoch": 0.11662451138192688, + "grad_norm": 0.87890625, + "learning_rate": 4.839737189529643e-05, + "loss": 0.959, + "step": 1585 + }, + { + "epoch": 0.11669809151529087, + "grad_norm": 1.0546875, + "learning_rate": 4.839533100789903e-05, + "loss": 0.8676, + "step": 1586 + }, + { + "epoch": 0.11677167164865486, + "grad_norm": 0.94921875, + "learning_rate": 4.83932888649184e-05, + "loss": 0.7941, + "step": 1587 + }, + { + "epoch": 0.11684525178201885, + "grad_norm": 0.96875, + "learning_rate": 4.839124546646414e-05, + "loss": 0.6636, + "step": 1588 + }, + { + "epoch": 0.11691883191538284, + "grad_norm": 1.0390625, + "learning_rate": 4.838920081264592e-05, + "loss": 1.021, + "step": 1589 + }, + { + "epoch": 0.11699241204874684, + "grad_norm": 0.9765625, + "learning_rate": 4.8387154903573464e-05, + "loss": 1.5042, + "step": 1590 + }, + { + "epoch": 0.11706599218211083, + "grad_norm": 1.03125, + "learning_rate": 4.838510773935657e-05, + "loss": 1.0714, + "step": 1591 + }, + { + "epoch": 0.11713957231547482, + "grad_norm": 0.76953125, + "learning_rate": 4.8383059320105124e-05, + "loss": 0.6976, + "step": 1592 + }, + { + "epoch": 0.11721315244883881, + "grad_norm": 0.8359375, + "learning_rate": 4.8381009645929044e-05, + "loss": 0.8119, + "step": 1593 + }, + { + "epoch": 0.1172867325822028, + "grad_norm": 0.71875, + "learning_rate": 4.837895871693834e-05, + "loss": 0.7397, + "step": 1594 + }, + { + "epoch": 0.1173603127155668, + "grad_norm": 1.0234375, + "learning_rate": 4.837690653324307e-05, + "loss": 1.279, + "step": 1595 + }, + { + "epoch": 0.11743389284893078, + "grad_norm": 0.90234375, + "learning_rate": 4.8374853094953384e-05, + "loss": 0.7486, + "step": 1596 + }, + { + "epoch": 0.11750747298229477, + "grad_norm": 0.6796875, + "learning_rate": 4.837279840217948e-05, + "loss": 0.7038, + "step": 1597 + }, + { + "epoch": 0.11758105311565878, + "grad_norm": 0.74609375, + "learning_rate": 4.837074245503164e-05, + "loss": 0.9153, + "step": 1598 + }, + { + "epoch": 0.11765463324902277, + "grad_norm": 0.8984375, + "learning_rate": 4.836868525362018e-05, + "loss": 1.0461, + "step": 1599 + }, + { + "epoch": 0.11772821338238676, + "grad_norm": 0.7578125, + "learning_rate": 4.836662679805553e-05, + "loss": 0.798, + "step": 1600 + }, + { + "epoch": 0.11780179351575075, + "grad_norm": 0.8828125, + "learning_rate": 4.836456708844814e-05, + "loss": 1.1684, + "step": 1601 + }, + { + "epoch": 0.11787537364911474, + "grad_norm": 0.75390625, + "learning_rate": 4.8362506124908574e-05, + "loss": 0.8785, + "step": 1602 + }, + { + "epoch": 0.11794895378247873, + "grad_norm": 1.0, + "learning_rate": 4.8360443907547423e-05, + "loss": 1.3117, + "step": 1603 + }, + { + "epoch": 0.11802253391584272, + "grad_norm": 0.9453125, + "learning_rate": 4.835838043647538e-05, + "loss": 0.9618, + "step": 1604 + }, + { + "epoch": 0.11809611404920671, + "grad_norm": 0.86328125, + "learning_rate": 4.8356315711803166e-05, + "loss": 0.9472, + "step": 1605 + }, + { + "epoch": 0.1181696941825707, + "grad_norm": 0.7734375, + "learning_rate": 4.835424973364161e-05, + "loss": 0.805, + "step": 1606 + }, + { + "epoch": 0.1182432743159347, + "grad_norm": 0.8828125, + "learning_rate": 4.835218250210157e-05, + "loss": 0.8101, + "step": 1607 + }, + { + "epoch": 0.1183168544492987, + "grad_norm": 1.1953125, + "learning_rate": 4.8350114017294015e-05, + "loss": 1.0567, + "step": 1608 + }, + { + "epoch": 0.11839043458266268, + "grad_norm": 0.91796875, + "learning_rate": 4.834804427932993e-05, + "loss": 0.9547, + "step": 1609 + }, + { + "epoch": 0.11846401471602667, + "grad_norm": 0.71484375, + "learning_rate": 4.8345973288320414e-05, + "loss": 0.9241, + "step": 1610 + }, + { + "epoch": 0.11853759484939067, + "grad_norm": 0.9375, + "learning_rate": 4.8343901044376604e-05, + "loss": 1.2618, + "step": 1611 + }, + { + "epoch": 0.11861117498275466, + "grad_norm": 1.078125, + "learning_rate": 4.834182754760972e-05, + "loss": 1.0232, + "step": 1612 + }, + { + "epoch": 0.11868475511611865, + "grad_norm": 0.98828125, + "learning_rate": 4.8339752798131034e-05, + "loss": 1.1661, + "step": 1613 + }, + { + "epoch": 0.11875833524948264, + "grad_norm": 0.75390625, + "learning_rate": 4.8337676796051895e-05, + "loss": 0.6603, + "step": 1614 + }, + { + "epoch": 0.11883191538284663, + "grad_norm": 0.89453125, + "learning_rate": 4.8335599541483724e-05, + "loss": 0.8218, + "step": 1615 + }, + { + "epoch": 0.11890549551621063, + "grad_norm": 0.77734375, + "learning_rate": 4.8333521034538e-05, + "loss": 0.8218, + "step": 1616 + }, + { + "epoch": 0.11897907564957462, + "grad_norm": 0.7421875, + "learning_rate": 4.833144127532627e-05, + "loss": 0.8746, + "step": 1617 + }, + { + "epoch": 0.11905265578293861, + "grad_norm": 1.03125, + "learning_rate": 4.8329360263960155e-05, + "loss": 1.3788, + "step": 1618 + }, + { + "epoch": 0.1191262359163026, + "grad_norm": 0.6953125, + "learning_rate": 4.832727800055134e-05, + "loss": 0.7577, + "step": 1619 + }, + { + "epoch": 0.11919981604966659, + "grad_norm": 0.88671875, + "learning_rate": 4.8325194485211575e-05, + "loss": 1.0625, + "step": 1620 + }, + { + "epoch": 0.11927339618303058, + "grad_norm": 0.8828125, + "learning_rate": 4.8323109718052675e-05, + "loss": 1.0207, + "step": 1621 + }, + { + "epoch": 0.11934697631639457, + "grad_norm": 0.85546875, + "learning_rate": 4.832102369918652e-05, + "loss": 1.2307, + "step": 1622 + }, + { + "epoch": 0.11942055644975856, + "grad_norm": 0.83203125, + "learning_rate": 4.8318936428725083e-05, + "loss": 0.7842, + "step": 1623 + }, + { + "epoch": 0.11949413658312255, + "grad_norm": 0.99609375, + "learning_rate": 4.831684790678036e-05, + "loss": 1.0629, + "step": 1624 + }, + { + "epoch": 0.11956771671648656, + "grad_norm": 0.83203125, + "learning_rate": 4.831475813346445e-05, + "loss": 0.7109, + "step": 1625 + }, + { + "epoch": 0.11964129684985055, + "grad_norm": 0.71875, + "learning_rate": 4.831266710888952e-05, + "loss": 0.6695, + "step": 1626 + }, + { + "epoch": 0.11971487698321454, + "grad_norm": 0.78125, + "learning_rate": 4.8310574833167763e-05, + "loss": 1.0373, + "step": 1627 + }, + { + "epoch": 0.11978845711657853, + "grad_norm": 0.703125, + "learning_rate": 4.8308481306411493e-05, + "loss": 0.4827, + "step": 1628 + }, + { + "epoch": 0.11986203724994252, + "grad_norm": 0.8828125, + "learning_rate": 4.830638652873305e-05, + "loss": 0.8619, + "step": 1629 + }, + { + "epoch": 0.1199356173833065, + "grad_norm": 0.79296875, + "learning_rate": 4.8304290500244855e-05, + "loss": 0.8509, + "step": 1630 + }, + { + "epoch": 0.1200091975166705, + "grad_norm": 0.82421875, + "learning_rate": 4.830219322105941e-05, + "loss": 0.8503, + "step": 1631 + }, + { + "epoch": 0.12008277765003449, + "grad_norm": 1.15625, + "learning_rate": 4.830009469128927e-05, + "loss": 1.3459, + "step": 1632 + }, + { + "epoch": 0.12015635778339848, + "grad_norm": 0.83984375, + "learning_rate": 4.829799491104705e-05, + "loss": 0.8609, + "step": 1633 + }, + { + "epoch": 0.12022993791676248, + "grad_norm": 0.90625, + "learning_rate": 4.829589388044545e-05, + "loss": 1.1231, + "step": 1634 + }, + { + "epoch": 0.12030351805012647, + "grad_norm": 1.0546875, + "learning_rate": 4.829379159959722e-05, + "loss": 0.9082, + "step": 1635 + }, + { + "epoch": 0.12037709818349046, + "grad_norm": 0.9375, + "learning_rate": 4.8291688068615196e-05, + "loss": 1.1797, + "step": 1636 + }, + { + "epoch": 0.12045067831685445, + "grad_norm": 1.0078125, + "learning_rate": 4.828958328761226e-05, + "loss": 0.9844, + "step": 1637 + }, + { + "epoch": 0.12052425845021844, + "grad_norm": 0.80859375, + "learning_rate": 4.828747725670138e-05, + "loss": 0.9034, + "step": 1638 + }, + { + "epoch": 0.12059783858358243, + "grad_norm": 1.0, + "learning_rate": 4.828536997599559e-05, + "loss": 1.5587, + "step": 1639 + }, + { + "epoch": 0.12067141871694642, + "grad_norm": 1.0390625, + "learning_rate": 4.828326144560795e-05, + "loss": 1.0669, + "step": 1640 + }, + { + "epoch": 0.12074499885031041, + "grad_norm": 0.8984375, + "learning_rate": 4.8281151665651664e-05, + "loss": 0.9971, + "step": 1641 + }, + { + "epoch": 0.1208185789836744, + "grad_norm": 0.78125, + "learning_rate": 4.8279040636239934e-05, + "loss": 0.8416, + "step": 1642 + }, + { + "epoch": 0.1208921591170384, + "grad_norm": 0.7734375, + "learning_rate": 4.8276928357486056e-05, + "loss": 0.9918, + "step": 1643 + }, + { + "epoch": 0.1209657392504024, + "grad_norm": 0.90625, + "learning_rate": 4.82748148295034e-05, + "loss": 0.9947, + "step": 1644 + }, + { + "epoch": 0.12103931938376639, + "grad_norm": 0.87109375, + "learning_rate": 4.8272700052405396e-05, + "loss": 0.8783, + "step": 1645 + }, + { + "epoch": 0.12111289951713038, + "grad_norm": 0.90625, + "learning_rate": 4.827058402630553e-05, + "loss": 1.3422, + "step": 1646 + }, + { + "epoch": 0.12118647965049437, + "grad_norm": 0.83984375, + "learning_rate": 4.826846675131738e-05, + "loss": 0.8693, + "step": 1647 + }, + { + "epoch": 0.12126005978385836, + "grad_norm": 0.90625, + "learning_rate": 4.826634822755456e-05, + "loss": 1.0177, + "step": 1648 + }, + { + "epoch": 0.12133363991722235, + "grad_norm": 1.0546875, + "learning_rate": 4.826422845513077e-05, + "loss": 1.4612, + "step": 1649 + }, + { + "epoch": 0.12140722005058634, + "grad_norm": 0.8046875, + "learning_rate": 4.826210743415979e-05, + "loss": 1.0388, + "step": 1650 + }, + { + "epoch": 0.12148080018395033, + "grad_norm": 1.0, + "learning_rate": 4.825998516475543e-05, + "loss": 1.1511, + "step": 1651 + }, + { + "epoch": 0.12155438031731433, + "grad_norm": 1.015625, + "learning_rate": 4.8257861647031607e-05, + "loss": 1.3444, + "step": 1652 + }, + { + "epoch": 0.12162796045067832, + "grad_norm": 1.0234375, + "learning_rate": 4.825573688110227e-05, + "loss": 1.3203, + "step": 1653 + }, + { + "epoch": 0.12170154058404231, + "grad_norm": 1.0078125, + "learning_rate": 4.825361086708146e-05, + "loss": 0.9841, + "step": 1654 + }, + { + "epoch": 0.1217751207174063, + "grad_norm": 0.8359375, + "learning_rate": 4.825148360508328e-05, + "loss": 0.7331, + "step": 1655 + }, + { + "epoch": 0.12184870085077029, + "grad_norm": 0.84375, + "learning_rate": 4.824935509522188e-05, + "loss": 0.7633, + "step": 1656 + }, + { + "epoch": 0.12192228098413428, + "grad_norm": 0.9453125, + "learning_rate": 4.824722533761151e-05, + "loss": 0.944, + "step": 1657 + }, + { + "epoch": 0.12199586111749827, + "grad_norm": 1.0546875, + "learning_rate": 4.824509433236647e-05, + "loss": 1.0866, + "step": 1658 + }, + { + "epoch": 0.12206944125086226, + "grad_norm": 0.9453125, + "learning_rate": 4.8242962079601115e-05, + "loss": 1.2952, + "step": 1659 + }, + { + "epoch": 0.12214302138422625, + "grad_norm": 1.03125, + "learning_rate": 4.8240828579429886e-05, + "loss": 1.3609, + "step": 1660 + }, + { + "epoch": 0.12221660151759026, + "grad_norm": 0.796875, + "learning_rate": 4.823869383196728e-05, + "loss": 0.7113, + "step": 1661 + }, + { + "epoch": 0.12229018165095425, + "grad_norm": 0.95703125, + "learning_rate": 4.823655783732787e-05, + "loss": 1.1002, + "step": 1662 + }, + { + "epoch": 0.12236376178431824, + "grad_norm": 0.88671875, + "learning_rate": 4.8234420595626286e-05, + "loss": 0.9645, + "step": 1663 + }, + { + "epoch": 0.12243734191768223, + "grad_norm": 1.046875, + "learning_rate": 4.823228210697723e-05, + "loss": 1.3537, + "step": 1664 + }, + { + "epoch": 0.12251092205104622, + "grad_norm": 0.96875, + "learning_rate": 4.823014237149548e-05, + "loss": 1.0574, + "step": 1665 + }, + { + "epoch": 0.12258450218441021, + "grad_norm": 1.0234375, + "learning_rate": 4.8228001389295864e-05, + "loss": 0.9434, + "step": 1666 + }, + { + "epoch": 0.1226580823177742, + "grad_norm": 0.859375, + "learning_rate": 4.822585916049328e-05, + "loss": 0.8898, + "step": 1667 + }, + { + "epoch": 0.12273166245113819, + "grad_norm": 0.97265625, + "learning_rate": 4.8223715685202706e-05, + "loss": 1.2811, + "step": 1668 + }, + { + "epoch": 0.12280524258450218, + "grad_norm": 0.80078125, + "learning_rate": 4.8221570963539175e-05, + "loss": 0.6959, + "step": 1669 + }, + { + "epoch": 0.12287882271786618, + "grad_norm": 0.91015625, + "learning_rate": 4.821942499561779e-05, + "loss": 0.7751, + "step": 1670 + }, + { + "epoch": 0.12295240285123017, + "grad_norm": 0.921875, + "learning_rate": 4.8217277781553716e-05, + "loss": 0.8852, + "step": 1671 + }, + { + "epoch": 0.12302598298459416, + "grad_norm": 1.0625, + "learning_rate": 4.82151293214622e-05, + "loss": 1.1222, + "step": 1672 + }, + { + "epoch": 0.12309956311795815, + "grad_norm": 0.9609375, + "learning_rate": 4.8212979615458534e-05, + "loss": 0.9906, + "step": 1673 + }, + { + "epoch": 0.12317314325132214, + "grad_norm": 0.84375, + "learning_rate": 4.8210828663658106e-05, + "loss": 0.9114, + "step": 1674 + }, + { + "epoch": 0.12324672338468613, + "grad_norm": 0.8203125, + "learning_rate": 4.8208676466176325e-05, + "loss": 0.7124, + "step": 1675 + }, + { + "epoch": 0.12332030351805012, + "grad_norm": 0.93359375, + "learning_rate": 4.820652302312873e-05, + "loss": 0.8773, + "step": 1676 + }, + { + "epoch": 0.12339388365141411, + "grad_norm": 0.96484375, + "learning_rate": 4.820436833463087e-05, + "loss": 0.712, + "step": 1677 + }, + { + "epoch": 0.1234674637847781, + "grad_norm": 0.87109375, + "learning_rate": 4.820221240079838e-05, + "loss": 0.8774, + "step": 1678 + }, + { + "epoch": 0.12354104391814211, + "grad_norm": 0.75, + "learning_rate": 4.820005522174699e-05, + "loss": 0.8815, + "step": 1679 + }, + { + "epoch": 0.1236146240515061, + "grad_norm": 1.0625, + "learning_rate": 4.8197896797592444e-05, + "loss": 1.206, + "step": 1680 + }, + { + "epoch": 0.12368820418487009, + "grad_norm": 1.1015625, + "learning_rate": 4.819573712845059e-05, + "loss": 1.4705, + "step": 1681 + }, + { + "epoch": 0.12376178431823408, + "grad_norm": 0.796875, + "learning_rate": 4.819357621443734e-05, + "loss": 0.8625, + "step": 1682 + }, + { + "epoch": 0.12383536445159807, + "grad_norm": 0.85546875, + "learning_rate": 4.819141405566866e-05, + "loss": 0.8718, + "step": 1683 + }, + { + "epoch": 0.12390894458496206, + "grad_norm": 0.8359375, + "learning_rate": 4.818925065226059e-05, + "loss": 0.7995, + "step": 1684 + }, + { + "epoch": 0.12398252471832605, + "grad_norm": 0.8125, + "learning_rate": 4.818708600432923e-05, + "loss": 0.8796, + "step": 1685 + }, + { + "epoch": 0.12405610485169004, + "grad_norm": 1.1875, + "learning_rate": 4.818492011199076e-05, + "loss": 1.2425, + "step": 1686 + }, + { + "epoch": 0.12412968498505403, + "grad_norm": 0.88671875, + "learning_rate": 4.8182752975361425e-05, + "loss": 0.6448, + "step": 1687 + }, + { + "epoch": 0.12420326511841803, + "grad_norm": 1.0, + "learning_rate": 4.818058459455752e-05, + "loss": 1.1012, + "step": 1688 + }, + { + "epoch": 0.12427684525178202, + "grad_norm": 0.84765625, + "learning_rate": 4.817841496969542e-05, + "loss": 1.0217, + "step": 1689 + }, + { + "epoch": 0.12435042538514601, + "grad_norm": 0.921875, + "learning_rate": 4.8176244100891566e-05, + "loss": 0.9944, + "step": 1690 + }, + { + "epoch": 0.12442400551851, + "grad_norm": 0.91796875, + "learning_rate": 4.817407198826247e-05, + "loss": 0.8699, + "step": 1691 + }, + { + "epoch": 0.124497585651874, + "grad_norm": 0.6953125, + "learning_rate": 4.8171898631924695e-05, + "loss": 0.7512, + "step": 1692 + }, + { + "epoch": 0.12457116578523798, + "grad_norm": 0.78515625, + "learning_rate": 4.816972403199489e-05, + "loss": 0.8817, + "step": 1693 + }, + { + "epoch": 0.12464474591860197, + "grad_norm": 1.0, + "learning_rate": 4.816754818858975e-05, + "loss": 1.0115, + "step": 1694 + }, + { + "epoch": 0.12471832605196596, + "grad_norm": 0.83203125, + "learning_rate": 4.8165371101826064e-05, + "loss": 0.8853, + "step": 1695 + }, + { + "epoch": 0.12479190618532997, + "grad_norm": 1.0859375, + "learning_rate": 4.8163192771820655e-05, + "loss": 1.1001, + "step": 1696 + }, + { + "epoch": 0.12486548631869396, + "grad_norm": 0.94921875, + "learning_rate": 4.816101319869045e-05, + "loss": 0.6952, + "step": 1697 + }, + { + "epoch": 0.12493906645205795, + "grad_norm": 0.8359375, + "learning_rate": 4.8158832382552406e-05, + "loss": 0.6278, + "step": 1698 + }, + { + "epoch": 0.12501264658542194, + "grad_norm": 0.76171875, + "learning_rate": 4.8156650323523566e-05, + "loss": 0.7689, + "step": 1699 + }, + { + "epoch": 0.12508622671878591, + "grad_norm": 1.0078125, + "learning_rate": 4.8154467021721046e-05, + "loss": 0.9663, + "step": 1700 + }, + { + "epoch": 0.12515980685214992, + "grad_norm": 0.9140625, + "learning_rate": 4.815228247726201e-05, + "loss": 1.3419, + "step": 1701 + }, + { + "epoch": 0.12523338698551392, + "grad_norm": 0.8984375, + "learning_rate": 4.8150096690263704e-05, + "loss": 1.0555, + "step": 1702 + }, + { + "epoch": 0.1253069671188779, + "grad_norm": 0.90234375, + "learning_rate": 4.814790966084343e-05, + "loss": 0.8885, + "step": 1703 + }, + { + "epoch": 0.1253805472522419, + "grad_norm": 0.7734375, + "learning_rate": 4.814572138911857e-05, + "loss": 0.733, + "step": 1704 + }, + { + "epoch": 0.12545412738560588, + "grad_norm": 1.0625, + "learning_rate": 4.8143531875206546e-05, + "loss": 1.1117, + "step": 1705 + }, + { + "epoch": 0.12552770751896988, + "grad_norm": 0.8984375, + "learning_rate": 4.814134111922489e-05, + "loss": 0.9932, + "step": 1706 + }, + { + "epoch": 0.12560128765233386, + "grad_norm": 0.9375, + "learning_rate": 4.813914912129116e-05, + "loss": 1.3604, + "step": 1707 + }, + { + "epoch": 0.12567486778569786, + "grad_norm": 0.8828125, + "learning_rate": 4.8136955881523004e-05, + "loss": 0.8963, + "step": 1708 + }, + { + "epoch": 0.12574844791906184, + "grad_norm": 0.86328125, + "learning_rate": 4.813476140003812e-05, + "loss": 0.7431, + "step": 1709 + }, + { + "epoch": 0.12582202805242584, + "grad_norm": 0.8671875, + "learning_rate": 4.813256567695429e-05, + "loss": 1.0601, + "step": 1710 + }, + { + "epoch": 0.12589560818578985, + "grad_norm": 0.79296875, + "learning_rate": 4.813036871238935e-05, + "loss": 0.9191, + "step": 1711 + }, + { + "epoch": 0.12596918831915382, + "grad_norm": 0.89453125, + "learning_rate": 4.8128170506461215e-05, + "loss": 1.2271, + "step": 1712 + }, + { + "epoch": 0.12604276845251783, + "grad_norm": 1.1171875, + "learning_rate": 4.812597105928784e-05, + "loss": 1.1664, + "step": 1713 + }, + { + "epoch": 0.1261163485858818, + "grad_norm": 0.96875, + "learning_rate": 4.8123770370987284e-05, + "loss": 1.2332, + "step": 1714 + }, + { + "epoch": 0.1261899287192458, + "grad_norm": 0.80078125, + "learning_rate": 4.8121568441677656e-05, + "loss": 0.8885, + "step": 1715 + }, + { + "epoch": 0.12626350885260978, + "grad_norm": 0.89453125, + "learning_rate": 4.8119365271477105e-05, + "loss": 0.9694, + "step": 1716 + }, + { + "epoch": 0.1263370889859738, + "grad_norm": 0.8984375, + "learning_rate": 4.811716086050389e-05, + "loss": 1.0888, + "step": 1717 + }, + { + "epoch": 0.12641066911933777, + "grad_norm": 1.1640625, + "learning_rate": 4.8114955208876325e-05, + "loss": 0.9996, + "step": 1718 + }, + { + "epoch": 0.12648424925270177, + "grad_norm": 0.94921875, + "learning_rate": 4.811274831671275e-05, + "loss": 1.2635, + "step": 1719 + }, + { + "epoch": 0.12655782938606577, + "grad_norm": 0.8203125, + "learning_rate": 4.811054018413165e-05, + "loss": 0.9658, + "step": 1720 + }, + { + "epoch": 0.12663140951942975, + "grad_norm": 1.0, + "learning_rate": 4.810833081125149e-05, + "loss": 1.4049, + "step": 1721 + }, + { + "epoch": 0.12670498965279375, + "grad_norm": 0.7890625, + "learning_rate": 4.810612019819087e-05, + "loss": 0.8666, + "step": 1722 + }, + { + "epoch": 0.12677856978615773, + "grad_norm": 0.7734375, + "learning_rate": 4.8103908345068416e-05, + "loss": 0.773, + "step": 1723 + }, + { + "epoch": 0.12685214991952173, + "grad_norm": 0.765625, + "learning_rate": 4.8101695252002846e-05, + "loss": 0.7949, + "step": 1724 + }, + { + "epoch": 0.1269257300528857, + "grad_norm": 1.4375, + "learning_rate": 4.809948091911292e-05, + "loss": 0.8907, + "step": 1725 + }, + { + "epoch": 0.12699931018624971, + "grad_norm": 1.046875, + "learning_rate": 4.8097265346517474e-05, + "loss": 0.8715, + "step": 1726 + }, + { + "epoch": 0.1270728903196137, + "grad_norm": 1.03125, + "learning_rate": 4.809504853433543e-05, + "loss": 1.3757, + "step": 1727 + }, + { + "epoch": 0.1271464704529777, + "grad_norm": 1.0859375, + "learning_rate": 4.809283048268575e-05, + "loss": 1.327, + "step": 1728 + }, + { + "epoch": 0.1272200505863417, + "grad_norm": 0.65234375, + "learning_rate": 4.809061119168747e-05, + "loss": 0.8474, + "step": 1729 + }, + { + "epoch": 0.12729363071970567, + "grad_norm": 0.75, + "learning_rate": 4.80883906614597e-05, + "loss": 0.6874, + "step": 1730 + }, + { + "epoch": 0.12736721085306968, + "grad_norm": 1.015625, + "learning_rate": 4.808616889212162e-05, + "loss": 0.9943, + "step": 1731 + }, + { + "epoch": 0.12744079098643366, + "grad_norm": 1.1171875, + "learning_rate": 4.808394588379245e-05, + "loss": 1.1231, + "step": 1732 + }, + { + "epoch": 0.12751437111979766, + "grad_norm": 1.078125, + "learning_rate": 4.80817216365915e-05, + "loss": 1.0542, + "step": 1733 + }, + { + "epoch": 0.12758795125316164, + "grad_norm": 1.0, + "learning_rate": 4.807949615063816e-05, + "loss": 0.9884, + "step": 1734 + }, + { + "epoch": 0.12766153138652564, + "grad_norm": 0.8359375, + "learning_rate": 4.807726942605184e-05, + "loss": 0.8242, + "step": 1735 + }, + { + "epoch": 0.12773511151988964, + "grad_norm": 0.953125, + "learning_rate": 4.8075041462952066e-05, + "loss": 0.9871, + "step": 1736 + }, + { + "epoch": 0.12780869165325362, + "grad_norm": 0.890625, + "learning_rate": 4.807281226145839e-05, + "loss": 0.5976, + "step": 1737 + }, + { + "epoch": 0.12788227178661762, + "grad_norm": 1.0234375, + "learning_rate": 4.807058182169046e-05, + "loss": 0.8272, + "step": 1738 + }, + { + "epoch": 0.1279558519199816, + "grad_norm": 1.09375, + "learning_rate": 4.8068350143767985e-05, + "loss": 1.5913, + "step": 1739 + }, + { + "epoch": 0.1280294320533456, + "grad_norm": 0.94140625, + "learning_rate": 4.806611722781073e-05, + "loss": 0.8145, + "step": 1740 + }, + { + "epoch": 0.12810301218670958, + "grad_norm": 1.3828125, + "learning_rate": 4.8063883073938515e-05, + "loss": 1.3143, + "step": 1741 + }, + { + "epoch": 0.12817659232007358, + "grad_norm": 1.3359375, + "learning_rate": 4.8061647682271266e-05, + "loss": 0.7281, + "step": 1742 + }, + { + "epoch": 0.12825017245343756, + "grad_norm": 1.1171875, + "learning_rate": 4.805941105292894e-05, + "loss": 1.1643, + "step": 1743 + }, + { + "epoch": 0.12832375258680156, + "grad_norm": 1.0, + "learning_rate": 4.805717318603158e-05, + "loss": 1.0296, + "step": 1744 + }, + { + "epoch": 0.12839733272016557, + "grad_norm": 0.8359375, + "learning_rate": 4.8054934081699275e-05, + "loss": 0.8484, + "step": 1745 + }, + { + "epoch": 0.12847091285352955, + "grad_norm": 1.0546875, + "learning_rate": 4.8052693740052214e-05, + "loss": 1.1198, + "step": 1746 + }, + { + "epoch": 0.12854449298689355, + "grad_norm": 0.85546875, + "learning_rate": 4.8050452161210616e-05, + "loss": 0.9856, + "step": 1747 + }, + { + "epoch": 0.12861807312025753, + "grad_norm": 0.6796875, + "learning_rate": 4.804820934529478e-05, + "loss": 0.7506, + "step": 1748 + }, + { + "epoch": 0.12869165325362153, + "grad_norm": 1.0859375, + "learning_rate": 4.8045965292425085e-05, + "loss": 1.194, + "step": 1749 + }, + { + "epoch": 0.1287652333869855, + "grad_norm": 1.0078125, + "learning_rate": 4.804372000272196e-05, + "loss": 0.8777, + "step": 1750 + }, + { + "epoch": 0.1288388135203495, + "grad_norm": 0.890625, + "learning_rate": 4.804147347630591e-05, + "loss": 0.9836, + "step": 1751 + }, + { + "epoch": 0.1289123936537135, + "grad_norm": 0.9296875, + "learning_rate": 4.80392257132975e-05, + "loss": 0.7982, + "step": 1752 + }, + { + "epoch": 0.1289859737870775, + "grad_norm": 1.125, + "learning_rate": 4.803697671381735e-05, + "loss": 1.4899, + "step": 1753 + }, + { + "epoch": 0.1290595539204415, + "grad_norm": 0.81640625, + "learning_rate": 4.8034726477986175e-05, + "loss": 0.7501, + "step": 1754 + }, + { + "epoch": 0.12913313405380547, + "grad_norm": 0.99609375, + "learning_rate": 4.8032475005924734e-05, + "loss": 1.5302, + "step": 1755 + }, + { + "epoch": 0.12920671418716947, + "grad_norm": 0.72265625, + "learning_rate": 4.8030222297753867e-05, + "loss": 0.7802, + "step": 1756 + }, + { + "epoch": 0.12928029432053345, + "grad_norm": 0.90625, + "learning_rate": 4.802796835359447e-05, + "loss": 0.7931, + "step": 1757 + }, + { + "epoch": 0.12935387445389745, + "grad_norm": 0.875, + "learning_rate": 4.80257131735675e-05, + "loss": 0.7262, + "step": 1758 + }, + { + "epoch": 0.12942745458726143, + "grad_norm": 1.0859375, + "learning_rate": 4.8023456757793986e-05, + "loss": 1.3814, + "step": 1759 + }, + { + "epoch": 0.12950103472062544, + "grad_norm": 0.76171875, + "learning_rate": 4.802119910639504e-05, + "loss": 0.7002, + "step": 1760 + }, + { + "epoch": 0.1295746148539894, + "grad_norm": 1.046875, + "learning_rate": 4.801894021949183e-05, + "loss": 1.6328, + "step": 1761 + }, + { + "epoch": 0.12964819498735342, + "grad_norm": 0.96484375, + "learning_rate": 4.801668009720556e-05, + "loss": 0.9086, + "step": 1762 + }, + { + "epoch": 0.12972177512071742, + "grad_norm": 0.734375, + "learning_rate": 4.801441873965754e-05, + "loss": 0.9095, + "step": 1763 + }, + { + "epoch": 0.1297953552540814, + "grad_norm": 0.9140625, + "learning_rate": 4.801215614696915e-05, + "loss": 0.8858, + "step": 1764 + }, + { + "epoch": 0.1298689353874454, + "grad_norm": 0.9765625, + "learning_rate": 4.800989231926178e-05, + "loss": 0.9846, + "step": 1765 + }, + { + "epoch": 0.12994251552080938, + "grad_norm": 0.859375, + "learning_rate": 4.8007627256656965e-05, + "loss": 1.1344, + "step": 1766 + }, + { + "epoch": 0.13001609565417338, + "grad_norm": 0.87890625, + "learning_rate": 4.8005360959276255e-05, + "loss": 0.721, + "step": 1767 + }, + { + "epoch": 0.13008967578753736, + "grad_norm": 0.90625, + "learning_rate": 4.8003093427241266e-05, + "loss": 0.9318, + "step": 1768 + }, + { + "epoch": 0.13016325592090136, + "grad_norm": 1.15625, + "learning_rate": 4.800082466067369e-05, + "loss": 1.6782, + "step": 1769 + }, + { + "epoch": 0.13023683605426534, + "grad_norm": 0.9296875, + "learning_rate": 4.799855465969531e-05, + "loss": 0.799, + "step": 1770 + }, + { + "epoch": 0.13031041618762934, + "grad_norm": 0.97265625, + "learning_rate": 4.7996283424427935e-05, + "loss": 1.383, + "step": 1771 + }, + { + "epoch": 0.13038399632099335, + "grad_norm": 0.94921875, + "learning_rate": 4.7994010954993465e-05, + "loss": 1.0181, + "step": 1772 + }, + { + "epoch": 0.13045757645435732, + "grad_norm": 0.890625, + "learning_rate": 4.7991737251513855e-05, + "loss": 0.9318, + "step": 1773 + }, + { + "epoch": 0.13053115658772133, + "grad_norm": 1.0390625, + "learning_rate": 4.798946231411113e-05, + "loss": 1.2828, + "step": 1774 + }, + { + "epoch": 0.1306047367210853, + "grad_norm": 0.80078125, + "learning_rate": 4.798718614290739e-05, + "loss": 0.8909, + "step": 1775 + }, + { + "epoch": 0.1306783168544493, + "grad_norm": 1.015625, + "learning_rate": 4.798490873802478e-05, + "loss": 0.9724, + "step": 1776 + }, + { + "epoch": 0.13075189698781328, + "grad_norm": 0.76171875, + "learning_rate": 4.798263009958553e-05, + "loss": 0.7469, + "step": 1777 + }, + { + "epoch": 0.13082547712117729, + "grad_norm": 0.859375, + "learning_rate": 4.7980350227711935e-05, + "loss": 0.9373, + "step": 1778 + }, + { + "epoch": 0.13089905725454126, + "grad_norm": 0.90234375, + "learning_rate": 4.7978069122526334e-05, + "loss": 0.8338, + "step": 1779 + }, + { + "epoch": 0.13097263738790527, + "grad_norm": 0.82421875, + "learning_rate": 4.797578678415118e-05, + "loss": 0.6374, + "step": 1780 + }, + { + "epoch": 0.13104621752126927, + "grad_norm": 0.8984375, + "learning_rate": 4.797350321270894e-05, + "loss": 0.9759, + "step": 1781 + }, + { + "epoch": 0.13111979765463325, + "grad_norm": 1.2578125, + "learning_rate": 4.7971218408322175e-05, + "loss": 1.3983, + "step": 1782 + }, + { + "epoch": 0.13119337778799725, + "grad_norm": 0.96875, + "learning_rate": 4.7968932371113495e-05, + "loss": 1.0449, + "step": 1783 + }, + { + "epoch": 0.13126695792136123, + "grad_norm": 0.8828125, + "learning_rate": 4.7966645101205596e-05, + "loss": 0.7516, + "step": 1784 + }, + { + "epoch": 0.13134053805472523, + "grad_norm": 0.8046875, + "learning_rate": 4.7964356598721245e-05, + "loss": 0.7642, + "step": 1785 + }, + { + "epoch": 0.1314141181880892, + "grad_norm": 0.8125, + "learning_rate": 4.796206686378324e-05, + "loss": 0.8153, + "step": 1786 + }, + { + "epoch": 0.1314876983214532, + "grad_norm": 0.953125, + "learning_rate": 4.7959775896514484e-05, + "loss": 1.0266, + "step": 1787 + }, + { + "epoch": 0.1315612784548172, + "grad_norm": 0.828125, + "learning_rate": 4.795748369703791e-05, + "loss": 0.9504, + "step": 1788 + }, + { + "epoch": 0.1316348585881812, + "grad_norm": 0.80859375, + "learning_rate": 4.795519026547655e-05, + "loss": 0.7396, + "step": 1789 + }, + { + "epoch": 0.1317084387215452, + "grad_norm": 0.87890625, + "learning_rate": 4.7952895601953484e-05, + "loss": 0.885, + "step": 1790 + }, + { + "epoch": 0.13178201885490917, + "grad_norm": 1.03125, + "learning_rate": 4.795059970659187e-05, + "loss": 0.8995, + "step": 1791 + }, + { + "epoch": 0.13185559898827318, + "grad_norm": 0.91796875, + "learning_rate": 4.7948302579514906e-05, + "loss": 0.7972, + "step": 1792 + }, + { + "epoch": 0.13192917912163715, + "grad_norm": 0.74609375, + "learning_rate": 4.794600422084589e-05, + "loss": 0.7332, + "step": 1793 + }, + { + "epoch": 0.13200275925500116, + "grad_norm": 1.0625, + "learning_rate": 4.7943704630708163e-05, + "loss": 1.256, + "step": 1794 + }, + { + "epoch": 0.13207633938836513, + "grad_norm": 0.81640625, + "learning_rate": 4.794140380922515e-05, + "loss": 1.0226, + "step": 1795 + }, + { + "epoch": 0.13214991952172914, + "grad_norm": 1.015625, + "learning_rate": 4.793910175652032e-05, + "loss": 0.8569, + "step": 1796 + }, + { + "epoch": 0.1322234996550931, + "grad_norm": 0.97265625, + "learning_rate": 4.793679847271723e-05, + "loss": 1.2505, + "step": 1797 + }, + { + "epoch": 0.13229707978845712, + "grad_norm": 0.671875, + "learning_rate": 4.7934493957939485e-05, + "loss": 0.8825, + "step": 1798 + }, + { + "epoch": 0.13237065992182112, + "grad_norm": 0.88671875, + "learning_rate": 4.793218821231077e-05, + "loss": 0.9187, + "step": 1799 + }, + { + "epoch": 0.1324442400551851, + "grad_norm": 0.9375, + "learning_rate": 4.792988123595482e-05, + "loss": 1.1352, + "step": 1800 + }, + { + "epoch": 0.1325178201885491, + "grad_norm": 1.0, + "learning_rate": 4.7927573028995453e-05, + "loss": 1.1833, + "step": 1801 + }, + { + "epoch": 0.13259140032191308, + "grad_norm": 0.84375, + "learning_rate": 4.792526359155655e-05, + "loss": 0.7917, + "step": 1802 + }, + { + "epoch": 0.13266498045527708, + "grad_norm": 0.796875, + "learning_rate": 4.792295292376204e-05, + "loss": 0.9438, + "step": 1803 + }, + { + "epoch": 0.13273856058864106, + "grad_norm": 0.75390625, + "learning_rate": 4.792064102573595e-05, + "loss": 0.6914, + "step": 1804 + }, + { + "epoch": 0.13281214072200506, + "grad_norm": 0.79296875, + "learning_rate": 4.7918327897602344e-05, + "loss": 0.9107, + "step": 1805 + }, + { + "epoch": 0.13288572085536904, + "grad_norm": 0.76953125, + "learning_rate": 4.791601353948537e-05, + "loss": 1.0726, + "step": 1806 + }, + { + "epoch": 0.13295930098873304, + "grad_norm": 0.75390625, + "learning_rate": 4.791369795150923e-05, + "loss": 0.9249, + "step": 1807 + }, + { + "epoch": 0.13303288112209705, + "grad_norm": 0.890625, + "learning_rate": 4.7911381133798197e-05, + "loss": 0.9071, + "step": 1808 + }, + { + "epoch": 0.13310646125546102, + "grad_norm": 0.98046875, + "learning_rate": 4.790906308647661e-05, + "loss": 1.2928, + "step": 1809 + }, + { + "epoch": 0.13318004138882503, + "grad_norm": 0.84375, + "learning_rate": 4.7906743809668885e-05, + "loss": 1.1867, + "step": 1810 + }, + { + "epoch": 0.133253621522189, + "grad_norm": 0.734375, + "learning_rate": 4.790442330349948e-05, + "loss": 0.6382, + "step": 1811 + }, + { + "epoch": 0.133327201655553, + "grad_norm": 0.86328125, + "learning_rate": 4.7902101568092935e-05, + "loss": 1.0395, + "step": 1812 + }, + { + "epoch": 0.13340078178891698, + "grad_norm": 0.83203125, + "learning_rate": 4.789977860357385e-05, + "loss": 1.1669, + "step": 1813 + }, + { + "epoch": 0.133474361922281, + "grad_norm": 1.09375, + "learning_rate": 4.789745441006691e-05, + "loss": 1.3609, + "step": 1814 + }, + { + "epoch": 0.13354794205564496, + "grad_norm": 1.1328125, + "learning_rate": 4.7895128987696834e-05, + "loss": 0.9618, + "step": 1815 + }, + { + "epoch": 0.13362152218900897, + "grad_norm": 0.7578125, + "learning_rate": 4.7892802336588425e-05, + "loss": 0.74, + "step": 1816 + }, + { + "epoch": 0.13369510232237297, + "grad_norm": 0.91796875, + "learning_rate": 4.789047445686656e-05, + "loss": 0.9331, + "step": 1817 + }, + { + "epoch": 0.13376868245573695, + "grad_norm": 0.98828125, + "learning_rate": 4.788814534865615e-05, + "loss": 0.9501, + "step": 1818 + }, + { + "epoch": 0.13384226258910095, + "grad_norm": 1.171875, + "learning_rate": 4.788581501208222e-05, + "loss": 1.0475, + "step": 1819 + }, + { + "epoch": 0.13391584272246493, + "grad_norm": 0.87890625, + "learning_rate": 4.788348344726983e-05, + "loss": 0.8627, + "step": 1820 + }, + { + "epoch": 0.13398942285582893, + "grad_norm": 0.91796875, + "learning_rate": 4.7881150654344093e-05, + "loss": 0.8419, + "step": 1821 + }, + { + "epoch": 0.1340630029891929, + "grad_norm": 1.0234375, + "learning_rate": 4.787881663343022e-05, + "loss": 0.8597, + "step": 1822 + }, + { + "epoch": 0.1341365831225569, + "grad_norm": 0.93359375, + "learning_rate": 4.787648138465347e-05, + "loss": 1.136, + "step": 1823 + }, + { + "epoch": 0.1342101632559209, + "grad_norm": 0.78515625, + "learning_rate": 4.7874144908139175e-05, + "loss": 0.7524, + "step": 1824 + }, + { + "epoch": 0.1342837433892849, + "grad_norm": 0.80859375, + "learning_rate": 4.787180720401272e-05, + "loss": 0.9548, + "step": 1825 + }, + { + "epoch": 0.1343573235226489, + "grad_norm": 0.9609375, + "learning_rate": 4.7869468272399574e-05, + "loss": 0.8754, + "step": 1826 + }, + { + "epoch": 0.13443090365601287, + "grad_norm": 0.8046875, + "learning_rate": 4.7867128113425265e-05, + "loss": 0.8539, + "step": 1827 + }, + { + "epoch": 0.13450448378937688, + "grad_norm": 0.75390625, + "learning_rate": 4.786478672721537e-05, + "loss": 0.7618, + "step": 1828 + }, + { + "epoch": 0.13457806392274085, + "grad_norm": 1.0, + "learning_rate": 4.7862444113895565e-05, + "loss": 0.9876, + "step": 1829 + }, + { + "epoch": 0.13465164405610486, + "grad_norm": 0.75, + "learning_rate": 4.786010027359156e-05, + "loss": 0.8211, + "step": 1830 + }, + { + "epoch": 0.13472522418946883, + "grad_norm": 0.98046875, + "learning_rate": 4.785775520642916e-05, + "loss": 1.0766, + "step": 1831 + }, + { + "epoch": 0.13479880432283284, + "grad_norm": 1.1484375, + "learning_rate": 4.785540891253419e-05, + "loss": 1.0317, + "step": 1832 + }, + { + "epoch": 0.13487238445619681, + "grad_norm": 0.84765625, + "learning_rate": 4.78530613920326e-05, + "loss": 1.0577, + "step": 1833 + }, + { + "epoch": 0.13494596458956082, + "grad_norm": 0.91796875, + "learning_rate": 4.785071264505038e-05, + "loss": 0.6501, + "step": 1834 + }, + { + "epoch": 0.13501954472292482, + "grad_norm": 0.8046875, + "learning_rate": 4.784836267171356e-05, + "loss": 0.8578, + "step": 1835 + }, + { + "epoch": 0.1350931248562888, + "grad_norm": 1.1328125, + "learning_rate": 4.784601147214828e-05, + "loss": 1.4335, + "step": 1836 + }, + { + "epoch": 0.1351667049896528, + "grad_norm": 0.7578125, + "learning_rate": 4.78436590464807e-05, + "loss": 0.7884, + "step": 1837 + }, + { + "epoch": 0.13524028512301678, + "grad_norm": 0.890625, + "learning_rate": 4.7841305394837096e-05, + "loss": 0.9455, + "step": 1838 + }, + { + "epoch": 0.13531386525638078, + "grad_norm": 0.97265625, + "learning_rate": 4.7838950517343774e-05, + "loss": 0.9894, + "step": 1839 + }, + { + "epoch": 0.13538744538974476, + "grad_norm": 0.72265625, + "learning_rate": 4.78365944141271e-05, + "loss": 0.7096, + "step": 1840 + }, + { + "epoch": 0.13546102552310876, + "grad_norm": 0.890625, + "learning_rate": 4.783423708531355e-05, + "loss": 0.9349, + "step": 1841 + }, + { + "epoch": 0.13553460565647274, + "grad_norm": 0.71484375, + "learning_rate": 4.783187853102962e-05, + "loss": 0.9142, + "step": 1842 + }, + { + "epoch": 0.13560818578983674, + "grad_norm": 0.984375, + "learning_rate": 4.7829518751401893e-05, + "loss": 0.7758, + "step": 1843 + }, + { + "epoch": 0.13568176592320075, + "grad_norm": 1.0546875, + "learning_rate": 4.7827157746557e-05, + "loss": 1.3119, + "step": 1844 + }, + { + "epoch": 0.13575534605656472, + "grad_norm": 0.8828125, + "learning_rate": 4.782479551662168e-05, + "loss": 1.2775, + "step": 1845 + }, + { + "epoch": 0.13582892618992873, + "grad_norm": 0.86328125, + "learning_rate": 4.7822432061722693e-05, + "loss": 0.9439, + "step": 1846 + }, + { + "epoch": 0.1359025063232927, + "grad_norm": 0.9765625, + "learning_rate": 4.7820067381986885e-05, + "loss": 1.2515, + "step": 1847 + }, + { + "epoch": 0.1359760864566567, + "grad_norm": 0.9453125, + "learning_rate": 4.7817701477541154e-05, + "loss": 0.8253, + "step": 1848 + }, + { + "epoch": 0.13604966659002068, + "grad_norm": 0.92578125, + "learning_rate": 4.781533434851249e-05, + "loss": 0.6957, + "step": 1849 + }, + { + "epoch": 0.1361232467233847, + "grad_norm": 0.99609375, + "learning_rate": 4.7812965995027915e-05, + "loss": 1.0547, + "step": 1850 + }, + { + "epoch": 0.13619682685674867, + "grad_norm": 1.3359375, + "learning_rate": 4.7810596417214543e-05, + "loss": 1.3689, + "step": 1851 + }, + { + "epoch": 0.13627040699011267, + "grad_norm": 0.8671875, + "learning_rate": 4.780822561519955e-05, + "loss": 1.0853, + "step": 1852 + }, + { + "epoch": 0.13634398712347667, + "grad_norm": 0.84765625, + "learning_rate": 4.780585358911016e-05, + "loss": 0.9258, + "step": 1853 + }, + { + "epoch": 0.13641756725684065, + "grad_norm": 0.734375, + "learning_rate": 4.780348033907369e-05, + "loss": 0.8984, + "step": 1854 + }, + { + "epoch": 0.13649114739020465, + "grad_norm": 0.93359375, + "learning_rate": 4.78011058652175e-05, + "loss": 1.2386, + "step": 1855 + }, + { + "epoch": 0.13656472752356863, + "grad_norm": 0.828125, + "learning_rate": 4.7798730167669016e-05, + "loss": 0.9408, + "step": 1856 + }, + { + "epoch": 0.13663830765693263, + "grad_norm": 0.72265625, + "learning_rate": 4.7796353246555746e-05, + "loss": 0.6505, + "step": 1857 + }, + { + "epoch": 0.1367118877902966, + "grad_norm": 1.25, + "learning_rate": 4.779397510200525e-05, + "loss": 1.2214, + "step": 1858 + }, + { + "epoch": 0.13678546792366061, + "grad_norm": 0.96484375, + "learning_rate": 4.7791595734145164e-05, + "loss": 0.976, + "step": 1859 + }, + { + "epoch": 0.1368590480570246, + "grad_norm": 0.9296875, + "learning_rate": 4.7789215143103195e-05, + "loss": 0.8904, + "step": 1860 + }, + { + "epoch": 0.1369326281903886, + "grad_norm": 0.98828125, + "learning_rate": 4.778683332900708e-05, + "loss": 1.0398, + "step": 1861 + }, + { + "epoch": 0.1370062083237526, + "grad_norm": 0.765625, + "learning_rate": 4.7784450291984664e-05, + "loss": 0.9263, + "step": 1862 + }, + { + "epoch": 0.13707978845711657, + "grad_norm": 0.88671875, + "learning_rate": 4.778206603216383e-05, + "loss": 0.9906, + "step": 1863 + }, + { + "epoch": 0.13715336859048058, + "grad_norm": 0.8125, + "learning_rate": 4.777968054967254e-05, + "loss": 0.9936, + "step": 1864 + }, + { + "epoch": 0.13722694872384456, + "grad_norm": 0.82421875, + "learning_rate": 4.777729384463882e-05, + "loss": 0.7839, + "step": 1865 + }, + { + "epoch": 0.13730052885720856, + "grad_norm": 0.9375, + "learning_rate": 4.777490591719076e-05, + "loss": 0.9634, + "step": 1866 + }, + { + "epoch": 0.13737410899057254, + "grad_norm": 0.98828125, + "learning_rate": 4.777251676745652e-05, + "loss": 0.9631, + "step": 1867 + }, + { + "epoch": 0.13744768912393654, + "grad_norm": 0.66796875, + "learning_rate": 4.7770126395564315e-05, + "loss": 0.595, + "step": 1868 + }, + { + "epoch": 0.13752126925730052, + "grad_norm": 0.83984375, + "learning_rate": 4.776773480164243e-05, + "loss": 0.869, + "step": 1869 + }, + { + "epoch": 0.13759484939066452, + "grad_norm": 0.8125, + "learning_rate": 4.776534198581922e-05, + "loss": 0.8312, + "step": 1870 + }, + { + "epoch": 0.13766842952402852, + "grad_norm": 0.98828125, + "learning_rate": 4.77629479482231e-05, + "loss": 0.9036, + "step": 1871 + }, + { + "epoch": 0.1377420096573925, + "grad_norm": 0.98828125, + "learning_rate": 4.776055268898256e-05, + "loss": 1.1004, + "step": 1872 + }, + { + "epoch": 0.1378155897907565, + "grad_norm": 0.84765625, + "learning_rate": 4.7758156208226156e-05, + "loss": 0.7739, + "step": 1873 + }, + { + "epoch": 0.13788916992412048, + "grad_norm": 1.1328125, + "learning_rate": 4.775575850608248e-05, + "loss": 0.9294, + "step": 1874 + }, + { + "epoch": 0.13796275005748448, + "grad_norm": 0.8671875, + "learning_rate": 4.775335958268023e-05, + "loss": 0.7431, + "step": 1875 + }, + { + "epoch": 0.13803633019084846, + "grad_norm": 0.9140625, + "learning_rate": 4.7750959438148146e-05, + "loss": 0.9908, + "step": 1876 + }, + { + "epoch": 0.13810991032421246, + "grad_norm": 0.83203125, + "learning_rate": 4.7748558072615034e-05, + "loss": 0.9226, + "step": 1877 + }, + { + "epoch": 0.13818349045757644, + "grad_norm": 0.7890625, + "learning_rate": 4.7746155486209784e-05, + "loss": 1.1121, + "step": 1878 + }, + { + "epoch": 0.13825707059094045, + "grad_norm": 0.96484375, + "learning_rate": 4.774375167906132e-05, + "loss": 1.1865, + "step": 1879 + }, + { + "epoch": 0.13833065072430445, + "grad_norm": 0.73828125, + "learning_rate": 4.7741346651298665e-05, + "loss": 0.825, + "step": 1880 + }, + { + "epoch": 0.13840423085766843, + "grad_norm": 0.7890625, + "learning_rate": 4.773894040305089e-05, + "loss": 0.6942, + "step": 1881 + }, + { + "epoch": 0.13847781099103243, + "grad_norm": 0.92578125, + "learning_rate": 4.7736532934447134e-05, + "loss": 0.8538, + "step": 1882 + }, + { + "epoch": 0.1385513911243964, + "grad_norm": 0.6171875, + "learning_rate": 4.77341242456166e-05, + "loss": 0.5544, + "step": 1883 + }, + { + "epoch": 0.1386249712577604, + "grad_norm": 1.140625, + "learning_rate": 4.773171433668855e-05, + "loss": 0.9721, + "step": 1884 + }, + { + "epoch": 0.1386985513911244, + "grad_norm": 0.7109375, + "learning_rate": 4.772930320779232e-05, + "loss": 0.7255, + "step": 1885 + }, + { + "epoch": 0.1387721315244884, + "grad_norm": 0.796875, + "learning_rate": 4.772689085905733e-05, + "loss": 0.6076, + "step": 1886 + }, + { + "epoch": 0.13884571165785237, + "grad_norm": 0.98046875, + "learning_rate": 4.7724477290613024e-05, + "loss": 1.1983, + "step": 1887 + }, + { + "epoch": 0.13891929179121637, + "grad_norm": 0.92578125, + "learning_rate": 4.772206250258894e-05, + "loss": 1.039, + "step": 1888 + }, + { + "epoch": 0.13899287192458037, + "grad_norm": 0.71875, + "learning_rate": 4.771964649511469e-05, + "loss": 0.7667, + "step": 1889 + }, + { + "epoch": 0.13906645205794435, + "grad_norm": 0.69140625, + "learning_rate": 4.771722926831991e-05, + "loss": 0.7998, + "step": 1890 + }, + { + "epoch": 0.13914003219130835, + "grad_norm": 1.0546875, + "learning_rate": 4.771481082233434e-05, + "loss": 1.4976, + "step": 1891 + }, + { + "epoch": 0.13921361232467233, + "grad_norm": 0.828125, + "learning_rate": 4.771239115728779e-05, + "loss": 0.8863, + "step": 1892 + }, + { + "epoch": 0.13928719245803634, + "grad_norm": 0.796875, + "learning_rate": 4.7709970273310095e-05, + "loss": 0.7846, + "step": 1893 + }, + { + "epoch": 0.1393607725914003, + "grad_norm": 0.84765625, + "learning_rate": 4.770754817053119e-05, + "loss": 0.9603, + "step": 1894 + }, + { + "epoch": 0.13943435272476432, + "grad_norm": 1.0859375, + "learning_rate": 4.7705124849081063e-05, + "loss": 1.2508, + "step": 1895 + }, + { + "epoch": 0.1395079328581283, + "grad_norm": 0.87890625, + "learning_rate": 4.7702700309089776e-05, + "loss": 0.6949, + "step": 1896 + }, + { + "epoch": 0.1395815129914923, + "grad_norm": 0.9921875, + "learning_rate": 4.770027455068743e-05, + "loss": 1.0184, + "step": 1897 + }, + { + "epoch": 0.1396550931248563, + "grad_norm": 0.92578125, + "learning_rate": 4.7697847574004234e-05, + "loss": 1.1851, + "step": 1898 + }, + { + "epoch": 0.13972867325822028, + "grad_norm": 0.984375, + "learning_rate": 4.7695419379170426e-05, + "loss": 1.2688, + "step": 1899 + }, + { + "epoch": 0.13980225339158428, + "grad_norm": 0.98046875, + "learning_rate": 4.7692989966316324e-05, + "loss": 0.8647, + "step": 1900 + }, + { + "epoch": 0.13987583352494826, + "grad_norm": 0.90234375, + "learning_rate": 4.769055933557231e-05, + "loss": 1.0615, + "step": 1901 + }, + { + "epoch": 0.13994941365831226, + "grad_norm": 0.7890625, + "learning_rate": 4.7688127487068836e-05, + "loss": 1.0232, + "step": 1902 + }, + { + "epoch": 0.14002299379167624, + "grad_norm": 0.9296875, + "learning_rate": 4.768569442093641e-05, + "loss": 1.0361, + "step": 1903 + }, + { + "epoch": 0.14009657392504024, + "grad_norm": 0.8515625, + "learning_rate": 4.7683260137305615e-05, + "loss": 0.9985, + "step": 1904 + }, + { + "epoch": 0.14017015405840422, + "grad_norm": 1.0703125, + "learning_rate": 4.7680824636307086e-05, + "loss": 1.0015, + "step": 1905 + }, + { + "epoch": 0.14024373419176822, + "grad_norm": 0.86328125, + "learning_rate": 4.767838791807154e-05, + "loss": 0.8507, + "step": 1906 + }, + { + "epoch": 0.14031731432513223, + "grad_norm": 0.8828125, + "learning_rate": 4.767594998272974e-05, + "loss": 0.9166, + "step": 1907 + }, + { + "epoch": 0.1403908944584962, + "grad_norm": 0.875, + "learning_rate": 4.7673510830412546e-05, + "loss": 1.0552, + "step": 1908 + }, + { + "epoch": 0.1404644745918602, + "grad_norm": 0.8046875, + "learning_rate": 4.7671070461250846e-05, + "loss": 1.0193, + "step": 1909 + }, + { + "epoch": 0.14053805472522418, + "grad_norm": 1.0859375, + "learning_rate": 4.766862887537561e-05, + "loss": 1.2958, + "step": 1910 + }, + { + "epoch": 0.14061163485858819, + "grad_norm": 0.8671875, + "learning_rate": 4.766618607291787e-05, + "loss": 0.8358, + "step": 1911 + }, + { + "epoch": 0.14068521499195216, + "grad_norm": 0.89453125, + "learning_rate": 4.766374205400875e-05, + "loss": 1.1922, + "step": 1912 + }, + { + "epoch": 0.14075879512531617, + "grad_norm": 1.171875, + "learning_rate": 4.766129681877939e-05, + "loss": 1.3769, + "step": 1913 + }, + { + "epoch": 0.14083237525868014, + "grad_norm": 0.98828125, + "learning_rate": 4.765885036736104e-05, + "loss": 0.8358, + "step": 1914 + }, + { + "epoch": 0.14090595539204415, + "grad_norm": 1.046875, + "learning_rate": 4.765640269988497e-05, + "loss": 1.398, + "step": 1915 + }, + { + "epoch": 0.14097953552540815, + "grad_norm": 0.85546875, + "learning_rate": 4.7653953816482576e-05, + "loss": 1.0017, + "step": 1916 + }, + { + "epoch": 0.14105311565877213, + "grad_norm": 0.86328125, + "learning_rate": 4.7651503717285265e-05, + "loss": 0.8333, + "step": 1917 + }, + { + "epoch": 0.14112669579213613, + "grad_norm": 0.92578125, + "learning_rate": 4.764905240242452e-05, + "loss": 0.8153, + "step": 1918 + }, + { + "epoch": 0.1412002759255001, + "grad_norm": 0.92578125, + "learning_rate": 4.764659987203192e-05, + "loss": 1.147, + "step": 1919 + }, + { + "epoch": 0.1412738560588641, + "grad_norm": 0.84765625, + "learning_rate": 4.764414612623907e-05, + "loss": 0.7301, + "step": 1920 + }, + { + "epoch": 0.1413474361922281, + "grad_norm": 0.9375, + "learning_rate": 4.764169116517768e-05, + "loss": 0.8859, + "step": 1921 + }, + { + "epoch": 0.1414210163255921, + "grad_norm": 0.70703125, + "learning_rate": 4.763923498897948e-05, + "loss": 0.6014, + "step": 1922 + }, + { + "epoch": 0.1414945964589561, + "grad_norm": 1.3125, + "learning_rate": 4.7636777597776306e-05, + "loss": 1.2059, + "step": 1923 + }, + { + "epoch": 0.14156817659232007, + "grad_norm": 0.875, + "learning_rate": 4.763431899170002e-05, + "loss": 0.791, + "step": 1924 + }, + { + "epoch": 0.14164175672568408, + "grad_norm": 0.80859375, + "learning_rate": 4.76318591708826e-05, + "loss": 0.7207, + "step": 1925 + }, + { + "epoch": 0.14171533685904805, + "grad_norm": 0.80078125, + "learning_rate": 4.7629398135456035e-05, + "loss": 0.8129, + "step": 1926 + }, + { + "epoch": 0.14178891699241206, + "grad_norm": 0.890625, + "learning_rate": 4.7626935885552406e-05, + "loss": 0.8824, + "step": 1927 + }, + { + "epoch": 0.14186249712577603, + "grad_norm": 1.25, + "learning_rate": 4.7624472421303876e-05, + "loss": 1.1402, + "step": 1928 + }, + { + "epoch": 0.14193607725914004, + "grad_norm": 0.77734375, + "learning_rate": 4.7622007742842635e-05, + "loss": 0.9439, + "step": 1929 + }, + { + "epoch": 0.142009657392504, + "grad_norm": 0.97265625, + "learning_rate": 4.7619541850300976e-05, + "loss": 0.9133, + "step": 1930 + }, + { + "epoch": 0.14208323752586802, + "grad_norm": 1.1875, + "learning_rate": 4.761707474381122e-05, + "loss": 1.0942, + "step": 1931 + }, + { + "epoch": 0.14215681765923202, + "grad_norm": 1.0625, + "learning_rate": 4.7614606423505773e-05, + "loss": 1.0403, + "step": 1932 + }, + { + "epoch": 0.142230397792596, + "grad_norm": 0.8671875, + "learning_rate": 4.761213688951712e-05, + "loss": 0.9698, + "step": 1933 + }, + { + "epoch": 0.14230397792596, + "grad_norm": 0.8125, + "learning_rate": 4.760966614197779e-05, + "loss": 0.8362, + "step": 1934 + }, + { + "epoch": 0.14237755805932398, + "grad_norm": 1.0625, + "learning_rate": 4.760719418102038e-05, + "loss": 1.0727, + "step": 1935 + }, + { + "epoch": 0.14245113819268798, + "grad_norm": 0.79296875, + "learning_rate": 4.760472100677755e-05, + "loss": 0.9647, + "step": 1936 + }, + { + "epoch": 0.14252471832605196, + "grad_norm": 0.9453125, + "learning_rate": 4.760224661938205e-05, + "loss": 0.9848, + "step": 1937 + }, + { + "epoch": 0.14259829845941596, + "grad_norm": 0.9765625, + "learning_rate": 4.759977101896666e-05, + "loss": 1.0689, + "step": 1938 + }, + { + "epoch": 0.14267187859277994, + "grad_norm": 0.93359375, + "learning_rate": 4.7597294205664244e-05, + "loss": 1.0485, + "step": 1939 + }, + { + "epoch": 0.14274545872614394, + "grad_norm": 0.91015625, + "learning_rate": 4.759481617960772e-05, + "loss": 1.0691, + "step": 1940 + }, + { + "epoch": 0.14281903885950795, + "grad_norm": 0.80859375, + "learning_rate": 4.75923369409301e-05, + "loss": 0.9305, + "step": 1941 + }, + { + "epoch": 0.14289261899287192, + "grad_norm": 1.1328125, + "learning_rate": 4.7589856489764414e-05, + "loss": 1.23, + "step": 1942 + }, + { + "epoch": 0.14296619912623593, + "grad_norm": 1.125, + "learning_rate": 4.758737482624381e-05, + "loss": 1.4511, + "step": 1943 + }, + { + "epoch": 0.1430397792595999, + "grad_norm": 1.046875, + "learning_rate": 4.758489195050145e-05, + "loss": 0.85, + "step": 1944 + }, + { + "epoch": 0.1431133593929639, + "grad_norm": 0.86328125, + "learning_rate": 4.758240786267061e-05, + "loss": 0.8774, + "step": 1945 + }, + { + "epoch": 0.14318693952632788, + "grad_norm": 0.84375, + "learning_rate": 4.757992256288458e-05, + "loss": 0.9928, + "step": 1946 + }, + { + "epoch": 0.1432605196596919, + "grad_norm": 0.828125, + "learning_rate": 4.7577436051276754e-05, + "loss": 1.0241, + "step": 1947 + }, + { + "epoch": 0.14333409979305586, + "grad_norm": 1.1640625, + "learning_rate": 4.757494832798057e-05, + "loss": 0.9499, + "step": 1948 + }, + { + "epoch": 0.14340767992641987, + "grad_norm": 0.9921875, + "learning_rate": 4.7572459393129567e-05, + "loss": 1.0906, + "step": 1949 + }, + { + "epoch": 0.14348126005978387, + "grad_norm": 0.921875, + "learning_rate": 4.7569969246857285e-05, + "loss": 0.8805, + "step": 1950 + }, + { + "epoch": 0.14355484019314785, + "grad_norm": 0.78515625, + "learning_rate": 4.756747788929739e-05, + "loss": 0.9583, + "step": 1951 + }, + { + "epoch": 0.14362842032651185, + "grad_norm": 1.0, + "learning_rate": 4.7564985320583575e-05, + "loss": 1.0505, + "step": 1952 + }, + { + "epoch": 0.14370200045987583, + "grad_norm": 0.76171875, + "learning_rate": 4.756249154084963e-05, + "loss": 0.9484, + "step": 1953 + }, + { + "epoch": 0.14377558059323983, + "grad_norm": 0.91015625, + "learning_rate": 4.755999655022937e-05, + "loss": 1.1815, + "step": 1954 + }, + { + "epoch": 0.1438491607266038, + "grad_norm": 2.46875, + "learning_rate": 4.75575003488567e-05, + "loss": 0.9117, + "step": 1955 + }, + { + "epoch": 0.1439227408599678, + "grad_norm": 0.703125, + "learning_rate": 4.75550029368656e-05, + "loss": 0.7588, + "step": 1956 + }, + { + "epoch": 0.1439963209933318, + "grad_norm": 0.828125, + "learning_rate": 4.755250431439009e-05, + "loss": 0.6512, + "step": 1957 + }, + { + "epoch": 0.1440699011266958, + "grad_norm": 0.96875, + "learning_rate": 4.7550004481564266e-05, + "loss": 1.0559, + "step": 1958 + }, + { + "epoch": 0.1441434812600598, + "grad_norm": 0.9375, + "learning_rate": 4.754750343852229e-05, + "loss": 0.932, + "step": 1959 + }, + { + "epoch": 0.14421706139342377, + "grad_norm": 0.8515625, + "learning_rate": 4.75450011853984e-05, + "loss": 0.8623, + "step": 1960 + }, + { + "epoch": 0.14429064152678778, + "grad_norm": 0.9140625, + "learning_rate": 4.7542497722326874e-05, + "loss": 0.6626, + "step": 1961 + }, + { + "epoch": 0.14436422166015175, + "grad_norm": 1.0546875, + "learning_rate": 4.753999304944207e-05, + "loss": 1.1838, + "step": 1962 + }, + { + "epoch": 0.14443780179351576, + "grad_norm": 1.0546875, + "learning_rate": 4.753748716687841e-05, + "loss": 0.9884, + "step": 1963 + }, + { + "epoch": 0.14451138192687973, + "grad_norm": 0.75, + "learning_rate": 4.753498007477038e-05, + "loss": 0.8274, + "step": 1964 + }, + { + "epoch": 0.14458496206024374, + "grad_norm": 0.98046875, + "learning_rate": 4.7532471773252535e-05, + "loss": 1.5581, + "step": 1965 + }, + { + "epoch": 0.14465854219360771, + "grad_norm": 0.8125, + "learning_rate": 4.7529962262459484e-05, + "loss": 0.856, + "step": 1966 + }, + { + "epoch": 0.14473212232697172, + "grad_norm": 0.82421875, + "learning_rate": 4.752745154252591e-05, + "loss": 0.8135, + "step": 1967 + }, + { + "epoch": 0.14480570246033572, + "grad_norm": 1.1171875, + "learning_rate": 4.752493961358657e-05, + "loss": 1.2011, + "step": 1968 + }, + { + "epoch": 0.1448792825936997, + "grad_norm": 0.90234375, + "learning_rate": 4.752242647577626e-05, + "loss": 0.7709, + "step": 1969 + }, + { + "epoch": 0.1449528627270637, + "grad_norm": 0.9140625, + "learning_rate": 4.751991212922986e-05, + "loss": 0.8145, + "step": 1970 + }, + { + "epoch": 0.14502644286042768, + "grad_norm": 0.99609375, + "learning_rate": 4.751739657408231e-05, + "loss": 1.091, + "step": 1971 + }, + { + "epoch": 0.14510002299379168, + "grad_norm": 0.9296875, + "learning_rate": 4.751487981046861e-05, + "loss": 1.2284, + "step": 1972 + }, + { + "epoch": 0.14517360312715566, + "grad_norm": 2.59375, + "learning_rate": 4.751236183852385e-05, + "loss": 1.9465, + "step": 1973 + }, + { + "epoch": 0.14524718326051966, + "grad_norm": 0.8125, + "learning_rate": 4.750984265838313e-05, + "loss": 0.9955, + "step": 1974 + }, + { + "epoch": 0.14532076339388364, + "grad_norm": 0.828125, + "learning_rate": 4.750732227018168e-05, + "loss": 1.0679, + "step": 1975 + }, + { + "epoch": 0.14539434352724764, + "grad_norm": 1.0078125, + "learning_rate": 4.750480067405476e-05, + "loss": 1.0622, + "step": 1976 + }, + { + "epoch": 0.14546792366061165, + "grad_norm": 1.0234375, + "learning_rate": 4.750227787013768e-05, + "loss": 1.7516, + "step": 1977 + }, + { + "epoch": 0.14554150379397562, + "grad_norm": 0.8046875, + "learning_rate": 4.749975385856586e-05, + "loss": 0.8107, + "step": 1978 + }, + { + "epoch": 0.14561508392733963, + "grad_norm": 0.76953125, + "learning_rate": 4.749722863947473e-05, + "loss": 0.5357, + "step": 1979 + }, + { + "epoch": 0.1456886640607036, + "grad_norm": 0.7578125, + "learning_rate": 4.749470221299984e-05, + "loss": 0.8121, + "step": 1980 + }, + { + "epoch": 0.1457622441940676, + "grad_norm": 0.8984375, + "learning_rate": 4.7492174579276774e-05, + "loss": 0.9148, + "step": 1981 + }, + { + "epoch": 0.14583582432743158, + "grad_norm": 1.0859375, + "learning_rate": 4.748964573844118e-05, + "loss": 1.3749, + "step": 1982 + }, + { + "epoch": 0.1459094044607956, + "grad_norm": 0.82421875, + "learning_rate": 4.748711569062877e-05, + "loss": 0.7866, + "step": 1983 + }, + { + "epoch": 0.14598298459415956, + "grad_norm": 0.828125, + "learning_rate": 4.748458443597533e-05, + "loss": 0.9804, + "step": 1984 + }, + { + "epoch": 0.14605656472752357, + "grad_norm": 0.8671875, + "learning_rate": 4.748205197461671e-05, + "loss": 0.8009, + "step": 1985 + }, + { + "epoch": 0.14613014486088757, + "grad_norm": 0.72265625, + "learning_rate": 4.747951830668884e-05, + "loss": 1.013, + "step": 1986 + }, + { + "epoch": 0.14620372499425155, + "grad_norm": 0.92578125, + "learning_rate": 4.747698343232766e-05, + "loss": 1.1268, + "step": 1987 + }, + { + "epoch": 0.14627730512761555, + "grad_norm": 0.8671875, + "learning_rate": 4.7474447351669244e-05, + "loss": 1.0835, + "step": 1988 + }, + { + "epoch": 0.14635088526097953, + "grad_norm": 0.94921875, + "learning_rate": 4.7471910064849685e-05, + "loss": 1.0355, + "step": 1989 + }, + { + "epoch": 0.14642446539434353, + "grad_norm": 1.0546875, + "learning_rate": 4.746937157200515e-05, + "loss": 1.3579, + "step": 1990 + }, + { + "epoch": 0.1464980455277075, + "grad_norm": 0.9296875, + "learning_rate": 4.746683187327189e-05, + "loss": 1.1034, + "step": 1991 + }, + { + "epoch": 0.14657162566107151, + "grad_norm": 0.76953125, + "learning_rate": 4.746429096878619e-05, + "loss": 0.7161, + "step": 1992 + }, + { + "epoch": 0.1466452057944355, + "grad_norm": 0.8828125, + "learning_rate": 4.746174885868443e-05, + "loss": 1.0823, + "step": 1993 + }, + { + "epoch": 0.1467187859277995, + "grad_norm": 0.7578125, + "learning_rate": 4.7459205543103026e-05, + "loss": 0.7984, + "step": 1994 + }, + { + "epoch": 0.1467923660611635, + "grad_norm": 0.8671875, + "learning_rate": 4.745666102217848e-05, + "loss": 0.8845, + "step": 1995 + }, + { + "epoch": 0.14686594619452747, + "grad_norm": 1.046875, + "learning_rate": 4.745411529604736e-05, + "loss": 1.076, + "step": 1996 + }, + { + "epoch": 0.14693952632789148, + "grad_norm": 0.96484375, + "learning_rate": 4.745156836484627e-05, + "loss": 1.0053, + "step": 1997 + }, + { + "epoch": 0.14701310646125546, + "grad_norm": 0.8203125, + "learning_rate": 4.744902022871192e-05, + "loss": 0.7948, + "step": 1998 + }, + { + "epoch": 0.14708668659461946, + "grad_norm": 0.90234375, + "learning_rate": 4.744647088778105e-05, + "loss": 1.0067, + "step": 1999 + }, + { + "epoch": 0.14716026672798344, + "grad_norm": 0.859375, + "learning_rate": 4.7443920342190485e-05, + "loss": 1.0513, + "step": 2000 + }, + { + "epoch": 0.14723384686134744, + "grad_norm": 0.80078125, + "learning_rate": 4.744136859207711e-05, + "loss": 0.7567, + "step": 2001 + }, + { + "epoch": 0.14730742699471142, + "grad_norm": 0.96484375, + "learning_rate": 4.743881563757786e-05, + "loss": 0.9063, + "step": 2002 + }, + { + "epoch": 0.14738100712807542, + "grad_norm": 1.0390625, + "learning_rate": 4.7436261478829756e-05, + "loss": 1.4177, + "step": 2003 + }, + { + "epoch": 0.14745458726143942, + "grad_norm": 0.72265625, + "learning_rate": 4.743370611596988e-05, + "loss": 0.6505, + "step": 2004 + }, + { + "epoch": 0.1475281673948034, + "grad_norm": 0.8984375, + "learning_rate": 4.7431149549135364e-05, + "loss": 0.6567, + "step": 2005 + }, + { + "epoch": 0.1476017475281674, + "grad_norm": 0.80078125, + "learning_rate": 4.742859177846342e-05, + "loss": 0.9314, + "step": 2006 + }, + { + "epoch": 0.14767532766153138, + "grad_norm": 0.80859375, + "learning_rate": 4.7426032804091315e-05, + "loss": 0.8187, + "step": 2007 + }, + { + "epoch": 0.14774890779489538, + "grad_norm": 0.90234375, + "learning_rate": 4.742347262615639e-05, + "loss": 1.0127, + "step": 2008 + }, + { + "epoch": 0.14782248792825936, + "grad_norm": 1.5390625, + "learning_rate": 4.742091124479604e-05, + "loss": 0.562, + "step": 2009 + }, + { + "epoch": 0.14789606806162336, + "grad_norm": 0.99609375, + "learning_rate": 4.741834866014773e-05, + "loss": 1.4887, + "step": 2010 + }, + { + "epoch": 0.14796964819498734, + "grad_norm": 0.83984375, + "learning_rate": 4.741578487234899e-05, + "loss": 1.1488, + "step": 2011 + }, + { + "epoch": 0.14804322832835135, + "grad_norm": 1.109375, + "learning_rate": 4.7413219881537404e-05, + "loss": 1.2171, + "step": 2012 + }, + { + "epoch": 0.14811680846171535, + "grad_norm": 1.0078125, + "learning_rate": 4.741065368785066e-05, + "loss": 1.8913, + "step": 2013 + }, + { + "epoch": 0.14819038859507933, + "grad_norm": 0.78515625, + "learning_rate": 4.740808629142645e-05, + "loss": 0.7099, + "step": 2014 + }, + { + "epoch": 0.14826396872844333, + "grad_norm": 0.8515625, + "learning_rate": 4.740551769240256e-05, + "loss": 1.0766, + "step": 2015 + }, + { + "epoch": 0.1483375488618073, + "grad_norm": 0.734375, + "learning_rate": 4.7402947890916865e-05, + "loss": 0.7768, + "step": 2016 + }, + { + "epoch": 0.1484111289951713, + "grad_norm": 1.2421875, + "learning_rate": 4.740037688710727e-05, + "loss": 1.2981, + "step": 2017 + }, + { + "epoch": 0.14848470912853529, + "grad_norm": 0.81640625, + "learning_rate": 4.739780468111175e-05, + "loss": 1.0663, + "step": 2018 + }, + { + "epoch": 0.1485582892618993, + "grad_norm": 0.96875, + "learning_rate": 4.739523127306837e-05, + "loss": 0.8822, + "step": 2019 + }, + { + "epoch": 0.14863186939526327, + "grad_norm": 0.7265625, + "learning_rate": 4.739265666311521e-05, + "loss": 0.7141, + "step": 2020 + }, + { + "epoch": 0.14870544952862727, + "grad_norm": 2.453125, + "learning_rate": 4.739008085139046e-05, + "loss": 1.0347, + "step": 2021 + }, + { + "epoch": 0.14877902966199127, + "grad_norm": 0.796875, + "learning_rate": 4.738750383803237e-05, + "loss": 1.0854, + "step": 2022 + }, + { + "epoch": 0.14885260979535525, + "grad_norm": 0.81640625, + "learning_rate": 4.738492562317923e-05, + "loss": 0.8448, + "step": 2023 + }, + { + "epoch": 0.14892618992871925, + "grad_norm": 0.8125, + "learning_rate": 4.7382346206969405e-05, + "loss": 1.1947, + "step": 2024 + }, + { + "epoch": 0.14899977006208323, + "grad_norm": 0.828125, + "learning_rate": 4.737976558954135e-05, + "loss": 0.8409, + "step": 2025 + }, + { + "epoch": 0.14907335019544724, + "grad_norm": 0.84765625, + "learning_rate": 4.737718377103353e-05, + "loss": 0.8744, + "step": 2026 + }, + { + "epoch": 0.1491469303288112, + "grad_norm": 2.09375, + "learning_rate": 4.737460075158452e-05, + "loss": 0.8834, + "step": 2027 + }, + { + "epoch": 0.14922051046217522, + "grad_norm": 0.796875, + "learning_rate": 4.737201653133295e-05, + "loss": 0.7441, + "step": 2028 + }, + { + "epoch": 0.1492940905955392, + "grad_norm": 1.078125, + "learning_rate": 4.736943111041752e-05, + "loss": 1.0531, + "step": 2029 + }, + { + "epoch": 0.1493676707289032, + "grad_norm": 0.91015625, + "learning_rate": 4.736684448897696e-05, + "loss": 1.1605, + "step": 2030 + }, + { + "epoch": 0.1494412508622672, + "grad_norm": 0.8828125, + "learning_rate": 4.7364256667150095e-05, + "loss": 0.8815, + "step": 2031 + }, + { + "epoch": 0.14951483099563118, + "grad_norm": 1.0859375, + "learning_rate": 4.736166764507583e-05, + "loss": 1.0073, + "step": 2032 + }, + { + "epoch": 0.14958841112899518, + "grad_norm": 0.84765625, + "learning_rate": 4.7359077422893094e-05, + "loss": 0.9168, + "step": 2033 + }, + { + "epoch": 0.14966199126235916, + "grad_norm": 0.92578125, + "learning_rate": 4.73564860007409e-05, + "loss": 1.0928, + "step": 2034 + }, + { + "epoch": 0.14973557139572316, + "grad_norm": 0.796875, + "learning_rate": 4.7353893378758326e-05, + "loss": 0.9621, + "step": 2035 + }, + { + "epoch": 0.14980915152908714, + "grad_norm": 0.984375, + "learning_rate": 4.7351299557084515e-05, + "loss": 1.1565, + "step": 2036 + }, + { + "epoch": 0.14988273166245114, + "grad_norm": 2.28125, + "learning_rate": 4.734870453585867e-05, + "loss": 0.9596, + "step": 2037 + }, + { + "epoch": 0.14995631179581512, + "grad_norm": 0.8359375, + "learning_rate": 4.734610831522007e-05, + "loss": 0.8283, + "step": 2038 + }, + { + "epoch": 0.15002989192917912, + "grad_norm": 0.67578125, + "learning_rate": 4.7343510895308044e-05, + "loss": 0.6312, + "step": 2039 + }, + { + "epoch": 0.15010347206254313, + "grad_norm": 1.0078125, + "learning_rate": 4.734091227626198e-05, + "loss": 1.0671, + "step": 2040 + }, + { + "epoch": 0.1501770521959071, + "grad_norm": 1.015625, + "learning_rate": 4.733831245822136e-05, + "loss": 1.1031, + "step": 2041 + }, + { + "epoch": 0.1502506323292711, + "grad_norm": 0.8203125, + "learning_rate": 4.733571144132569e-05, + "loss": 0.8168, + "step": 2042 + }, + { + "epoch": 0.15032421246263508, + "grad_norm": 0.90234375, + "learning_rate": 4.733310922571458e-05, + "loss": 1.0221, + "step": 2043 + }, + { + "epoch": 0.15039779259599909, + "grad_norm": 0.80859375, + "learning_rate": 4.733050581152768e-05, + "loss": 0.8938, + "step": 2044 + }, + { + "epoch": 0.15047137272936306, + "grad_norm": 0.96484375, + "learning_rate": 4.732790119890471e-05, + "loss": 1.0355, + "step": 2045 + }, + { + "epoch": 0.15054495286272707, + "grad_norm": 0.78125, + "learning_rate": 4.732529538798545e-05, + "loss": 0.6696, + "step": 2046 + }, + { + "epoch": 0.15061853299609104, + "grad_norm": 0.87109375, + "learning_rate": 4.7322688378909754e-05, + "loss": 0.8844, + "step": 2047 + }, + { + "epoch": 0.15069211312945505, + "grad_norm": 0.796875, + "learning_rate": 4.7320080171817536e-05, + "loss": 0.9323, + "step": 2048 + }, + { + "epoch": 0.15076569326281905, + "grad_norm": 0.90625, + "learning_rate": 4.731747076684877e-05, + "loss": 1.1546, + "step": 2049 + }, + { + "epoch": 0.15083927339618303, + "grad_norm": 2.0625, + "learning_rate": 4.731486016414351e-05, + "loss": 0.8309, + "step": 2050 + }, + { + "epoch": 0.15091285352954703, + "grad_norm": 1.0703125, + "learning_rate": 4.731224836384184e-05, + "loss": 1.1214, + "step": 2051 + }, + { + "epoch": 0.150986433662911, + "grad_norm": 0.82421875, + "learning_rate": 4.730963536608394e-05, + "loss": 1.1951, + "step": 2052 + }, + { + "epoch": 0.151060013796275, + "grad_norm": 0.98046875, + "learning_rate": 4.7307021171010054e-05, + "loss": 0.8857, + "step": 2053 + }, + { + "epoch": 0.151133593929639, + "grad_norm": 1.046875, + "learning_rate": 4.7304405778760466e-05, + "loss": 1.3764, + "step": 2054 + }, + { + "epoch": 0.151207174063003, + "grad_norm": 0.9453125, + "learning_rate": 4.7301789189475556e-05, + "loss": 1.0744, + "step": 2055 + }, + { + "epoch": 0.15128075419636697, + "grad_norm": 1.140625, + "learning_rate": 4.729917140329574e-05, + "loss": 1.1497, + "step": 2056 + }, + { + "epoch": 0.15135433432973097, + "grad_norm": 0.95703125, + "learning_rate": 4.7296552420361505e-05, + "loss": 1.4336, + "step": 2057 + }, + { + "epoch": 0.15142791446309498, + "grad_norm": 0.9296875, + "learning_rate": 4.7293932240813424e-05, + "loss": 0.8087, + "step": 2058 + }, + { + "epoch": 0.15150149459645895, + "grad_norm": 0.75390625, + "learning_rate": 4.7291310864792104e-05, + "loss": 0.7935, + "step": 2059 + }, + { + "epoch": 0.15157507472982296, + "grad_norm": 0.79296875, + "learning_rate": 4.728868829243823e-05, + "loss": 0.5489, + "step": 2060 + }, + { + "epoch": 0.15164865486318693, + "grad_norm": 1.0625, + "learning_rate": 4.728606452389255e-05, + "loss": 0.9969, + "step": 2061 + }, + { + "epoch": 0.15172223499655094, + "grad_norm": 0.84375, + "learning_rate": 4.7283439559295884e-05, + "loss": 0.8286, + "step": 2062 + }, + { + "epoch": 0.1517958151299149, + "grad_norm": 0.9140625, + "learning_rate": 4.72808133987891e-05, + "loss": 0.9153, + "step": 2063 + }, + { + "epoch": 0.15186939526327892, + "grad_norm": 0.8671875, + "learning_rate": 4.727818604251315e-05, + "loss": 0.955, + "step": 2064 + }, + { + "epoch": 0.1519429753966429, + "grad_norm": 0.9375, + "learning_rate": 4.7275557490609026e-05, + "loss": 0.8011, + "step": 2065 + }, + { + "epoch": 0.1520165555300069, + "grad_norm": 0.859375, + "learning_rate": 4.727292774321781e-05, + "loss": 1.0804, + "step": 2066 + }, + { + "epoch": 0.1520901356633709, + "grad_norm": 0.79296875, + "learning_rate": 4.727029680048063e-05, + "loss": 0.7592, + "step": 2067 + }, + { + "epoch": 0.15216371579673488, + "grad_norm": 0.828125, + "learning_rate": 4.726766466253867e-05, + "loss": 1.0333, + "step": 2068 + }, + { + "epoch": 0.15223729593009888, + "grad_norm": 1.0234375, + "learning_rate": 4.7265031329533215e-05, + "loss": 1.2509, + "step": 2069 + }, + { + "epoch": 0.15231087606346286, + "grad_norm": 0.8984375, + "learning_rate": 4.726239680160559e-05, + "loss": 1.1099, + "step": 2070 + }, + { + "epoch": 0.15238445619682686, + "grad_norm": 0.875, + "learning_rate": 4.7259761078897166e-05, + "loss": 0.9803, + "step": 2071 + }, + { + "epoch": 0.15245803633019084, + "grad_norm": 0.8671875, + "learning_rate": 4.72571241615494e-05, + "loss": 0.9686, + "step": 2072 + }, + { + "epoch": 0.15253161646355484, + "grad_norm": 0.8125, + "learning_rate": 4.7254486049703834e-05, + "loss": 0.9403, + "step": 2073 + }, + { + "epoch": 0.15260519659691882, + "grad_norm": 0.80078125, + "learning_rate": 4.725184674350203e-05, + "loss": 1.0394, + "step": 2074 + }, + { + "epoch": 0.15267877673028282, + "grad_norm": 0.90625, + "learning_rate": 4.724920624308563e-05, + "loss": 0.6761, + "step": 2075 + }, + { + "epoch": 0.15275235686364683, + "grad_norm": 0.8046875, + "learning_rate": 4.724656454859636e-05, + "loss": 1.0326, + "step": 2076 + }, + { + "epoch": 0.1528259369970108, + "grad_norm": 0.7734375, + "learning_rate": 4.7243921660175996e-05, + "loss": 0.7864, + "step": 2077 + }, + { + "epoch": 0.1528995171303748, + "grad_norm": 1.0625, + "learning_rate": 4.724127757796636e-05, + "loss": 0.9623, + "step": 2078 + }, + { + "epoch": 0.15297309726373878, + "grad_norm": 0.99609375, + "learning_rate": 4.7238632302109364e-05, + "loss": 1.1846, + "step": 2079 + }, + { + "epoch": 0.1530466773971028, + "grad_norm": 0.7421875, + "learning_rate": 4.7235985832746976e-05, + "loss": 0.8224, + "step": 2080 + }, + { + "epoch": 0.15312025753046676, + "grad_norm": 0.953125, + "learning_rate": 4.723333817002123e-05, + "loss": 0.9363, + "step": 2081 + }, + { + "epoch": 0.15319383766383077, + "grad_norm": 0.84375, + "learning_rate": 4.723068931407422e-05, + "loss": 0.9264, + "step": 2082 + }, + { + "epoch": 0.15326741779719474, + "grad_norm": 0.81640625, + "learning_rate": 4.72280392650481e-05, + "loss": 1.0241, + "step": 2083 + }, + { + "epoch": 0.15334099793055875, + "grad_norm": 1.1640625, + "learning_rate": 4.722538802308508e-05, + "loss": 1.3347, + "step": 2084 + }, + { + "epoch": 0.15341457806392275, + "grad_norm": 0.99609375, + "learning_rate": 4.722273558832748e-05, + "loss": 1.1008, + "step": 2085 + }, + { + "epoch": 0.15348815819728673, + "grad_norm": 0.79296875, + "learning_rate": 4.7220081960917625e-05, + "loss": 0.9284, + "step": 2086 + }, + { + "epoch": 0.15356173833065073, + "grad_norm": 1.0859375, + "learning_rate": 4.721742714099795e-05, + "loss": 1.1102, + "step": 2087 + }, + { + "epoch": 0.1536353184640147, + "grad_norm": 1.5859375, + "learning_rate": 4.721477112871091e-05, + "loss": 1.1702, + "step": 2088 + }, + { + "epoch": 0.1537088985973787, + "grad_norm": 0.86328125, + "learning_rate": 4.721211392419907e-05, + "loss": 0.9287, + "step": 2089 + }, + { + "epoch": 0.1537824787307427, + "grad_norm": 0.8828125, + "learning_rate": 4.720945552760503e-05, + "loss": 1.1566, + "step": 2090 + }, + { + "epoch": 0.1538560588641067, + "grad_norm": 1.25, + "learning_rate": 4.720679593907145e-05, + "loss": 1.5028, + "step": 2091 + }, + { + "epoch": 0.15392963899747067, + "grad_norm": 0.7890625, + "learning_rate": 4.720413515874108e-05, + "loss": 0.6543, + "step": 2092 + }, + { + "epoch": 0.15400321913083467, + "grad_norm": 0.94921875, + "learning_rate": 4.7201473186756716e-05, + "loss": 0.9032, + "step": 2093 + }, + { + "epoch": 0.15407679926419868, + "grad_norm": 0.93359375, + "learning_rate": 4.719881002326121e-05, + "loss": 0.7149, + "step": 2094 + }, + { + "epoch": 0.15415037939756265, + "grad_norm": 0.77734375, + "learning_rate": 4.7196145668397504e-05, + "loss": 0.7797, + "step": 2095 + }, + { + "epoch": 0.15422395953092666, + "grad_norm": 0.77734375, + "learning_rate": 4.719348012230859e-05, + "loss": 0.6621, + "step": 2096 + }, + { + "epoch": 0.15429753966429063, + "grad_norm": 0.890625, + "learning_rate": 4.7190813385137503e-05, + "loss": 1.0652, + "step": 2097 + }, + { + "epoch": 0.15437111979765464, + "grad_norm": 0.97265625, + "learning_rate": 4.718814545702738e-05, + "loss": 0.9207, + "step": 2098 + }, + { + "epoch": 0.15444469993101861, + "grad_norm": 0.8046875, + "learning_rate": 4.7185476338121395e-05, + "loss": 1.0707, + "step": 2099 + }, + { + "epoch": 0.15451828006438262, + "grad_norm": 0.8828125, + "learning_rate": 4.71828060285628e-05, + "loss": 0.8185, + "step": 2100 + }, + { + "epoch": 0.15459186019774662, + "grad_norm": 1.03125, + "learning_rate": 4.7180134528494903e-05, + "loss": 1.5909, + "step": 2101 + }, + { + "epoch": 0.1546654403311106, + "grad_norm": 0.90625, + "learning_rate": 4.717746183806108e-05, + "loss": 0.7151, + "step": 2102 + }, + { + "epoch": 0.1547390204644746, + "grad_norm": 0.85546875, + "learning_rate": 4.717478795740477e-05, + "loss": 0.6298, + "step": 2103 + }, + { + "epoch": 0.15481260059783858, + "grad_norm": 0.9921875, + "learning_rate": 4.717211288666946e-05, + "loss": 1.4797, + "step": 2104 + }, + { + "epoch": 0.15488618073120258, + "grad_norm": 1.5078125, + "learning_rate": 4.716943662599873e-05, + "loss": 0.8167, + "step": 2105 + }, + { + "epoch": 0.15495976086456656, + "grad_norm": 0.79296875, + "learning_rate": 4.716675917553622e-05, + "loss": 0.8225, + "step": 2106 + }, + { + "epoch": 0.15503334099793056, + "grad_norm": 0.7421875, + "learning_rate": 4.71640805354256e-05, + "loss": 0.7856, + "step": 2107 + }, + { + "epoch": 0.15510692113129454, + "grad_norm": 0.7578125, + "learning_rate": 4.7161400705810646e-05, + "loss": 0.7681, + "step": 2108 + }, + { + "epoch": 0.15518050126465854, + "grad_norm": 0.80078125, + "learning_rate": 4.7158719686835176e-05, + "loss": 1.1255, + "step": 2109 + }, + { + "epoch": 0.15525408139802255, + "grad_norm": 0.75390625, + "learning_rate": 4.715603747864307e-05, + "loss": 0.6637, + "step": 2110 + }, + { + "epoch": 0.15532766153138652, + "grad_norm": 0.83203125, + "learning_rate": 4.715335408137827e-05, + "loss": 0.8472, + "step": 2111 + }, + { + "epoch": 0.15540124166475053, + "grad_norm": 0.92578125, + "learning_rate": 4.715066949518481e-05, + "loss": 0.8604, + "step": 2112 + }, + { + "epoch": 0.1554748217981145, + "grad_norm": 0.9140625, + "learning_rate": 4.7147983720206755e-05, + "loss": 0.9236, + "step": 2113 + }, + { + "epoch": 0.1555484019314785, + "grad_norm": 1.0625, + "learning_rate": 4.714529675658824e-05, + "loss": 1.0879, + "step": 2114 + }, + { + "epoch": 0.15562198206484248, + "grad_norm": 0.765625, + "learning_rate": 4.714260860447348e-05, + "loss": 0.7562, + "step": 2115 + }, + { + "epoch": 0.1556955621982065, + "grad_norm": 0.875, + "learning_rate": 4.713991926400673e-05, + "loss": 1.0826, + "step": 2116 + }, + { + "epoch": 0.15576914233157046, + "grad_norm": 0.90234375, + "learning_rate": 4.713722873533234e-05, + "loss": 1.4371, + "step": 2117 + }, + { + "epoch": 0.15584272246493447, + "grad_norm": 1.015625, + "learning_rate": 4.713453701859468e-05, + "loss": 1.0721, + "step": 2118 + }, + { + "epoch": 0.15591630259829847, + "grad_norm": 1.0546875, + "learning_rate": 4.713184411393824e-05, + "loss": 0.9858, + "step": 2119 + }, + { + "epoch": 0.15598988273166245, + "grad_norm": 0.94921875, + "learning_rate": 4.712915002150752e-05, + "loss": 1.2844, + "step": 2120 + }, + { + "epoch": 0.15606346286502645, + "grad_norm": 0.92578125, + "learning_rate": 4.712645474144711e-05, + "loss": 0.9151, + "step": 2121 + }, + { + "epoch": 0.15613704299839043, + "grad_norm": 0.9609375, + "learning_rate": 4.712375827390167e-05, + "loss": 1.0682, + "step": 2122 + }, + { + "epoch": 0.15621062313175443, + "grad_norm": 1.140625, + "learning_rate": 4.712106061901591e-05, + "loss": 1.1453, + "step": 2123 + }, + { + "epoch": 0.1562842032651184, + "grad_norm": 0.83984375, + "learning_rate": 4.711836177693461e-05, + "loss": 0.9225, + "step": 2124 + }, + { + "epoch": 0.15635778339848241, + "grad_norm": 0.85546875, + "learning_rate": 4.7115661747802604e-05, + "loss": 1.0966, + "step": 2125 + }, + { + "epoch": 0.1564313635318464, + "grad_norm": 0.79296875, + "learning_rate": 4.71129605317648e-05, + "loss": 0.9089, + "step": 2126 + }, + { + "epoch": 0.1565049436652104, + "grad_norm": 0.70703125, + "learning_rate": 4.711025812896618e-05, + "loss": 0.6613, + "step": 2127 + }, + { + "epoch": 0.1565785237985744, + "grad_norm": 0.73046875, + "learning_rate": 4.710755453955176e-05, + "loss": 0.9499, + "step": 2128 + }, + { + "epoch": 0.15665210393193837, + "grad_norm": 0.86328125, + "learning_rate": 4.710484976366664e-05, + "loss": 0.948, + "step": 2129 + }, + { + "epoch": 0.15672568406530238, + "grad_norm": 0.71875, + "learning_rate": 4.710214380145599e-05, + "loss": 0.7208, + "step": 2130 + }, + { + "epoch": 0.15679926419866635, + "grad_norm": 0.73828125, + "learning_rate": 4.709943665306502e-05, + "loss": 0.6862, + "step": 2131 + }, + { + "epoch": 0.15687284433203036, + "grad_norm": 0.86328125, + "learning_rate": 4.7096728318639025e-05, + "loss": 1.1438, + "step": 2132 + }, + { + "epoch": 0.15694642446539434, + "grad_norm": 1.3828125, + "learning_rate": 4.7094018798323365e-05, + "loss": 1.1915, + "step": 2133 + }, + { + "epoch": 0.15702000459875834, + "grad_norm": 0.859375, + "learning_rate": 4.709130809226344e-05, + "loss": 1.2771, + "step": 2134 + }, + { + "epoch": 0.15709358473212232, + "grad_norm": 0.73828125, + "learning_rate": 4.7088596200604735e-05, + "loss": 0.7715, + "step": 2135 + }, + { + "epoch": 0.15716716486548632, + "grad_norm": 1.015625, + "learning_rate": 4.708588312349279e-05, + "loss": 1.2178, + "step": 2136 + }, + { + "epoch": 0.15724074499885032, + "grad_norm": 0.8671875, + "learning_rate": 4.708316886107321e-05, + "loss": 0.8211, + "step": 2137 + }, + { + "epoch": 0.1573143251322143, + "grad_norm": 0.859375, + "learning_rate": 4.708045341349168e-05, + "loss": 0.8783, + "step": 2138 + }, + { + "epoch": 0.1573879052655783, + "grad_norm": 0.79296875, + "learning_rate": 4.70777367808939e-05, + "loss": 0.951, + "step": 2139 + }, + { + "epoch": 0.15746148539894228, + "grad_norm": 0.73046875, + "learning_rate": 4.70750189634257e-05, + "loss": 0.792, + "step": 2140 + }, + { + "epoch": 0.15753506553230628, + "grad_norm": 0.83984375, + "learning_rate": 4.7072299961232915e-05, + "loss": 0.6977, + "step": 2141 + }, + { + "epoch": 0.15760864566567026, + "grad_norm": 0.78125, + "learning_rate": 4.7069579774461485e-05, + "loss": 0.7599, + "step": 2142 + }, + { + "epoch": 0.15768222579903426, + "grad_norm": 1.140625, + "learning_rate": 4.70668584032574e-05, + "loss": 1.1872, + "step": 2143 + }, + { + "epoch": 0.15775580593239824, + "grad_norm": 1.0859375, + "learning_rate": 4.7064135847766686e-05, + "loss": 0.9709, + "step": 2144 + }, + { + "epoch": 0.15782938606576224, + "grad_norm": 0.84765625, + "learning_rate": 4.706141210813549e-05, + "loss": 0.9034, + "step": 2145 + }, + { + "epoch": 0.15790296619912625, + "grad_norm": 0.828125, + "learning_rate": 4.705868718450996e-05, + "loss": 0.9864, + "step": 2146 + }, + { + "epoch": 0.15797654633249023, + "grad_norm": 1.234375, + "learning_rate": 4.7055961077036364e-05, + "loss": 1.5484, + "step": 2147 + }, + { + "epoch": 0.15805012646585423, + "grad_norm": 0.77734375, + "learning_rate": 4.7053233785860996e-05, + "loss": 0.9305, + "step": 2148 + }, + { + "epoch": 0.1581237065992182, + "grad_norm": 0.859375, + "learning_rate": 4.705050531113021e-05, + "loss": 0.8664, + "step": 2149 + }, + { + "epoch": 0.1581972867325822, + "grad_norm": 6.03125, + "learning_rate": 4.7047775652990464e-05, + "loss": 1.0787, + "step": 2150 + }, + { + "epoch": 0.15827086686594619, + "grad_norm": 0.984375, + "learning_rate": 4.704504481158823e-05, + "loss": 0.9074, + "step": 2151 + }, + { + "epoch": 0.1583444469993102, + "grad_norm": 0.83984375, + "learning_rate": 4.7042312787070084e-05, + "loss": 0.9109, + "step": 2152 + }, + { + "epoch": 0.15841802713267417, + "grad_norm": 1.2734375, + "learning_rate": 4.703957957958265e-05, + "loss": 1.3144, + "step": 2153 + }, + { + "epoch": 0.15849160726603817, + "grad_norm": 0.80078125, + "learning_rate": 4.70368451892726e-05, + "loss": 0.7123, + "step": 2154 + }, + { + "epoch": 0.15856518739940217, + "grad_norm": 0.77734375, + "learning_rate": 4.7034109616286694e-05, + "loss": 0.9552, + "step": 2155 + }, + { + "epoch": 0.15863876753276615, + "grad_norm": 0.77734375, + "learning_rate": 4.7031372860771735e-05, + "loss": 0.937, + "step": 2156 + }, + { + "epoch": 0.15871234766613015, + "grad_norm": 0.75390625, + "learning_rate": 4.70286349228746e-05, + "loss": 0.739, + "step": 2157 + }, + { + "epoch": 0.15878592779949413, + "grad_norm": 0.95703125, + "learning_rate": 4.702589580274225e-05, + "loss": 1.2628, + "step": 2158 + }, + { + "epoch": 0.15885950793285813, + "grad_norm": 1.109375, + "learning_rate": 4.7023155500521666e-05, + "loss": 1.3584, + "step": 2159 + }, + { + "epoch": 0.1589330880662221, + "grad_norm": 0.84375, + "learning_rate": 4.7020414016359926e-05, + "loss": 0.9563, + "step": 2160 + }, + { + "epoch": 0.15900666819958612, + "grad_norm": 0.84765625, + "learning_rate": 4.701767135040414e-05, + "loss": 0.9553, + "step": 2161 + }, + { + "epoch": 0.1590802483329501, + "grad_norm": 1.1015625, + "learning_rate": 4.701492750280154e-05, + "loss": 1.3369, + "step": 2162 + }, + { + "epoch": 0.1591538284663141, + "grad_norm": 0.9921875, + "learning_rate": 4.701218247369935e-05, + "loss": 1.0824, + "step": 2163 + }, + { + "epoch": 0.1592274085996781, + "grad_norm": 0.921875, + "learning_rate": 4.7009436263244914e-05, + "loss": 1.254, + "step": 2164 + }, + { + "epoch": 0.15930098873304208, + "grad_norm": 0.8671875, + "learning_rate": 4.700668887158559e-05, + "loss": 1.0145, + "step": 2165 + }, + { + "epoch": 0.15937456886640608, + "grad_norm": 0.9375, + "learning_rate": 4.700394029886884e-05, + "loss": 0.7849, + "step": 2166 + }, + { + "epoch": 0.15944814899977006, + "grad_norm": 0.91796875, + "learning_rate": 4.700119054524218e-05, + "loss": 0.9497, + "step": 2167 + }, + { + "epoch": 0.15952172913313406, + "grad_norm": 0.8046875, + "learning_rate": 4.699843961085317e-05, + "loss": 0.784, + "step": 2168 + }, + { + "epoch": 0.15959530926649804, + "grad_norm": 0.9375, + "learning_rate": 4.699568749584946e-05, + "loss": 0.8025, + "step": 2169 + }, + { + "epoch": 0.15966888939986204, + "grad_norm": 0.89453125, + "learning_rate": 4.6992934200378744e-05, + "loss": 0.8684, + "step": 2170 + }, + { + "epoch": 0.15974246953322602, + "grad_norm": 0.80859375, + "learning_rate": 4.69901797245888e-05, + "loss": 1.0372, + "step": 2171 + }, + { + "epoch": 0.15981604966659002, + "grad_norm": 0.671875, + "learning_rate": 4.6987424068627425e-05, + "loss": 0.7471, + "step": 2172 + }, + { + "epoch": 0.15988962979995403, + "grad_norm": 1.0546875, + "learning_rate": 4.6984667232642546e-05, + "loss": 1.122, + "step": 2173 + }, + { + "epoch": 0.159963209933318, + "grad_norm": 0.828125, + "learning_rate": 4.698190921678208e-05, + "loss": 1.0807, + "step": 2174 + }, + { + "epoch": 0.160036790066682, + "grad_norm": 1.0703125, + "learning_rate": 4.697915002119408e-05, + "loss": 1.258, + "step": 2175 + }, + { + "epoch": 0.16011037020004598, + "grad_norm": 1.015625, + "learning_rate": 4.697638964602661e-05, + "loss": 1.2759, + "step": 2176 + }, + { + "epoch": 0.16018395033340999, + "grad_norm": 0.75390625, + "learning_rate": 4.6973628091427805e-05, + "loss": 0.8642, + "step": 2177 + }, + { + "epoch": 0.16025753046677396, + "grad_norm": 0.875, + "learning_rate": 4.697086535754589e-05, + "loss": 0.838, + "step": 2178 + }, + { + "epoch": 0.16033111060013797, + "grad_norm": 1.0234375, + "learning_rate": 4.6968101444529136e-05, + "loss": 1.2733, + "step": 2179 + }, + { + "epoch": 0.16040469073350194, + "grad_norm": 0.9765625, + "learning_rate": 4.696533635252586e-05, + "loss": 0.7795, + "step": 2180 + }, + { + "epoch": 0.16047827086686595, + "grad_norm": 0.84765625, + "learning_rate": 4.6962570081684464e-05, + "loss": 0.9203, + "step": 2181 + }, + { + "epoch": 0.16055185100022995, + "grad_norm": 0.85546875, + "learning_rate": 4.695980263215342e-05, + "loss": 0.853, + "step": 2182 + }, + { + "epoch": 0.16062543113359393, + "grad_norm": 0.98828125, + "learning_rate": 4.695703400408124e-05, + "loss": 1.5444, + "step": 2183 + }, + { + "epoch": 0.16069901126695793, + "grad_norm": 0.81640625, + "learning_rate": 4.695426419761652e-05, + "loss": 1.1323, + "step": 2184 + }, + { + "epoch": 0.1607725914003219, + "grad_norm": 0.9296875, + "learning_rate": 4.6951493212907905e-05, + "loss": 1.099, + "step": 2185 + }, + { + "epoch": 0.1608461715336859, + "grad_norm": 0.8671875, + "learning_rate": 4.694872105010412e-05, + "loss": 1.1622, + "step": 2186 + }, + { + "epoch": 0.1609197516670499, + "grad_norm": 1.140625, + "learning_rate": 4.694594770935391e-05, + "loss": 1.0416, + "step": 2187 + }, + { + "epoch": 0.1609933318004139, + "grad_norm": 1.21875, + "learning_rate": 4.694317319080615e-05, + "loss": 1.1845, + "step": 2188 + }, + { + "epoch": 0.16106691193377787, + "grad_norm": 0.859375, + "learning_rate": 4.694039749460973e-05, + "loss": 0.9836, + "step": 2189 + }, + { + "epoch": 0.16114049206714187, + "grad_norm": 0.68359375, + "learning_rate": 4.6937620620913617e-05, + "loss": 0.7475, + "step": 2190 + }, + { + "epoch": 0.16121407220050588, + "grad_norm": 1.03125, + "learning_rate": 4.693484256986683e-05, + "loss": 1.038, + "step": 2191 + }, + { + "epoch": 0.16128765233386985, + "grad_norm": 0.9375, + "learning_rate": 4.693206334161848e-05, + "loss": 1.0436, + "step": 2192 + }, + { + "epoch": 0.16136123246723386, + "grad_norm": 0.94140625, + "learning_rate": 4.692928293631772e-05, + "loss": 0.791, + "step": 2193 + }, + { + "epoch": 0.16143481260059783, + "grad_norm": 0.97265625, + "learning_rate": 4.692650135411375e-05, + "loss": 1.2259, + "step": 2194 + }, + { + "epoch": 0.16150839273396184, + "grad_norm": 0.83984375, + "learning_rate": 4.692371859515587e-05, + "loss": 0.9201, + "step": 2195 + }, + { + "epoch": 0.1615819728673258, + "grad_norm": 0.98046875, + "learning_rate": 4.692093465959342e-05, + "loss": 1.0345, + "step": 2196 + }, + { + "epoch": 0.16165555300068982, + "grad_norm": 0.73828125, + "learning_rate": 4.691814954757582e-05, + "loss": 0.6074, + "step": 2197 + }, + { + "epoch": 0.1617291331340538, + "grad_norm": 0.94921875, + "learning_rate": 4.691536325925252e-05, + "loss": 1.0246, + "step": 2198 + }, + { + "epoch": 0.1618027132674178, + "grad_norm": 0.9296875, + "learning_rate": 4.691257579477306e-05, + "loss": 1.0069, + "step": 2199 + }, + { + "epoch": 0.1618762934007818, + "grad_norm": 1.0234375, + "learning_rate": 4.690978715428705e-05, + "loss": 1.6378, + "step": 2200 + }, + { + "epoch": 0.16194987353414578, + "grad_norm": 0.86328125, + "learning_rate": 4.690699733794416e-05, + "loss": 0.8153, + "step": 2201 + }, + { + "epoch": 0.16202345366750978, + "grad_norm": 1.046875, + "learning_rate": 4.690420634589408e-05, + "loss": 0.9973, + "step": 2202 + }, + { + "epoch": 0.16209703380087376, + "grad_norm": 1.15625, + "learning_rate": 4.690141417828663e-05, + "loss": 0.9972, + "step": 2203 + }, + { + "epoch": 0.16217061393423776, + "grad_norm": 0.80078125, + "learning_rate": 4.689862083527164e-05, + "loss": 0.7335, + "step": 2204 + }, + { + "epoch": 0.16224419406760174, + "grad_norm": 0.91015625, + "learning_rate": 4.689582631699903e-05, + "loss": 0.6909, + "step": 2205 + }, + { + "epoch": 0.16231777420096574, + "grad_norm": 0.703125, + "learning_rate": 4.689303062361878e-05, + "loss": 0.6839, + "step": 2206 + }, + { + "epoch": 0.16239135433432972, + "grad_norm": 0.921875, + "learning_rate": 4.689023375528092e-05, + "loss": 0.9808, + "step": 2207 + }, + { + "epoch": 0.16246493446769372, + "grad_norm": 1.015625, + "learning_rate": 4.688743571213557e-05, + "loss": 1.6107, + "step": 2208 + }, + { + "epoch": 0.16253851460105773, + "grad_norm": 0.97265625, + "learning_rate": 4.688463649433288e-05, + "loss": 1.2718, + "step": 2209 + }, + { + "epoch": 0.1626120947344217, + "grad_norm": 0.77734375, + "learning_rate": 4.688183610202308e-05, + "loss": 0.9009, + "step": 2210 + }, + { + "epoch": 0.1626856748677857, + "grad_norm": 0.765625, + "learning_rate": 4.687903453535647e-05, + "loss": 1.0113, + "step": 2211 + }, + { + "epoch": 0.16275925500114968, + "grad_norm": 1.1953125, + "learning_rate": 4.687623179448339e-05, + "loss": 1.2858, + "step": 2212 + }, + { + "epoch": 0.1628328351345137, + "grad_norm": 0.953125, + "learning_rate": 4.6873427879554274e-05, + "loss": 1.2891, + "step": 2213 + }, + { + "epoch": 0.16290641526787766, + "grad_norm": 0.859375, + "learning_rate": 4.687062279071961e-05, + "loss": 1.1064, + "step": 2214 + }, + { + "epoch": 0.16297999540124167, + "grad_norm": 1.046875, + "learning_rate": 4.686781652812992e-05, + "loss": 1.3416, + "step": 2215 + }, + { + "epoch": 0.16305357553460564, + "grad_norm": 0.77734375, + "learning_rate": 4.686500909193581e-05, + "loss": 0.8285, + "step": 2216 + }, + { + "epoch": 0.16312715566796965, + "grad_norm": 0.87890625, + "learning_rate": 4.686220048228796e-05, + "loss": 1.0324, + "step": 2217 + }, + { + "epoch": 0.16320073580133365, + "grad_norm": 1.015625, + "learning_rate": 4.685939069933711e-05, + "loss": 1.0747, + "step": 2218 + }, + { + "epoch": 0.16327431593469763, + "grad_norm": 1.0, + "learning_rate": 4.6856579743234044e-05, + "loss": 1.0746, + "step": 2219 + }, + { + "epoch": 0.16334789606806163, + "grad_norm": 1.015625, + "learning_rate": 4.685376761412963e-05, + "loss": 1.3434, + "step": 2220 + }, + { + "epoch": 0.1634214762014256, + "grad_norm": 0.7421875, + "learning_rate": 4.6850954312174775e-05, + "loss": 0.602, + "step": 2221 + }, + { + "epoch": 0.1634950563347896, + "grad_norm": 0.8046875, + "learning_rate": 4.684813983752048e-05, + "loss": 0.9007, + "step": 2222 + }, + { + "epoch": 0.1635686364681536, + "grad_norm": 0.8359375, + "learning_rate": 4.684532419031778e-05, + "loss": 0.6229, + "step": 2223 + }, + { + "epoch": 0.1636422166015176, + "grad_norm": 0.88671875, + "learning_rate": 4.684250737071779e-05, + "loss": 0.7356, + "step": 2224 + }, + { + "epoch": 0.16371579673488157, + "grad_norm": 0.734375, + "learning_rate": 4.683968937887169e-05, + "loss": 0.75, + "step": 2225 + }, + { + "epoch": 0.16378937686824557, + "grad_norm": 1.265625, + "learning_rate": 4.6836870214930704e-05, + "loss": 1.3397, + "step": 2226 + }, + { + "epoch": 0.16386295700160958, + "grad_norm": 0.984375, + "learning_rate": 4.683404987904615e-05, + "loss": 1.1249, + "step": 2227 + }, + { + "epoch": 0.16393653713497355, + "grad_norm": 1.046875, + "learning_rate": 4.683122837136937e-05, + "loss": 1.0647, + "step": 2228 + }, + { + "epoch": 0.16401011726833756, + "grad_norm": 0.859375, + "learning_rate": 4.682840569205179e-05, + "loss": 0.8404, + "step": 2229 + }, + { + "epoch": 0.16408369740170153, + "grad_norm": 1.078125, + "learning_rate": 4.6825581841244916e-05, + "loss": 1.2112, + "step": 2230 + }, + { + "epoch": 0.16415727753506554, + "grad_norm": 0.78125, + "learning_rate": 4.6822756819100275e-05, + "loss": 1.0361, + "step": 2231 + }, + { + "epoch": 0.16423085766842951, + "grad_norm": 1.0234375, + "learning_rate": 4.68199306257695e-05, + "loss": 1.0725, + "step": 2232 + }, + { + "epoch": 0.16430443780179352, + "grad_norm": 0.84375, + "learning_rate": 4.681710326140426e-05, + "loss": 1.0729, + "step": 2233 + }, + { + "epoch": 0.1643780179351575, + "grad_norm": 0.94140625, + "learning_rate": 4.6814274726156296e-05, + "loss": 0.9688, + "step": 2234 + }, + { + "epoch": 0.1644515980685215, + "grad_norm": 0.80859375, + "learning_rate": 4.68114450201774e-05, + "loss": 1.3399, + "step": 2235 + }, + { + "epoch": 0.1645251782018855, + "grad_norm": 0.8984375, + "learning_rate": 4.680861414361945e-05, + "loss": 1.3039, + "step": 2236 + }, + { + "epoch": 0.16459875833524948, + "grad_norm": 0.953125, + "learning_rate": 4.680578209663438e-05, + "loss": 1.2941, + "step": 2237 + }, + { + "epoch": 0.16467233846861348, + "grad_norm": 0.84765625, + "learning_rate": 4.680294887937416e-05, + "loss": 0.8394, + "step": 2238 + }, + { + "epoch": 0.16474591860197746, + "grad_norm": 0.89453125, + "learning_rate": 4.680011449199085e-05, + "loss": 1.136, + "step": 2239 + }, + { + "epoch": 0.16481949873534146, + "grad_norm": 0.84375, + "learning_rate": 4.679727893463658e-05, + "loss": 1.0401, + "step": 2240 + }, + { + "epoch": 0.16489307886870544, + "grad_norm": 0.67578125, + "learning_rate": 4.679444220746352e-05, + "loss": 0.6675, + "step": 2241 + }, + { + "epoch": 0.16496665900206944, + "grad_norm": 0.90625, + "learning_rate": 4.679160431062391e-05, + "loss": 0.9519, + "step": 2242 + }, + { + "epoch": 0.16504023913543342, + "grad_norm": 0.83984375, + "learning_rate": 4.678876524427004e-05, + "loss": 0.9513, + "step": 2243 + }, + { + "epoch": 0.16511381926879742, + "grad_norm": 0.92578125, + "learning_rate": 4.6785925008554305e-05, + "loss": 0.7941, + "step": 2244 + }, + { + "epoch": 0.16518739940216143, + "grad_norm": 0.84375, + "learning_rate": 4.678308360362912e-05, + "loss": 1.0033, + "step": 2245 + }, + { + "epoch": 0.1652609795355254, + "grad_norm": 1.1328125, + "learning_rate": 4.6780241029646975e-05, + "loss": 1.5162, + "step": 2246 + }, + { + "epoch": 0.1653345596688894, + "grad_norm": 1.3359375, + "learning_rate": 4.677739728676044e-05, + "loss": 1.1994, + "step": 2247 + }, + { + "epoch": 0.16540813980225338, + "grad_norm": 0.9140625, + "learning_rate": 4.677455237512212e-05, + "loss": 1.0858, + "step": 2248 + }, + { + "epoch": 0.1654817199356174, + "grad_norm": 0.921875, + "learning_rate": 4.6771706294884696e-05, + "loss": 0.7822, + "step": 2249 + }, + { + "epoch": 0.16555530006898136, + "grad_norm": 0.859375, + "learning_rate": 4.6768859046200924e-05, + "loss": 1.2687, + "step": 2250 + }, + { + "epoch": 0.16562888020234537, + "grad_norm": 0.875, + "learning_rate": 4.67660106292236e-05, + "loss": 1.0894, + "step": 2251 + }, + { + "epoch": 0.16570246033570935, + "grad_norm": 0.7265625, + "learning_rate": 4.6763161044105595e-05, + "loss": 0.7779, + "step": 2252 + }, + { + "epoch": 0.16577604046907335, + "grad_norm": 0.890625, + "learning_rate": 4.6760310290999844e-05, + "loss": 1.0385, + "step": 2253 + }, + { + "epoch": 0.16584962060243735, + "grad_norm": 0.95703125, + "learning_rate": 4.6757458370059336e-05, + "loss": 1.1762, + "step": 2254 + }, + { + "epoch": 0.16592320073580133, + "grad_norm": 0.8515625, + "learning_rate": 4.6754605281437134e-05, + "loss": 0.7289, + "step": 2255 + }, + { + "epoch": 0.16599678086916533, + "grad_norm": 0.78125, + "learning_rate": 4.675175102528635e-05, + "loss": 0.7508, + "step": 2256 + }, + { + "epoch": 0.1660703610025293, + "grad_norm": 1.125, + "learning_rate": 4.674889560176018e-05, + "loss": 1.3737, + "step": 2257 + }, + { + "epoch": 0.1661439411358933, + "grad_norm": 0.828125, + "learning_rate": 4.674603901101186e-05, + "loss": 0.8059, + "step": 2258 + }, + { + "epoch": 0.1662175212692573, + "grad_norm": 0.9375, + "learning_rate": 4.674318125319469e-05, + "loss": 0.723, + "step": 2259 + }, + { + "epoch": 0.1662911014026213, + "grad_norm": 0.9453125, + "learning_rate": 4.674032232846205e-05, + "loss": 0.841, + "step": 2260 + }, + { + "epoch": 0.16636468153598527, + "grad_norm": 0.94921875, + "learning_rate": 4.6737462236967374e-05, + "loss": 0.9784, + "step": 2261 + }, + { + "epoch": 0.16643826166934927, + "grad_norm": 0.87109375, + "learning_rate": 4.6734600978864164e-05, + "loss": 0.8706, + "step": 2262 + }, + { + "epoch": 0.16651184180271328, + "grad_norm": 0.9296875, + "learning_rate": 4.673173855430596e-05, + "loss": 0.9619, + "step": 2263 + }, + { + "epoch": 0.16658542193607725, + "grad_norm": 0.80078125, + "learning_rate": 4.67288749634464e-05, + "loss": 0.6256, + "step": 2264 + }, + { + "epoch": 0.16665900206944126, + "grad_norm": 0.72265625, + "learning_rate": 4.6726010206439155e-05, + "loss": 0.7546, + "step": 2265 + }, + { + "epoch": 0.16673258220280524, + "grad_norm": 2.53125, + "learning_rate": 4.672314428343798e-05, + "loss": 0.8527, + "step": 2266 + }, + { + "epoch": 0.16680616233616924, + "grad_norm": 0.98046875, + "learning_rate": 4.672027719459668e-05, + "loss": 0.833, + "step": 2267 + }, + { + "epoch": 0.16687974246953322, + "grad_norm": 0.875, + "learning_rate": 4.671740894006913e-05, + "loss": 1.0085, + "step": 2268 + }, + { + "epoch": 0.16695332260289722, + "grad_norm": 0.93359375, + "learning_rate": 4.671453952000926e-05, + "loss": 1.1527, + "step": 2269 + }, + { + "epoch": 0.1670269027362612, + "grad_norm": 0.78125, + "learning_rate": 4.671166893457106e-05, + "loss": 0.944, + "step": 2270 + }, + { + "epoch": 0.1671004828696252, + "grad_norm": 1.203125, + "learning_rate": 4.67087971839086e-05, + "loss": 1.3125, + "step": 2271 + }, + { + "epoch": 0.1671740630029892, + "grad_norm": 0.90234375, + "learning_rate": 4.6705924268176e-05, + "loss": 0.7686, + "step": 2272 + }, + { + "epoch": 0.16724764313635318, + "grad_norm": 0.79296875, + "learning_rate": 4.670305018752744e-05, + "loss": 0.7776, + "step": 2273 + }, + { + "epoch": 0.16732122326971718, + "grad_norm": 1.03125, + "learning_rate": 4.6700174942117165e-05, + "loss": 1.1477, + "step": 2274 + }, + { + "epoch": 0.16739480340308116, + "grad_norm": 0.95703125, + "learning_rate": 4.669729853209949e-05, + "loss": 1.0077, + "step": 2275 + }, + { + "epoch": 0.16746838353644516, + "grad_norm": 0.7890625, + "learning_rate": 4.6694420957628785e-05, + "loss": 0.7071, + "step": 2276 + }, + { + "epoch": 0.16754196366980914, + "grad_norm": 0.76953125, + "learning_rate": 4.6691542218859476e-05, + "loss": 1.012, + "step": 2277 + }, + { + "epoch": 0.16761554380317314, + "grad_norm": 0.796875, + "learning_rate": 4.668866231594606e-05, + "loss": 0.8759, + "step": 2278 + }, + { + "epoch": 0.16768912393653712, + "grad_norm": 0.78125, + "learning_rate": 4.668578124904312e-05, + "loss": 0.9433, + "step": 2279 + }, + { + "epoch": 0.16776270406990113, + "grad_norm": 0.9609375, + "learning_rate": 4.668289901830524e-05, + "loss": 0.8655, + "step": 2280 + }, + { + "epoch": 0.16783628420326513, + "grad_norm": 0.8984375, + "learning_rate": 4.668001562388713e-05, + "loss": 0.7792, + "step": 2281 + }, + { + "epoch": 0.1679098643366291, + "grad_norm": 0.984375, + "learning_rate": 4.667713106594353e-05, + "loss": 1.2321, + "step": 2282 + }, + { + "epoch": 0.1679834444699931, + "grad_norm": 1.015625, + "learning_rate": 4.667424534462925e-05, + "loss": 1.2005, + "step": 2283 + }, + { + "epoch": 0.16805702460335709, + "grad_norm": 1.015625, + "learning_rate": 4.667135846009916e-05, + "loss": 1.1626, + "step": 2284 + }, + { + "epoch": 0.1681306047367211, + "grad_norm": 0.9609375, + "learning_rate": 4.666847041250819e-05, + "loss": 1.0837, + "step": 2285 + }, + { + "epoch": 0.16820418487008507, + "grad_norm": 1.1015625, + "learning_rate": 4.6665581202011345e-05, + "loss": 1.2828, + "step": 2286 + }, + { + "epoch": 0.16827776500344907, + "grad_norm": 0.8359375, + "learning_rate": 4.666269082876367e-05, + "loss": 0.7669, + "step": 2287 + }, + { + "epoch": 0.16835134513681307, + "grad_norm": 0.859375, + "learning_rate": 4.665979929292029e-05, + "loss": 1.0711, + "step": 2288 + }, + { + "epoch": 0.16842492527017705, + "grad_norm": 0.94140625, + "learning_rate": 4.665690659463641e-05, + "loss": 0.7772, + "step": 2289 + }, + { + "epoch": 0.16849850540354105, + "grad_norm": 0.8046875, + "learning_rate": 4.6654012734067236e-05, + "loss": 0.7368, + "step": 2290 + }, + { + "epoch": 0.16857208553690503, + "grad_norm": 0.8515625, + "learning_rate": 4.665111771136811e-05, + "loss": 1.0117, + "step": 2291 + }, + { + "epoch": 0.16864566567026903, + "grad_norm": 0.89453125, + "learning_rate": 4.664822152669438e-05, + "loss": 0.8384, + "step": 2292 + }, + { + "epoch": 0.168719245803633, + "grad_norm": 0.796875, + "learning_rate": 4.6645324180201494e-05, + "loss": 0.7722, + "step": 2293 + }, + { + "epoch": 0.16879282593699702, + "grad_norm": 0.703125, + "learning_rate": 4.664242567204494e-05, + "loss": 0.7028, + "step": 2294 + }, + { + "epoch": 0.168866406070361, + "grad_norm": 0.84765625, + "learning_rate": 4.6639526002380275e-05, + "loss": 1.0704, + "step": 2295 + }, + { + "epoch": 0.168939986203725, + "grad_norm": 1.203125, + "learning_rate": 4.663662517136312e-05, + "loss": 1.3884, + "step": 2296 + }, + { + "epoch": 0.169013566337089, + "grad_norm": 0.71484375, + "learning_rate": 4.6633723179149166e-05, + "loss": 0.7911, + "step": 2297 + }, + { + "epoch": 0.16908714647045298, + "grad_norm": 1.03125, + "learning_rate": 4.6630820025894145e-05, + "loss": 1.1098, + "step": 2298 + }, + { + "epoch": 0.16916072660381698, + "grad_norm": 0.921875, + "learning_rate": 4.6627915711753866e-05, + "loss": 0.8922, + "step": 2299 + }, + { + "epoch": 0.16923430673718096, + "grad_norm": 1.0234375, + "learning_rate": 4.6625010236884204e-05, + "loss": 1.4102, + "step": 2300 + }, + { + "epoch": 0.16930788687054496, + "grad_norm": 0.89453125, + "learning_rate": 4.662210360144108e-05, + "loss": 0.8983, + "step": 2301 + }, + { + "epoch": 0.16938146700390894, + "grad_norm": 0.859375, + "learning_rate": 4.66191958055805e-05, + "loss": 0.8754, + "step": 2302 + }, + { + "epoch": 0.16945504713727294, + "grad_norm": 0.78125, + "learning_rate": 4.6616286849458515e-05, + "loss": 1.2154, + "step": 2303 + }, + { + "epoch": 0.16952862727063692, + "grad_norm": 0.84375, + "learning_rate": 4.6613376733231236e-05, + "loss": 0.8395, + "step": 2304 + }, + { + "epoch": 0.16960220740400092, + "grad_norm": 0.91015625, + "learning_rate": 4.661046545705485e-05, + "loss": 0.8515, + "step": 2305 + }, + { + "epoch": 0.16967578753736492, + "grad_norm": 1.0546875, + "learning_rate": 4.660755302108561e-05, + "loss": 0.8633, + "step": 2306 + }, + { + "epoch": 0.1697493676707289, + "grad_norm": 0.87890625, + "learning_rate": 4.66046394254798e-05, + "loss": 0.9357, + "step": 2307 + }, + { + "epoch": 0.1698229478040929, + "grad_norm": 0.91796875, + "learning_rate": 4.66017246703938e-05, + "loss": 1.0117, + "step": 2308 + }, + { + "epoch": 0.16989652793745688, + "grad_norm": 0.88671875, + "learning_rate": 4.6598808755984034e-05, + "loss": 1.0174, + "step": 2309 + }, + { + "epoch": 0.16997010807082089, + "grad_norm": 0.93359375, + "learning_rate": 4.6595891682407e-05, + "loss": 1.234, + "step": 2310 + }, + { + "epoch": 0.17004368820418486, + "grad_norm": 0.73046875, + "learning_rate": 4.6592973449819244e-05, + "loss": 0.5915, + "step": 2311 + }, + { + "epoch": 0.17011726833754887, + "grad_norm": 0.97265625, + "learning_rate": 4.659005405837739e-05, + "loss": 0.9665, + "step": 2312 + }, + { + "epoch": 0.17019084847091284, + "grad_norm": 1.046875, + "learning_rate": 4.6587133508238115e-05, + "loss": 1.6637, + "step": 2313 + }, + { + "epoch": 0.17026442860427685, + "grad_norm": 0.7578125, + "learning_rate": 4.658421179955815e-05, + "loss": 0.8542, + "step": 2314 + }, + { + "epoch": 0.17033800873764085, + "grad_norm": 0.80078125, + "learning_rate": 4.6581288932494304e-05, + "loss": 0.7991, + "step": 2315 + }, + { + "epoch": 0.17041158887100483, + "grad_norm": 1.109375, + "learning_rate": 4.657836490720345e-05, + "loss": 1.3094, + "step": 2316 + }, + { + "epoch": 0.17048516900436883, + "grad_norm": 0.984375, + "learning_rate": 4.657543972384251e-05, + "loss": 0.86, + "step": 2317 + }, + { + "epoch": 0.1705587491377328, + "grad_norm": 0.81640625, + "learning_rate": 4.657251338256846e-05, + "loss": 0.7694, + "step": 2318 + }, + { + "epoch": 0.1706323292710968, + "grad_norm": 0.95703125, + "learning_rate": 4.656958588353836e-05, + "loss": 0.9706, + "step": 2319 + }, + { + "epoch": 0.1707059094044608, + "grad_norm": 0.76171875, + "learning_rate": 4.656665722690933e-05, + "loss": 0.9386, + "step": 2320 + }, + { + "epoch": 0.1707794895378248, + "grad_norm": 0.92578125, + "learning_rate": 4.656372741283854e-05, + "loss": 1.0143, + "step": 2321 + }, + { + "epoch": 0.17085306967118877, + "grad_norm": 1.03125, + "learning_rate": 4.6560796441483234e-05, + "loss": 1.4127, + "step": 2322 + }, + { + "epoch": 0.17092664980455277, + "grad_norm": 0.83984375, + "learning_rate": 4.6557864313000695e-05, + "loss": 0.7166, + "step": 2323 + }, + { + "epoch": 0.17100022993791678, + "grad_norm": 0.796875, + "learning_rate": 4.65549310275483e-05, + "loss": 0.9397, + "step": 2324 + }, + { + "epoch": 0.17107381007128075, + "grad_norm": 0.8203125, + "learning_rate": 4.6551996585283476e-05, + "loss": 1.0492, + "step": 2325 + }, + { + "epoch": 0.17114739020464476, + "grad_norm": 0.8828125, + "learning_rate": 4.654906098636369e-05, + "loss": 0.6942, + "step": 2326 + }, + { + "epoch": 0.17122097033800873, + "grad_norm": 0.89453125, + "learning_rate": 4.6546124230946505e-05, + "loss": 0.9402, + "step": 2327 + }, + { + "epoch": 0.17129455047137274, + "grad_norm": 0.8359375, + "learning_rate": 4.6543186319189526e-05, + "loss": 0.7261, + "step": 2328 + }, + { + "epoch": 0.1713681306047367, + "grad_norm": 0.87109375, + "learning_rate": 4.6540247251250424e-05, + "loss": 0.8986, + "step": 2329 + }, + { + "epoch": 0.17144171073810072, + "grad_norm": 0.71875, + "learning_rate": 4.653730702728694e-05, + "loss": 0.7865, + "step": 2330 + }, + { + "epoch": 0.1715152908714647, + "grad_norm": 1.59375, + "learning_rate": 4.653436564745687e-05, + "loss": 1.1759, + "step": 2331 + }, + { + "epoch": 0.1715888710048287, + "grad_norm": 0.8984375, + "learning_rate": 4.653142311191806e-05, + "loss": 0.978, + "step": 2332 + }, + { + "epoch": 0.1716624511381927, + "grad_norm": 1.125, + "learning_rate": 4.652847942082844e-05, + "loss": 1.0943, + "step": 2333 + }, + { + "epoch": 0.17173603127155668, + "grad_norm": 0.8515625, + "learning_rate": 4.6525534574346e-05, + "loss": 0.942, + "step": 2334 + }, + { + "epoch": 0.17180961140492068, + "grad_norm": 1.1484375, + "learning_rate": 4.6522588572628765e-05, + "loss": 1.0099, + "step": 2335 + }, + { + "epoch": 0.17188319153828466, + "grad_norm": 0.78515625, + "learning_rate": 4.651964141583486e-05, + "loss": 0.7833, + "step": 2336 + }, + { + "epoch": 0.17195677167164866, + "grad_norm": 1.0234375, + "learning_rate": 4.6516693104122435e-05, + "loss": 1.1503, + "step": 2337 + }, + { + "epoch": 0.17203035180501264, + "grad_norm": 0.87109375, + "learning_rate": 4.6513743637649736e-05, + "loss": 0.6584, + "step": 2338 + }, + { + "epoch": 0.17210393193837664, + "grad_norm": 0.80078125, + "learning_rate": 4.651079301657505e-05, + "loss": 0.8908, + "step": 2339 + }, + { + "epoch": 0.17217751207174062, + "grad_norm": 0.7890625, + "learning_rate": 4.6507841241056735e-05, + "loss": 0.8444, + "step": 2340 + }, + { + "epoch": 0.17225109220510462, + "grad_norm": 0.765625, + "learning_rate": 4.6504888311253196e-05, + "loss": 0.8007, + "step": 2341 + }, + { + "epoch": 0.17232467233846863, + "grad_norm": 0.78125, + "learning_rate": 4.650193422732292e-05, + "loss": 0.7184, + "step": 2342 + }, + { + "epoch": 0.1723982524718326, + "grad_norm": 0.8515625, + "learning_rate": 4.649897898942445e-05, + "loss": 0.903, + "step": 2343 + }, + { + "epoch": 0.1724718326051966, + "grad_norm": 0.73046875, + "learning_rate": 4.649602259771638e-05, + "loss": 0.7455, + "step": 2344 + }, + { + "epoch": 0.17254541273856058, + "grad_norm": 0.765625, + "learning_rate": 4.649306505235738e-05, + "loss": 0.7128, + "step": 2345 + }, + { + "epoch": 0.1726189928719246, + "grad_norm": 0.84375, + "learning_rate": 4.649010635350617e-05, + "loss": 0.9328, + "step": 2346 + }, + { + "epoch": 0.17269257300528856, + "grad_norm": 0.87109375, + "learning_rate": 4.6487146501321535e-05, + "loss": 0.7971, + "step": 2347 + }, + { + "epoch": 0.17276615313865257, + "grad_norm": 0.91015625, + "learning_rate": 4.648418549596234e-05, + "loss": 0.9742, + "step": 2348 + }, + { + "epoch": 0.17283973327201654, + "grad_norm": 0.8125, + "learning_rate": 4.648122333758749e-05, + "loss": 0.9682, + "step": 2349 + }, + { + "epoch": 0.17291331340538055, + "grad_norm": 1.03125, + "learning_rate": 4.647826002635595e-05, + "loss": 0.7815, + "step": 2350 + }, + { + "epoch": 0.17298689353874455, + "grad_norm": 0.84375, + "learning_rate": 4.647529556242676e-05, + "loss": 0.9898, + "step": 2351 + }, + { + "epoch": 0.17306047367210853, + "grad_norm": 0.8515625, + "learning_rate": 4.6472329945959014e-05, + "loss": 1.1301, + "step": 2352 + }, + { + "epoch": 0.17313405380547253, + "grad_norm": 0.79296875, + "learning_rate": 4.646936317711188e-05, + "loss": 0.8775, + "step": 2353 + }, + { + "epoch": 0.1732076339388365, + "grad_norm": 0.9375, + "learning_rate": 4.6466395256044574e-05, + "loss": 1.4642, + "step": 2354 + }, + { + "epoch": 0.1732812140722005, + "grad_norm": 0.96484375, + "learning_rate": 4.646342618291638e-05, + "loss": 0.7355, + "step": 2355 + }, + { + "epoch": 0.1733547942055645, + "grad_norm": 0.59765625, + "learning_rate": 4.6460455957886646e-05, + "loss": 0.6268, + "step": 2356 + }, + { + "epoch": 0.1734283743389285, + "grad_norm": 0.89453125, + "learning_rate": 4.645748458111476e-05, + "loss": 1.1092, + "step": 2357 + }, + { + "epoch": 0.17350195447229247, + "grad_norm": 0.9765625, + "learning_rate": 4.6454512052760225e-05, + "loss": 1.0355, + "step": 2358 + }, + { + "epoch": 0.17357553460565647, + "grad_norm": 0.91015625, + "learning_rate": 4.6451538372982527e-05, + "loss": 1.0261, + "step": 2359 + }, + { + "epoch": 0.17364911473902048, + "grad_norm": 0.7734375, + "learning_rate": 4.6448563541941295e-05, + "loss": 1.1197, + "step": 2360 + }, + { + "epoch": 0.17372269487238445, + "grad_norm": 0.94140625, + "learning_rate": 4.6445587559796166e-05, + "loss": 0.8631, + "step": 2361 + }, + { + "epoch": 0.17379627500574846, + "grad_norm": 0.91796875, + "learning_rate": 4.6442610426706856e-05, + "loss": 1.0614, + "step": 2362 + }, + { + "epoch": 0.17386985513911243, + "grad_norm": 0.76953125, + "learning_rate": 4.643963214283314e-05, + "loss": 0.8628, + "step": 2363 + }, + { + "epoch": 0.17394343527247644, + "grad_norm": 0.91015625, + "learning_rate": 4.643665270833487e-05, + "loss": 0.8224, + "step": 2364 + }, + { + "epoch": 0.17401701540584041, + "grad_norm": 0.71875, + "learning_rate": 4.643367212337193e-05, + "loss": 0.8321, + "step": 2365 + }, + { + "epoch": 0.17409059553920442, + "grad_norm": 0.76171875, + "learning_rate": 4.643069038810429e-05, + "loss": 0.9729, + "step": 2366 + }, + { + "epoch": 0.1741641756725684, + "grad_norm": 0.76171875, + "learning_rate": 4.642770750269198e-05, + "loss": 0.7919, + "step": 2367 + }, + { + "epoch": 0.1742377558059324, + "grad_norm": 1.0078125, + "learning_rate": 4.642472346729507e-05, + "loss": 1.251, + "step": 2368 + }, + { + "epoch": 0.1743113359392964, + "grad_norm": 0.98828125, + "learning_rate": 4.642173828207372e-05, + "loss": 1.0962, + "step": 2369 + }, + { + "epoch": 0.17438491607266038, + "grad_norm": 1.015625, + "learning_rate": 4.6418751947188145e-05, + "loss": 1.1477, + "step": 2370 + }, + { + "epoch": 0.17445849620602438, + "grad_norm": 0.80078125, + "learning_rate": 4.641576446279861e-05, + "loss": 0.7813, + "step": 2371 + }, + { + "epoch": 0.17453207633938836, + "grad_norm": 1.1171875, + "learning_rate": 4.641277582906542e-05, + "loss": 1.1445, + "step": 2372 + }, + { + "epoch": 0.17460565647275236, + "grad_norm": 0.765625, + "learning_rate": 4.640978604614902e-05, + "loss": 0.9483, + "step": 2373 + }, + { + "epoch": 0.17467923660611634, + "grad_norm": 0.85546875, + "learning_rate": 4.640679511420983e-05, + "loss": 0.9605, + "step": 2374 + }, + { + "epoch": 0.17475281673948034, + "grad_norm": 0.8046875, + "learning_rate": 4.6403803033408375e-05, + "loss": 0.8761, + "step": 2375 + }, + { + "epoch": 0.17482639687284432, + "grad_norm": 0.69140625, + "learning_rate": 4.6400809803905244e-05, + "loss": 0.6966, + "step": 2376 + }, + { + "epoch": 0.17489997700620832, + "grad_norm": 0.79296875, + "learning_rate": 4.639781542586106e-05, + "loss": 0.7238, + "step": 2377 + }, + { + "epoch": 0.17497355713957233, + "grad_norm": 0.83203125, + "learning_rate": 4.639481989943655e-05, + "loss": 0.8654, + "step": 2378 + }, + { + "epoch": 0.1750471372729363, + "grad_norm": 0.93359375, + "learning_rate": 4.6391823224792456e-05, + "loss": 0.9046, + "step": 2379 + }, + { + "epoch": 0.1751207174063003, + "grad_norm": 0.83984375, + "learning_rate": 4.638882540208962e-05, + "loss": 0.817, + "step": 2380 + }, + { + "epoch": 0.17519429753966428, + "grad_norm": 1.0078125, + "learning_rate": 4.6385826431488914e-05, + "loss": 1.4932, + "step": 2381 + }, + { + "epoch": 0.1752678776730283, + "grad_norm": 0.75390625, + "learning_rate": 4.63828263131513e-05, + "loss": 0.8208, + "step": 2382 + }, + { + "epoch": 0.17534145780639226, + "grad_norm": 0.7734375, + "learning_rate": 4.637982504723779e-05, + "loss": 0.847, + "step": 2383 + }, + { + "epoch": 0.17541503793975627, + "grad_norm": 0.82421875, + "learning_rate": 4.637682263390944e-05, + "loss": 0.6693, + "step": 2384 + }, + { + "epoch": 0.17548861807312024, + "grad_norm": 0.7578125, + "learning_rate": 4.6373819073327403e-05, + "loss": 0.7124, + "step": 2385 + }, + { + "epoch": 0.17556219820648425, + "grad_norm": 1.015625, + "learning_rate": 4.637081436565286e-05, + "loss": 1.0774, + "step": 2386 + }, + { + "epoch": 0.17563577833984825, + "grad_norm": 0.98828125, + "learning_rate": 4.636780851104707e-05, + "loss": 0.9835, + "step": 2387 + }, + { + "epoch": 0.17570935847321223, + "grad_norm": 0.79296875, + "learning_rate": 4.636480150967136e-05, + "loss": 0.9992, + "step": 2388 + }, + { + "epoch": 0.17578293860657623, + "grad_norm": 0.859375, + "learning_rate": 4.636179336168711e-05, + "loss": 1.0624, + "step": 2389 + }, + { + "epoch": 0.1758565187399402, + "grad_norm": 0.81640625, + "learning_rate": 4.6358784067255755e-05, + "loss": 0.9743, + "step": 2390 + }, + { + "epoch": 0.1759300988733042, + "grad_norm": 0.91015625, + "learning_rate": 4.63557736265388e-05, + "loss": 0.8359, + "step": 2391 + }, + { + "epoch": 0.1760036790066682, + "grad_norm": 0.82421875, + "learning_rate": 4.635276203969781e-05, + "loss": 0.8541, + "step": 2392 + }, + { + "epoch": 0.1760772591400322, + "grad_norm": 0.859375, + "learning_rate": 4.634974930689441e-05, + "loss": 0.8042, + "step": 2393 + }, + { + "epoch": 0.17615083927339617, + "grad_norm": 0.8828125, + "learning_rate": 4.634673542829029e-05, + "loss": 0.9784, + "step": 2394 + }, + { + "epoch": 0.17622441940676017, + "grad_norm": 0.890625, + "learning_rate": 4.634372040404719e-05, + "loss": 0.9491, + "step": 2395 + }, + { + "epoch": 0.17629799954012418, + "grad_norm": 0.88671875, + "learning_rate": 4.6340704234326934e-05, + "loss": 0.9291, + "step": 2396 + }, + { + "epoch": 0.17637157967348815, + "grad_norm": 1.234375, + "learning_rate": 4.633768691929139e-05, + "loss": 1.245, + "step": 2397 + }, + { + "epoch": 0.17644515980685216, + "grad_norm": 0.75, + "learning_rate": 4.6334668459102484e-05, + "loss": 0.9182, + "step": 2398 + }, + { + "epoch": 0.17651873994021614, + "grad_norm": 0.984375, + "learning_rate": 4.6331648853922225e-05, + "loss": 1.4692, + "step": 2399 + }, + { + "epoch": 0.17659232007358014, + "grad_norm": 0.9609375, + "learning_rate": 4.6328628103912666e-05, + "loss": 0.989, + "step": 2400 + }, + { + "epoch": 0.17666590020694412, + "grad_norm": 0.953125, + "learning_rate": 4.632560620923591e-05, + "loss": 0.8725, + "step": 2401 + }, + { + "epoch": 0.17673948034030812, + "grad_norm": 0.91796875, + "learning_rate": 4.6322583170054146e-05, + "loss": 1.1465, + "step": 2402 + }, + { + "epoch": 0.1768130604736721, + "grad_norm": 0.76171875, + "learning_rate": 4.631955898652962e-05, + "loss": 0.6109, + "step": 2403 + }, + { + "epoch": 0.1768866406070361, + "grad_norm": 0.82421875, + "learning_rate": 4.631653365882463e-05, + "loss": 1.066, + "step": 2404 + }, + { + "epoch": 0.1769602207404001, + "grad_norm": 0.76171875, + "learning_rate": 4.6313507187101544e-05, + "loss": 0.8509, + "step": 2405 + }, + { + "epoch": 0.17703380087376408, + "grad_norm": 0.89453125, + "learning_rate": 4.631047957152278e-05, + "loss": 0.7434, + "step": 2406 + }, + { + "epoch": 0.17710738100712808, + "grad_norm": 0.83984375, + "learning_rate": 4.630745081225083e-05, + "loss": 1.0909, + "step": 2407 + }, + { + "epoch": 0.17718096114049206, + "grad_norm": 0.78515625, + "learning_rate": 4.6304420909448235e-05, + "loss": 1.0208, + "step": 2408 + }, + { + "epoch": 0.17725454127385606, + "grad_norm": 0.84375, + "learning_rate": 4.63013898632776e-05, + "loss": 1.0963, + "step": 2409 + }, + { + "epoch": 0.17732812140722004, + "grad_norm": 0.78125, + "learning_rate": 4.6298357673901615e-05, + "loss": 0.8698, + "step": 2410 + }, + { + "epoch": 0.17740170154058404, + "grad_norm": 1.015625, + "learning_rate": 4.6295324341483e-05, + "loss": 0.945, + "step": 2411 + }, + { + "epoch": 0.17747528167394802, + "grad_norm": 0.8046875, + "learning_rate": 4.6292289866184546e-05, + "loss": 0.9404, + "step": 2412 + }, + { + "epoch": 0.17754886180731203, + "grad_norm": 0.7734375, + "learning_rate": 4.628925424816911e-05, + "loss": 0.7535, + "step": 2413 + }, + { + "epoch": 0.17762244194067603, + "grad_norm": 0.828125, + "learning_rate": 4.6286217487599616e-05, + "loss": 0.991, + "step": 2414 + }, + { + "epoch": 0.17769602207404, + "grad_norm": 1.125, + "learning_rate": 4.628317958463902e-05, + "loss": 1.5002, + "step": 2415 + }, + { + "epoch": 0.177769602207404, + "grad_norm": 0.84375, + "learning_rate": 4.628014053945038e-05, + "loss": 0.8467, + "step": 2416 + }, + { + "epoch": 0.17784318234076799, + "grad_norm": 1.03125, + "learning_rate": 4.627710035219679e-05, + "loss": 0.8584, + "step": 2417 + }, + { + "epoch": 0.177916762474132, + "grad_norm": 0.76171875, + "learning_rate": 4.627405902304141e-05, + "loss": 0.6925, + "step": 2418 + }, + { + "epoch": 0.17799034260749597, + "grad_norm": 0.85546875, + "learning_rate": 4.627101655214746e-05, + "loss": 1.1767, + "step": 2419 + }, + { + "epoch": 0.17806392274085997, + "grad_norm": 0.84765625, + "learning_rate": 4.626797293967824e-05, + "loss": 1.1363, + "step": 2420 + }, + { + "epoch": 0.17813750287422395, + "grad_norm": 0.828125, + "learning_rate": 4.626492818579707e-05, + "loss": 0.9227, + "step": 2421 + }, + { + "epoch": 0.17821108300758795, + "grad_norm": 0.8359375, + "learning_rate": 4.626188229066737e-05, + "loss": 0.6957, + "step": 2422 + }, + { + "epoch": 0.17828466314095195, + "grad_norm": 0.7734375, + "learning_rate": 4.62588352544526e-05, + "loss": 0.9482, + "step": 2423 + }, + { + "epoch": 0.17835824327431593, + "grad_norm": 0.7890625, + "learning_rate": 4.62557870773163e-05, + "loss": 0.9393, + "step": 2424 + }, + { + "epoch": 0.17843182340767993, + "grad_norm": 0.984375, + "learning_rate": 4.625273775942206e-05, + "loss": 1.0312, + "step": 2425 + }, + { + "epoch": 0.1785054035410439, + "grad_norm": 0.99609375, + "learning_rate": 4.6249687300933516e-05, + "loss": 1.1365, + "step": 2426 + }, + { + "epoch": 0.17857898367440792, + "grad_norm": 0.83203125, + "learning_rate": 4.6246635702014396e-05, + "loss": 0.6551, + "step": 2427 + }, + { + "epoch": 0.1786525638077719, + "grad_norm": 0.95703125, + "learning_rate": 4.6243582962828466e-05, + "loss": 1.3892, + "step": 2428 + }, + { + "epoch": 0.1787261439411359, + "grad_norm": 0.81640625, + "learning_rate": 4.6240529083539564e-05, + "loss": 0.9388, + "step": 2429 + }, + { + "epoch": 0.17879972407449987, + "grad_norm": 1.1015625, + "learning_rate": 4.6237474064311574e-05, + "loss": 1.5013, + "step": 2430 + }, + { + "epoch": 0.17887330420786388, + "grad_norm": 0.77734375, + "learning_rate": 4.623441790530847e-05, + "loss": 0.9194, + "step": 2431 + }, + { + "epoch": 0.17894688434122788, + "grad_norm": 0.84375, + "learning_rate": 4.6231360606694263e-05, + "loss": 1.0206, + "step": 2432 + }, + { + "epoch": 0.17902046447459186, + "grad_norm": 0.84765625, + "learning_rate": 4.622830216863303e-05, + "loss": 0.9787, + "step": 2433 + }, + { + "epoch": 0.17909404460795586, + "grad_norm": 0.78125, + "learning_rate": 4.6225242591288914e-05, + "loss": 0.692, + "step": 2434 + }, + { + "epoch": 0.17916762474131984, + "grad_norm": 1.03125, + "learning_rate": 4.622218187482612e-05, + "loss": 1.0808, + "step": 2435 + }, + { + "epoch": 0.17924120487468384, + "grad_norm": 0.73046875, + "learning_rate": 4.62191200194089e-05, + "loss": 0.9005, + "step": 2436 + }, + { + "epoch": 0.17931478500804782, + "grad_norm": 0.87890625, + "learning_rate": 4.62160570252016e-05, + "loss": 0.9334, + "step": 2437 + }, + { + "epoch": 0.17938836514141182, + "grad_norm": 0.78125, + "learning_rate": 4.621299289236858e-05, + "loss": 0.9676, + "step": 2438 + }, + { + "epoch": 0.1794619452747758, + "grad_norm": 1.1875, + "learning_rate": 4.62099276210743e-05, + "loss": 1.203, + "step": 2439 + }, + { + "epoch": 0.1795355254081398, + "grad_norm": 0.90625, + "learning_rate": 4.620686121148326e-05, + "loss": 1.0788, + "step": 2440 + }, + { + "epoch": 0.1796091055415038, + "grad_norm": 0.7421875, + "learning_rate": 4.620379366376004e-05, + "loss": 0.7248, + "step": 2441 + }, + { + "epoch": 0.17968268567486778, + "grad_norm": 1.0546875, + "learning_rate": 4.620072497806926e-05, + "loss": 1.0184, + "step": 2442 + }, + { + "epoch": 0.17975626580823179, + "grad_norm": 0.8984375, + "learning_rate": 4.6197655154575615e-05, + "loss": 0.9649, + "step": 2443 + }, + { + "epoch": 0.17982984594159576, + "grad_norm": 0.87109375, + "learning_rate": 4.619458419344385e-05, + "loss": 0.9972, + "step": 2444 + }, + { + "epoch": 0.17990342607495977, + "grad_norm": 0.81640625, + "learning_rate": 4.619151209483878e-05, + "loss": 1.0228, + "step": 2445 + }, + { + "epoch": 0.17997700620832374, + "grad_norm": 0.80078125, + "learning_rate": 4.618843885892529e-05, + "loss": 0.9985, + "step": 2446 + }, + { + "epoch": 0.18005058634168775, + "grad_norm": 0.98828125, + "learning_rate": 4.61853644858683e-05, + "loss": 1.6175, + "step": 2447 + }, + { + "epoch": 0.18012416647505172, + "grad_norm": 1.25, + "learning_rate": 4.618228897583281e-05, + "loss": 1.115, + "step": 2448 + }, + { + "epoch": 0.18019774660841573, + "grad_norm": 0.95703125, + "learning_rate": 4.617921232898388e-05, + "loss": 1.3514, + "step": 2449 + }, + { + "epoch": 0.18027132674177973, + "grad_norm": 0.81640625, + "learning_rate": 4.617613454548663e-05, + "loss": 0.9386, + "step": 2450 + }, + { + "epoch": 0.1803449068751437, + "grad_norm": 0.890625, + "learning_rate": 4.6173055625506236e-05, + "loss": 0.8495, + "step": 2451 + }, + { + "epoch": 0.1804184870085077, + "grad_norm": 0.78515625, + "learning_rate": 4.616997556920793e-05, + "loss": 0.7815, + "step": 2452 + }, + { + "epoch": 0.1804920671418717, + "grad_norm": 0.9765625, + "learning_rate": 4.616689437675702e-05, + "loss": 0.9896, + "step": 2453 + }, + { + "epoch": 0.1805656472752357, + "grad_norm": 0.8671875, + "learning_rate": 4.616381204831887e-05, + "loss": 0.9494, + "step": 2454 + }, + { + "epoch": 0.18063922740859967, + "grad_norm": 0.76953125, + "learning_rate": 4.61607285840589e-05, + "loss": 0.6253, + "step": 2455 + }, + { + "epoch": 0.18071280754196367, + "grad_norm": 0.828125, + "learning_rate": 4.6157643984142595e-05, + "loss": 0.9087, + "step": 2456 + }, + { + "epoch": 0.18078638767532765, + "grad_norm": 1.0390625, + "learning_rate": 4.61545582487355e-05, + "loss": 1.4828, + "step": 2457 + }, + { + "epoch": 0.18085996780869165, + "grad_norm": 0.9765625, + "learning_rate": 4.615147137800321e-05, + "loss": 1.1915, + "step": 2458 + }, + { + "epoch": 0.18093354794205566, + "grad_norm": 0.953125, + "learning_rate": 4.6148383372111406e-05, + "loss": 1.0188, + "step": 2459 + }, + { + "epoch": 0.18100712807541963, + "grad_norm": 0.96875, + "learning_rate": 4.6145294231225816e-05, + "loss": 1.111, + "step": 2460 + }, + { + "epoch": 0.18108070820878364, + "grad_norm": 0.81640625, + "learning_rate": 4.614220395551222e-05, + "loss": 1.0745, + "step": 2461 + }, + { + "epoch": 0.1811542883421476, + "grad_norm": 0.81640625, + "learning_rate": 4.6139112545136466e-05, + "loss": 0.8347, + "step": 2462 + }, + { + "epoch": 0.18122786847551162, + "grad_norm": 0.81640625, + "learning_rate": 4.6136020000264466e-05, + "loss": 0.7947, + "step": 2463 + }, + { + "epoch": 0.1813014486088756, + "grad_norm": 1.0703125, + "learning_rate": 4.61329263210622e-05, + "loss": 0.8838, + "step": 2464 + }, + { + "epoch": 0.1813750287422396, + "grad_norm": 0.69921875, + "learning_rate": 4.6129831507695684e-05, + "loss": 0.7092, + "step": 2465 + }, + { + "epoch": 0.1814486088756036, + "grad_norm": 0.82421875, + "learning_rate": 4.612673556033103e-05, + "loss": 0.6203, + "step": 2466 + }, + { + "epoch": 0.18152218900896758, + "grad_norm": 1.09375, + "learning_rate": 4.612363847913437e-05, + "loss": 1.364, + "step": 2467 + }, + { + "epoch": 0.18159576914233158, + "grad_norm": 1.0234375, + "learning_rate": 4.612054026427193e-05, + "loss": 0.9108, + "step": 2468 + }, + { + "epoch": 0.18166934927569556, + "grad_norm": 1.4921875, + "learning_rate": 4.611744091590999e-05, + "loss": 0.9871, + "step": 2469 + }, + { + "epoch": 0.18174292940905956, + "grad_norm": 1.078125, + "learning_rate": 4.611434043421489e-05, + "loss": 1.2519, + "step": 2470 + }, + { + "epoch": 0.18181650954242354, + "grad_norm": 0.93359375, + "learning_rate": 4.6111238819353005e-05, + "loss": 0.969, + "step": 2471 + }, + { + "epoch": 0.18189008967578754, + "grad_norm": 0.82421875, + "learning_rate": 4.610813607149081e-05, + "loss": 0.8348, + "step": 2472 + }, + { + "epoch": 0.18196366980915152, + "grad_norm": 0.87890625, + "learning_rate": 4.6105032190794816e-05, + "loss": 1.1528, + "step": 2473 + }, + { + "epoch": 0.18203724994251552, + "grad_norm": 0.94140625, + "learning_rate": 4.610192717743162e-05, + "loss": 1.0308, + "step": 2474 + }, + { + "epoch": 0.18211083007587953, + "grad_norm": 0.9453125, + "learning_rate": 4.609882103156783e-05, + "loss": 1.0408, + "step": 2475 + }, + { + "epoch": 0.1821844102092435, + "grad_norm": 0.703125, + "learning_rate": 4.6095713753370174e-05, + "loss": 0.7387, + "step": 2476 + }, + { + "epoch": 0.1822579903426075, + "grad_norm": 1.3515625, + "learning_rate": 4.609260534300541e-05, + "loss": 0.9955, + "step": 2477 + }, + { + "epoch": 0.18233157047597148, + "grad_norm": 0.7578125, + "learning_rate": 4.608949580064035e-05, + "loss": 0.6883, + "step": 2478 + }, + { + "epoch": 0.1824051506093355, + "grad_norm": 0.859375, + "learning_rate": 4.608638512644188e-05, + "loss": 0.7779, + "step": 2479 + }, + { + "epoch": 0.18247873074269946, + "grad_norm": 0.8671875, + "learning_rate": 4.6083273320576945e-05, + "loss": 1.2412, + "step": 2480 + }, + { + "epoch": 0.18255231087606347, + "grad_norm": 1.09375, + "learning_rate": 4.6080160383212556e-05, + "loss": 1.2351, + "step": 2481 + }, + { + "epoch": 0.18262589100942744, + "grad_norm": 0.8359375, + "learning_rate": 4.607704631451578e-05, + "loss": 0.96, + "step": 2482 + }, + { + "epoch": 0.18269947114279145, + "grad_norm": 0.96875, + "learning_rate": 4.607393111465373e-05, + "loss": 0.9822, + "step": 2483 + }, + { + "epoch": 0.18277305127615545, + "grad_norm": 1.1171875, + "learning_rate": 4.60708147837936e-05, + "loss": 1.2662, + "step": 2484 + }, + { + "epoch": 0.18284663140951943, + "grad_norm": 0.87890625, + "learning_rate": 4.6067697322102646e-05, + "loss": 1.0083, + "step": 2485 + }, + { + "epoch": 0.18292021154288343, + "grad_norm": 0.89453125, + "learning_rate": 4.606457872974816e-05, + "loss": 0.9201, + "step": 2486 + }, + { + "epoch": 0.1829937916762474, + "grad_norm": 0.89453125, + "learning_rate": 4.606145900689751e-05, + "loss": 1.021, + "step": 2487 + }, + { + "epoch": 0.1830673718096114, + "grad_norm": 0.88671875, + "learning_rate": 4.605833815371815e-05, + "loss": 0.9086, + "step": 2488 + }, + { + "epoch": 0.1831409519429754, + "grad_norm": 0.703125, + "learning_rate": 4.605521617037755e-05, + "loss": 0.643, + "step": 2489 + }, + { + "epoch": 0.1832145320763394, + "grad_norm": 0.92578125, + "learning_rate": 4.6052093057043264e-05, + "loss": 1.0791, + "step": 2490 + }, + { + "epoch": 0.18328811220970337, + "grad_norm": 0.83984375, + "learning_rate": 4.604896881388291e-05, + "loss": 0.8522, + "step": 2491 + }, + { + "epoch": 0.18336169234306737, + "grad_norm": 0.9375, + "learning_rate": 4.6045843441064153e-05, + "loss": 0.7927, + "step": 2492 + }, + { + "epoch": 0.18343527247643138, + "grad_norm": 1.0625, + "learning_rate": 4.6042716938754726e-05, + "loss": 0.9329, + "step": 2493 + }, + { + "epoch": 0.18350885260979535, + "grad_norm": 0.796875, + "learning_rate": 4.603958930712242e-05, + "loss": 0.8862, + "step": 2494 + }, + { + "epoch": 0.18358243274315936, + "grad_norm": 1.0546875, + "learning_rate": 4.60364605463351e-05, + "loss": 1.2713, + "step": 2495 + }, + { + "epoch": 0.18365601287652333, + "grad_norm": 0.921875, + "learning_rate": 4.603333065656068e-05, + "loss": 1.0224, + "step": 2496 + }, + { + "epoch": 0.18372959300988734, + "grad_norm": 0.84375, + "learning_rate": 4.6030199637967126e-05, + "loss": 1.1144, + "step": 2497 + }, + { + "epoch": 0.1838031731432513, + "grad_norm": 0.73046875, + "learning_rate": 4.6027067490722475e-05, + "loss": 0.5707, + "step": 2498 + }, + { + "epoch": 0.18387675327661532, + "grad_norm": 0.82421875, + "learning_rate": 4.602393421499483e-05, + "loss": 0.7865, + "step": 2499 + }, + { + "epoch": 0.1839503334099793, + "grad_norm": 0.71484375, + "learning_rate": 4.602079981095234e-05, + "loss": 0.8931, + "step": 2500 + }, + { + "epoch": 0.1840239135433433, + "grad_norm": 1.0234375, + "learning_rate": 4.6017664278763225e-05, + "loss": 1.0626, + "step": 2501 + }, + { + "epoch": 0.1840974936767073, + "grad_norm": 0.828125, + "learning_rate": 4.6014527618595776e-05, + "loss": 0.8422, + "step": 2502 + }, + { + "epoch": 0.18417107381007128, + "grad_norm": 0.89453125, + "learning_rate": 4.601138983061831e-05, + "loss": 0.9965, + "step": 2503 + }, + { + "epoch": 0.18424465394343528, + "grad_norm": 1.2578125, + "learning_rate": 4.600825091499924e-05, + "loss": 1.3074, + "step": 2504 + }, + { + "epoch": 0.18431823407679926, + "grad_norm": 0.91796875, + "learning_rate": 4.6005110871907024e-05, + "loss": 0.6125, + "step": 2505 + }, + { + "epoch": 0.18439181421016326, + "grad_norm": 0.93359375, + "learning_rate": 4.6001969701510186e-05, + "loss": 1.3469, + "step": 2506 + }, + { + "epoch": 0.18446539434352724, + "grad_norm": 1.09375, + "learning_rate": 4.599882740397729e-05, + "loss": 1.126, + "step": 2507 + }, + { + "epoch": 0.18453897447689124, + "grad_norm": 0.98046875, + "learning_rate": 4.5995683979476995e-05, + "loss": 0.9961, + "step": 2508 + }, + { + "epoch": 0.18461255461025522, + "grad_norm": 0.84375, + "learning_rate": 4.599253942817799e-05, + "loss": 0.789, + "step": 2509 + }, + { + "epoch": 0.18468613474361922, + "grad_norm": 0.78125, + "learning_rate": 4.598939375024905e-05, + "loss": 0.7734, + "step": 2510 + }, + { + "epoch": 0.18475971487698323, + "grad_norm": 1.1484375, + "learning_rate": 4.598624694585899e-05, + "loss": 1.2937, + "step": 2511 + }, + { + "epoch": 0.1848332950103472, + "grad_norm": 0.83203125, + "learning_rate": 4.598309901517669e-05, + "loss": 0.9011, + "step": 2512 + }, + { + "epoch": 0.1849068751437112, + "grad_norm": 1.1484375, + "learning_rate": 4.59799499583711e-05, + "loss": 1.3486, + "step": 2513 + }, + { + "epoch": 0.18498045527707518, + "grad_norm": 1.0234375, + "learning_rate": 4.597679977561122e-05, + "loss": 0.8625, + "step": 2514 + }, + { + "epoch": 0.1850540354104392, + "grad_norm": 0.8984375, + "learning_rate": 4.597364846706612e-05, + "loss": 0.6354, + "step": 2515 + }, + { + "epoch": 0.18512761554380316, + "grad_norm": 0.73046875, + "learning_rate": 4.597049603290491e-05, + "loss": 0.649, + "step": 2516 + }, + { + "epoch": 0.18520119567716717, + "grad_norm": 0.77734375, + "learning_rate": 4.5967342473296794e-05, + "loss": 0.984, + "step": 2517 + }, + { + "epoch": 0.18527477581053114, + "grad_norm": 0.88671875, + "learning_rate": 4.5964187788411004e-05, + "loss": 0.7508, + "step": 2518 + }, + { + "epoch": 0.18534835594389515, + "grad_norm": 0.71875, + "learning_rate": 4.596103197841686e-05, + "loss": 0.7298, + "step": 2519 + }, + { + "epoch": 0.18542193607725915, + "grad_norm": 0.8828125, + "learning_rate": 4.595787504348371e-05, + "loss": 0.8501, + "step": 2520 + }, + { + "epoch": 0.18549551621062313, + "grad_norm": 1.7421875, + "learning_rate": 4.5954716983780995e-05, + "loss": 1.4432, + "step": 2521 + }, + { + "epoch": 0.18556909634398713, + "grad_norm": 0.921875, + "learning_rate": 4.59515577994782e-05, + "loss": 0.9085, + "step": 2522 + }, + { + "epoch": 0.1856426764773511, + "grad_norm": 0.8359375, + "learning_rate": 4.594839749074486e-05, + "loss": 1.2709, + "step": 2523 + }, + { + "epoch": 0.1857162566107151, + "grad_norm": 0.83984375, + "learning_rate": 4.59452360577506e-05, + "loss": 1.0379, + "step": 2524 + }, + { + "epoch": 0.1857898367440791, + "grad_norm": 0.80078125, + "learning_rate": 4.5942073500665076e-05, + "loss": 1.0317, + "step": 2525 + }, + { + "epoch": 0.1858634168774431, + "grad_norm": 0.9921875, + "learning_rate": 4.593890981965803e-05, + "loss": 0.8333, + "step": 2526 + }, + { + "epoch": 0.18593699701080707, + "grad_norm": 0.82421875, + "learning_rate": 4.593574501489923e-05, + "loss": 1.1169, + "step": 2527 + }, + { + "epoch": 0.18601057714417107, + "grad_norm": 0.87890625, + "learning_rate": 4.5932579086558545e-05, + "loss": 0.9457, + "step": 2528 + }, + { + "epoch": 0.18608415727753508, + "grad_norm": 0.86328125, + "learning_rate": 4.592941203480587e-05, + "loss": 1.0812, + "step": 2529 + }, + { + "epoch": 0.18615773741089905, + "grad_norm": 0.73046875, + "learning_rate": 4.592624385981119e-05, + "loss": 0.7941, + "step": 2530 + }, + { + "epoch": 0.18623131754426306, + "grad_norm": 0.91796875, + "learning_rate": 4.592307456174452e-05, + "loss": 1.1419, + "step": 2531 + }, + { + "epoch": 0.18630489767762703, + "grad_norm": 0.8359375, + "learning_rate": 4.591990414077596e-05, + "loss": 0.9335, + "step": 2532 + }, + { + "epoch": 0.18637847781099104, + "grad_norm": 0.80078125, + "learning_rate": 4.5916732597075653e-05, + "loss": 0.694, + "step": 2533 + }, + { + "epoch": 0.18645205794435502, + "grad_norm": 0.87890625, + "learning_rate": 4.591355993081382e-05, + "loss": 1.0429, + "step": 2534 + }, + { + "epoch": 0.18652563807771902, + "grad_norm": 1.0546875, + "learning_rate": 4.591038614216072e-05, + "loss": 1.0098, + "step": 2535 + }, + { + "epoch": 0.186599218211083, + "grad_norm": 0.84765625, + "learning_rate": 4.590721123128669e-05, + "loss": 0.8707, + "step": 2536 + }, + { + "epoch": 0.186672798344447, + "grad_norm": 0.90234375, + "learning_rate": 4.590403519836212e-05, + "loss": 0.9409, + "step": 2537 + }, + { + "epoch": 0.186746378477811, + "grad_norm": 0.9921875, + "learning_rate": 4.590085804355747e-05, + "loss": 1.2149, + "step": 2538 + }, + { + "epoch": 0.18681995861117498, + "grad_norm": 0.87109375, + "learning_rate": 4.5897679767043244e-05, + "loss": 0.6591, + "step": 2539 + }, + { + "epoch": 0.18689353874453898, + "grad_norm": 0.8984375, + "learning_rate": 4.589450036899001e-05, + "loss": 1.385, + "step": 2540 + }, + { + "epoch": 0.18696711887790296, + "grad_norm": 1.015625, + "learning_rate": 4.5891319849568406e-05, + "loss": 1.0407, + "step": 2541 + }, + { + "epoch": 0.18704069901126696, + "grad_norm": 0.87109375, + "learning_rate": 4.5888138208949126e-05, + "loss": 0.8642, + "step": 2542 + }, + { + "epoch": 0.18711427914463094, + "grad_norm": 0.890625, + "learning_rate": 4.5884955447302916e-05, + "loss": 0.9204, + "step": 2543 + }, + { + "epoch": 0.18718785927799494, + "grad_norm": 0.76171875, + "learning_rate": 4.58817715648006e-05, + "loss": 0.8527, + "step": 2544 + }, + { + "epoch": 0.18726143941135892, + "grad_norm": 0.91015625, + "learning_rate": 4.587858656161303e-05, + "loss": 1.162, + "step": 2545 + }, + { + "epoch": 0.18733501954472292, + "grad_norm": 1.0546875, + "learning_rate": 4.587540043791117e-05, + "loss": 1.3641, + "step": 2546 + }, + { + "epoch": 0.18740859967808693, + "grad_norm": 0.75390625, + "learning_rate": 4.587221319386599e-05, + "loss": 0.6579, + "step": 2547 + }, + { + "epoch": 0.1874821798114509, + "grad_norm": 1.0625, + "learning_rate": 4.5869024829648555e-05, + "loss": 1.2344, + "step": 2548 + }, + { + "epoch": 0.1875557599448149, + "grad_norm": 0.734375, + "learning_rate": 4.586583534542996e-05, + "loss": 0.7285, + "step": 2549 + }, + { + "epoch": 0.18762934007817889, + "grad_norm": 1.03125, + "learning_rate": 4.586264474138141e-05, + "loss": 1.0602, + "step": 2550 + }, + { + "epoch": 0.1877029202115429, + "grad_norm": 0.8828125, + "learning_rate": 4.58594530176741e-05, + "loss": 0.9655, + "step": 2551 + }, + { + "epoch": 0.18777650034490687, + "grad_norm": 0.79296875, + "learning_rate": 4.585626017447936e-05, + "loss": 0.7523, + "step": 2552 + }, + { + "epoch": 0.18785008047827087, + "grad_norm": 1.0078125, + "learning_rate": 4.585306621196853e-05, + "loss": 1.1229, + "step": 2553 + }, + { + "epoch": 0.18792366061163485, + "grad_norm": 1.3203125, + "learning_rate": 4.584987113031301e-05, + "loss": 1.4735, + "step": 2554 + }, + { + "epoch": 0.18799724074499885, + "grad_norm": 0.87890625, + "learning_rate": 4.58466749296843e-05, + "loss": 0.7991, + "step": 2555 + }, + { + "epoch": 0.18807082087836285, + "grad_norm": 0.89453125, + "learning_rate": 4.5843477610253906e-05, + "loss": 0.8562, + "step": 2556 + }, + { + "epoch": 0.18814440101172683, + "grad_norm": 1.0078125, + "learning_rate": 4.584027917219345e-05, + "loss": 1.2277, + "step": 2557 + }, + { + "epoch": 0.18821798114509083, + "grad_norm": 0.671875, + "learning_rate": 4.583707961567456e-05, + "loss": 0.7215, + "step": 2558 + }, + { + "epoch": 0.1882915612784548, + "grad_norm": 0.82421875, + "learning_rate": 4.5833878940868966e-05, + "loss": 0.7097, + "step": 2559 + }, + { + "epoch": 0.18836514141181881, + "grad_norm": 0.91015625, + "learning_rate": 4.5830677147948445e-05, + "loss": 0.9554, + "step": 2560 + }, + { + "epoch": 0.1884387215451828, + "grad_norm": 0.67578125, + "learning_rate": 4.5827474237084824e-05, + "loss": 0.7611, + "step": 2561 + }, + { + "epoch": 0.1885123016785468, + "grad_norm": 0.98046875, + "learning_rate": 4.5824270208449996e-05, + "loss": 1.4304, + "step": 2562 + }, + { + "epoch": 0.18858588181191077, + "grad_norm": 0.76953125, + "learning_rate": 4.582106506221591e-05, + "loss": 0.7863, + "step": 2563 + }, + { + "epoch": 0.18865946194527478, + "grad_norm": 0.859375, + "learning_rate": 4.5817858798554594e-05, + "loss": 0.9078, + "step": 2564 + }, + { + "epoch": 0.18873304207863878, + "grad_norm": 1.1171875, + "learning_rate": 4.581465141763812e-05, + "loss": 1.2897, + "step": 2565 + }, + { + "epoch": 0.18880662221200276, + "grad_norm": 0.79296875, + "learning_rate": 4.5811442919638614e-05, + "loss": 0.8086, + "step": 2566 + }, + { + "epoch": 0.18888020234536676, + "grad_norm": 0.80859375, + "learning_rate": 4.580823330472827e-05, + "loss": 0.9277, + "step": 2567 + }, + { + "epoch": 0.18895378247873074, + "grad_norm": 0.8125, + "learning_rate": 4.580502257307935e-05, + "loss": 0.6848, + "step": 2568 + }, + { + "epoch": 0.18902736261209474, + "grad_norm": 0.921875, + "learning_rate": 4.5801810724864165e-05, + "loss": 0.9872, + "step": 2569 + }, + { + "epoch": 0.18910094274545872, + "grad_norm": 0.8515625, + "learning_rate": 4.5798597760255076e-05, + "loss": 1.2729, + "step": 2570 + }, + { + "epoch": 0.18917452287882272, + "grad_norm": 0.6796875, + "learning_rate": 4.579538367942454e-05, + "loss": 0.6431, + "step": 2571 + }, + { + "epoch": 0.1892481030121867, + "grad_norm": 1.015625, + "learning_rate": 4.579216848254504e-05, + "loss": 1.1885, + "step": 2572 + }, + { + "epoch": 0.1893216831455507, + "grad_norm": 0.8125, + "learning_rate": 4.578895216978912e-05, + "loss": 0.9254, + "step": 2573 + }, + { + "epoch": 0.1893952632789147, + "grad_norm": 0.875, + "learning_rate": 4.578573474132941e-05, + "loss": 1.0611, + "step": 2574 + }, + { + "epoch": 0.18946884341227868, + "grad_norm": 0.953125, + "learning_rate": 4.578251619733858e-05, + "loss": 0.9559, + "step": 2575 + }, + { + "epoch": 0.18954242354564269, + "grad_norm": 0.9921875, + "learning_rate": 4.5779296537989344e-05, + "loss": 1.5196, + "step": 2576 + }, + { + "epoch": 0.18961600367900666, + "grad_norm": 0.8984375, + "learning_rate": 4.577607576345452e-05, + "loss": 1.1808, + "step": 2577 + }, + { + "epoch": 0.18968958381237067, + "grad_norm": 0.8671875, + "learning_rate": 4.577285387390694e-05, + "loss": 0.8348, + "step": 2578 + }, + { + "epoch": 0.18976316394573464, + "grad_norm": 1.203125, + "learning_rate": 4.576963086951953e-05, + "loss": 1.0998, + "step": 2579 + }, + { + "epoch": 0.18983674407909865, + "grad_norm": 0.8359375, + "learning_rate": 4.576640675046526e-05, + "loss": 1.0317, + "step": 2580 + }, + { + "epoch": 0.18991032421246262, + "grad_norm": 0.703125, + "learning_rate": 4.576318151691716e-05, + "loss": 0.882, + "step": 2581 + }, + { + "epoch": 0.18998390434582663, + "grad_norm": 0.86328125, + "learning_rate": 4.5759955169048334e-05, + "loss": 0.9327, + "step": 2582 + }, + { + "epoch": 0.19005748447919063, + "grad_norm": 0.90625, + "learning_rate": 4.575672770703192e-05, + "loss": 1.2367, + "step": 2583 + }, + { + "epoch": 0.1901310646125546, + "grad_norm": 0.87890625, + "learning_rate": 4.575349913104113e-05, + "loss": 0.913, + "step": 2584 + }, + { + "epoch": 0.1902046447459186, + "grad_norm": 1.0390625, + "learning_rate": 4.575026944124924e-05, + "loss": 0.8268, + "step": 2585 + }, + { + "epoch": 0.1902782248792826, + "grad_norm": 1.1015625, + "learning_rate": 4.5747038637829584e-05, + "loss": 1.0053, + "step": 2586 + }, + { + "epoch": 0.1903518050126466, + "grad_norm": 0.8359375, + "learning_rate": 4.5743806720955546e-05, + "loss": 1.0264, + "step": 2587 + }, + { + "epoch": 0.19042538514601057, + "grad_norm": 0.9140625, + "learning_rate": 4.574057369080058e-05, + "loss": 0.8716, + "step": 2588 + }, + { + "epoch": 0.19049896527937457, + "grad_norm": 0.81640625, + "learning_rate": 4.5737339547538204e-05, + "loss": 0.9254, + "step": 2589 + }, + { + "epoch": 0.19057254541273855, + "grad_norm": 0.953125, + "learning_rate": 4.5734104291341974e-05, + "loss": 0.8532, + "step": 2590 + }, + { + "epoch": 0.19064612554610255, + "grad_norm": 0.84765625, + "learning_rate": 4.5730867922385536e-05, + "loss": 1.2998, + "step": 2591 + }, + { + "epoch": 0.19071970567946656, + "grad_norm": 0.859375, + "learning_rate": 4.572763044084256e-05, + "loss": 0.8121, + "step": 2592 + }, + { + "epoch": 0.19079328581283053, + "grad_norm": 0.71875, + "learning_rate": 4.572439184688682e-05, + "loss": 0.9667, + "step": 2593 + }, + { + "epoch": 0.19086686594619454, + "grad_norm": 0.921875, + "learning_rate": 4.5721152140692105e-05, + "loss": 1.044, + "step": 2594 + }, + { + "epoch": 0.1909404460795585, + "grad_norm": 0.9375, + "learning_rate": 4.5717911322432295e-05, + "loss": 0.8499, + "step": 2595 + }, + { + "epoch": 0.19101402621292252, + "grad_norm": 0.84765625, + "learning_rate": 4.571466939228131e-05, + "loss": 1.106, + "step": 2596 + }, + { + "epoch": 0.1910876063462865, + "grad_norm": 0.89453125, + "learning_rate": 4.571142635041314e-05, + "loss": 0.5618, + "step": 2597 + }, + { + "epoch": 0.1911611864796505, + "grad_norm": 0.93359375, + "learning_rate": 4.570818219700185e-05, + "loss": 0.9936, + "step": 2598 + }, + { + "epoch": 0.19123476661301447, + "grad_norm": 0.671875, + "learning_rate": 4.570493693222152e-05, + "loss": 0.603, + "step": 2599 + }, + { + "epoch": 0.19130834674637848, + "grad_norm": 1.03125, + "learning_rate": 4.570169055624634e-05, + "loss": 0.8996, + "step": 2600 + }, + { + "epoch": 0.19138192687974248, + "grad_norm": 0.89453125, + "learning_rate": 4.569844306925052e-05, + "loss": 1.1775, + "step": 2601 + }, + { + "epoch": 0.19145550701310646, + "grad_norm": 0.98828125, + "learning_rate": 4.5695194471408366e-05, + "loss": 0.8291, + "step": 2602 + }, + { + "epoch": 0.19152908714647046, + "grad_norm": 0.9921875, + "learning_rate": 4.56919447628942e-05, + "loss": 0.9419, + "step": 2603 + }, + { + "epoch": 0.19160266727983444, + "grad_norm": 0.875, + "learning_rate": 4.5688693943882446e-05, + "loss": 1.0143, + "step": 2604 + }, + { + "epoch": 0.19167624741319844, + "grad_norm": 1.03125, + "learning_rate": 4.5685442014547563e-05, + "loss": 1.0611, + "step": 2605 + }, + { + "epoch": 0.19174982754656242, + "grad_norm": 1.1484375, + "learning_rate": 4.568218897506408e-05, + "loss": 0.9734, + "step": 2606 + }, + { + "epoch": 0.19182340767992642, + "grad_norm": 0.76953125, + "learning_rate": 4.567893482560657e-05, + "loss": 0.8655, + "step": 2607 + }, + { + "epoch": 0.1918969878132904, + "grad_norm": 0.94140625, + "learning_rate": 4.567567956634969e-05, + "loss": 1.0493, + "step": 2608 + }, + { + "epoch": 0.1919705679466544, + "grad_norm": 1.046875, + "learning_rate": 4.567242319746814e-05, + "loss": 1.328, + "step": 2609 + }, + { + "epoch": 0.1920441480800184, + "grad_norm": 0.9453125, + "learning_rate": 4.5669165719136675e-05, + "loss": 1.341, + "step": 2610 + }, + { + "epoch": 0.19211772821338238, + "grad_norm": 1.0859375, + "learning_rate": 4.566590713153013e-05, + "loss": 0.9836, + "step": 2611 + }, + { + "epoch": 0.1921913083467464, + "grad_norm": 1.3203125, + "learning_rate": 4.566264743482338e-05, + "loss": 0.8758, + "step": 2612 + }, + { + "epoch": 0.19226488848011036, + "grad_norm": 0.89453125, + "learning_rate": 4.565938662919137e-05, + "loss": 0.8905, + "step": 2613 + }, + { + "epoch": 0.19233846861347437, + "grad_norm": 0.86328125, + "learning_rate": 4.5656124714809096e-05, + "loss": 0.7487, + "step": 2614 + }, + { + "epoch": 0.19241204874683834, + "grad_norm": 0.8125, + "learning_rate": 4.5652861691851624e-05, + "loss": 0.6465, + "step": 2615 + }, + { + "epoch": 0.19248562888020235, + "grad_norm": 1.15625, + "learning_rate": 4.564959756049407e-05, + "loss": 1.4017, + "step": 2616 + }, + { + "epoch": 0.19255920901356632, + "grad_norm": 0.88671875, + "learning_rate": 4.564633232091162e-05, + "loss": 0.9095, + "step": 2617 + }, + { + "epoch": 0.19263278914693033, + "grad_norm": 0.9140625, + "learning_rate": 4.564306597327951e-05, + "loss": 0.6735, + "step": 2618 + }, + { + "epoch": 0.19270636928029433, + "grad_norm": 0.90625, + "learning_rate": 4.563979851777304e-05, + "loss": 1.0966, + "step": 2619 + }, + { + "epoch": 0.1927799494136583, + "grad_norm": 0.99609375, + "learning_rate": 4.563652995456756e-05, + "loss": 0.9686, + "step": 2620 + }, + { + "epoch": 0.1928535295470223, + "grad_norm": 0.92578125, + "learning_rate": 4.5633260283838504e-05, + "loss": 0.862, + "step": 2621 + }, + { + "epoch": 0.1929271096803863, + "grad_norm": 0.80859375, + "learning_rate": 4.5629989505761325e-05, + "loss": 1.1745, + "step": 2622 + }, + { + "epoch": 0.1930006898137503, + "grad_norm": 0.8984375, + "learning_rate": 4.562671762051159e-05, + "loss": 0.8492, + "step": 2623 + }, + { + "epoch": 0.19307426994711427, + "grad_norm": 0.7890625, + "learning_rate": 4.5623444628264864e-05, + "loss": 1.0222, + "step": 2624 + }, + { + "epoch": 0.19314785008047827, + "grad_norm": 0.796875, + "learning_rate": 4.562017052919683e-05, + "loss": 1.0419, + "step": 2625 + }, + { + "epoch": 0.19322143021384225, + "grad_norm": 1.0078125, + "learning_rate": 4.5616895323483184e-05, + "loss": 1.004, + "step": 2626 + }, + { + "epoch": 0.19329501034720625, + "grad_norm": 0.9375, + "learning_rate": 4.56136190112997e-05, + "loss": 0.8283, + "step": 2627 + }, + { + "epoch": 0.19336859048057026, + "grad_norm": 0.796875, + "learning_rate": 4.5610341592822224e-05, + "loss": 0.984, + "step": 2628 + }, + { + "epoch": 0.19344217061393423, + "grad_norm": 0.81640625, + "learning_rate": 4.560706306822664e-05, + "loss": 1.3144, + "step": 2629 + }, + { + "epoch": 0.19351575074729824, + "grad_norm": 0.9765625, + "learning_rate": 4.5603783437688906e-05, + "loss": 1.0334, + "step": 2630 + }, + { + "epoch": 0.1935893308806622, + "grad_norm": 0.625, + "learning_rate": 4.5600502701385026e-05, + "loss": 0.6328, + "step": 2631 + }, + { + "epoch": 0.19366291101402622, + "grad_norm": 0.80078125, + "learning_rate": 4.559722085949107e-05, + "loss": 0.8362, + "step": 2632 + }, + { + "epoch": 0.1937364911473902, + "grad_norm": 0.71484375, + "learning_rate": 4.559393791218318e-05, + "loss": 0.7109, + "step": 2633 + }, + { + "epoch": 0.1938100712807542, + "grad_norm": 0.8515625, + "learning_rate": 4.5590653859637545e-05, + "loss": 0.7784, + "step": 2634 + }, + { + "epoch": 0.19388365141411817, + "grad_norm": 1.0625, + "learning_rate": 4.5587368702030396e-05, + "loss": 1.3132, + "step": 2635 + }, + { + "epoch": 0.19395723154748218, + "grad_norm": 0.84375, + "learning_rate": 4.5584082439538055e-05, + "loss": 0.6926, + "step": 2636 + }, + { + "epoch": 0.19403081168084618, + "grad_norm": 0.953125, + "learning_rate": 4.5580795072336894e-05, + "loss": 0.7574, + "step": 2637 + }, + { + "epoch": 0.19410439181421016, + "grad_norm": 0.78125, + "learning_rate": 4.5577506600603326e-05, + "loss": 1.2969, + "step": 2638 + }, + { + "epoch": 0.19417797194757416, + "grad_norm": 0.76953125, + "learning_rate": 4.5574217024513846e-05, + "loss": 0.6515, + "step": 2639 + }, + { + "epoch": 0.19425155208093814, + "grad_norm": 0.99609375, + "learning_rate": 4.5570926344245003e-05, + "loss": 0.9621, + "step": 2640 + }, + { + "epoch": 0.19432513221430214, + "grad_norm": 0.7578125, + "learning_rate": 4.5567634559973394e-05, + "loss": 0.7323, + "step": 2641 + }, + { + "epoch": 0.19439871234766612, + "grad_norm": 0.953125, + "learning_rate": 4.5564341671875674e-05, + "loss": 1.1237, + "step": 2642 + }, + { + "epoch": 0.19447229248103012, + "grad_norm": 1.0625, + "learning_rate": 4.556104768012859e-05, + "loss": 1.3252, + "step": 2643 + }, + { + "epoch": 0.1945458726143941, + "grad_norm": 1.125, + "learning_rate": 4.5557752584908906e-05, + "loss": 1.3102, + "step": 2644 + }, + { + "epoch": 0.1946194527477581, + "grad_norm": 0.9453125, + "learning_rate": 4.555445638639347e-05, + "loss": 0.8272, + "step": 2645 + }, + { + "epoch": 0.1946930328811221, + "grad_norm": 0.74609375, + "learning_rate": 4.555115908475918e-05, + "loss": 0.8254, + "step": 2646 + }, + { + "epoch": 0.19476661301448608, + "grad_norm": 0.8046875, + "learning_rate": 4.5547860680183e-05, + "loss": 0.9639, + "step": 2647 + }, + { + "epoch": 0.1948401931478501, + "grad_norm": 2.0, + "learning_rate": 4.5544561172841936e-05, + "loss": 0.7421, + "step": 2648 + }, + { + "epoch": 0.19491377328121406, + "grad_norm": 0.984375, + "learning_rate": 4.5541260562913093e-05, + "loss": 0.7926, + "step": 2649 + }, + { + "epoch": 0.19498735341457807, + "grad_norm": 0.73828125, + "learning_rate": 4.553795885057358e-05, + "loss": 0.8785, + "step": 2650 + }, + { + "epoch": 0.19506093354794204, + "grad_norm": 1.0703125, + "learning_rate": 4.553465603600062e-05, + "loss": 1.2337, + "step": 2651 + }, + { + "epoch": 0.19513451368130605, + "grad_norm": 0.87109375, + "learning_rate": 4.553135211937144e-05, + "loss": 0.7467, + "step": 2652 + }, + { + "epoch": 0.19520809381467005, + "grad_norm": 2.625, + "learning_rate": 4.552804710086338e-05, + "loss": 1.179, + "step": 2653 + }, + { + "epoch": 0.19528167394803403, + "grad_norm": 0.73046875, + "learning_rate": 4.55247409806538e-05, + "loss": 0.5965, + "step": 2654 + }, + { + "epoch": 0.19535525408139803, + "grad_norm": 0.79296875, + "learning_rate": 4.552143375892014e-05, + "loss": 1.0021, + "step": 2655 + }, + { + "epoch": 0.195428834214762, + "grad_norm": 1.0234375, + "learning_rate": 4.5518125435839884e-05, + "loss": 0.9996, + "step": 2656 + }, + { + "epoch": 0.195502414348126, + "grad_norm": 0.84765625, + "learning_rate": 4.551481601159059e-05, + "loss": 0.8108, + "step": 2657 + }, + { + "epoch": 0.19557599448149, + "grad_norm": 0.85546875, + "learning_rate": 4.551150548634987e-05, + "loss": 1.0242, + "step": 2658 + }, + { + "epoch": 0.195649574614854, + "grad_norm": 0.92578125, + "learning_rate": 4.5508193860295396e-05, + "loss": 0.9833, + "step": 2659 + }, + { + "epoch": 0.19572315474821797, + "grad_norm": 1.0625, + "learning_rate": 4.5504881133604885e-05, + "loss": 1.6644, + "step": 2660 + }, + { + "epoch": 0.19579673488158197, + "grad_norm": 1.0390625, + "learning_rate": 4.550156730645614e-05, + "loss": 1.1838, + "step": 2661 + }, + { + "epoch": 0.19587031501494598, + "grad_norm": 0.9921875, + "learning_rate": 4.549825237902699e-05, + "loss": 1.1663, + "step": 2662 + }, + { + "epoch": 0.19594389514830995, + "grad_norm": 1.0, + "learning_rate": 4.549493635149535e-05, + "loss": 1.432, + "step": 2663 + }, + { + "epoch": 0.19601747528167396, + "grad_norm": 0.73828125, + "learning_rate": 4.549161922403919e-05, + "loss": 1.0644, + "step": 2664 + }, + { + "epoch": 0.19609105541503793, + "grad_norm": 0.80078125, + "learning_rate": 4.5488300996836534e-05, + "loss": 0.8674, + "step": 2665 + }, + { + "epoch": 0.19616463554840194, + "grad_norm": 1.109375, + "learning_rate": 4.5484981670065454e-05, + "loss": 1.1175, + "step": 2666 + }, + { + "epoch": 0.19623821568176592, + "grad_norm": 1.3984375, + "learning_rate": 4.548166124390411e-05, + "loss": 0.6985, + "step": 2667 + }, + { + "epoch": 0.19631179581512992, + "grad_norm": 0.859375, + "learning_rate": 4.547833971853067e-05, + "loss": 1.034, + "step": 2668 + }, + { + "epoch": 0.1963853759484939, + "grad_norm": 0.796875, + "learning_rate": 4.547501709412343e-05, + "loss": 0.8328, + "step": 2669 + }, + { + "epoch": 0.1964589560818579, + "grad_norm": 0.9296875, + "learning_rate": 4.547169337086069e-05, + "loss": 1.0259, + "step": 2670 + }, + { + "epoch": 0.1965325362152219, + "grad_norm": 1.03125, + "learning_rate": 4.546836854892084e-05, + "loss": 0.8838, + "step": 2671 + }, + { + "epoch": 0.19660611634858588, + "grad_norm": 0.734375, + "learning_rate": 4.54650426284823e-05, + "loss": 0.8251, + "step": 2672 + }, + { + "epoch": 0.19667969648194988, + "grad_norm": 0.6796875, + "learning_rate": 4.5461715609723575e-05, + "loss": 0.7196, + "step": 2673 + }, + { + "epoch": 0.19675327661531386, + "grad_norm": 0.81640625, + "learning_rate": 4.5458387492823226e-05, + "loss": 1.1062, + "step": 2674 + }, + { + "epoch": 0.19682685674867786, + "grad_norm": 0.7578125, + "learning_rate": 4.545505827795986e-05, + "loss": 0.8082, + "step": 2675 + }, + { + "epoch": 0.19690043688204184, + "grad_norm": 0.74609375, + "learning_rate": 4.545172796531214e-05, + "loss": 0.6577, + "step": 2676 + }, + { + "epoch": 0.19697401701540584, + "grad_norm": 0.84375, + "learning_rate": 4.544839655505882e-05, + "loss": 0.8486, + "step": 2677 + }, + { + "epoch": 0.19704759714876982, + "grad_norm": 1.328125, + "learning_rate": 4.544506404737868e-05, + "loss": 1.3335, + "step": 2678 + }, + { + "epoch": 0.19712117728213382, + "grad_norm": 1.2109375, + "learning_rate": 4.544173044245056e-05, + "loss": 1.4339, + "step": 2679 + }, + { + "epoch": 0.19719475741549783, + "grad_norm": 0.9453125, + "learning_rate": 4.5438395740453375e-05, + "loss": 0.8955, + "step": 2680 + }, + { + "epoch": 0.1972683375488618, + "grad_norm": 0.6796875, + "learning_rate": 4.54350599415661e-05, + "loss": 0.7659, + "step": 2681 + }, + { + "epoch": 0.1973419176822258, + "grad_norm": 0.94921875, + "learning_rate": 4.543172304596774e-05, + "loss": 0.847, + "step": 2682 + }, + { + "epoch": 0.19741549781558979, + "grad_norm": 0.98046875, + "learning_rate": 4.542838505383741e-05, + "loss": 1.1848, + "step": 2683 + }, + { + "epoch": 0.1974890779489538, + "grad_norm": 0.83984375, + "learning_rate": 4.542504596535424e-05, + "loss": 0.8251, + "step": 2684 + }, + { + "epoch": 0.19756265808231777, + "grad_norm": 0.8515625, + "learning_rate": 4.542170578069742e-05, + "loss": 0.772, + "step": 2685 + }, + { + "epoch": 0.19763623821568177, + "grad_norm": 0.8515625, + "learning_rate": 4.5418364500046226e-05, + "loss": 0.7522, + "step": 2686 + }, + { + "epoch": 0.19770981834904575, + "grad_norm": 0.96484375, + "learning_rate": 4.541502212357998e-05, + "loss": 0.8216, + "step": 2687 + }, + { + "epoch": 0.19778339848240975, + "grad_norm": 0.90625, + "learning_rate": 4.541167865147804e-05, + "loss": 0.805, + "step": 2688 + }, + { + "epoch": 0.19785697861577375, + "grad_norm": 0.8359375, + "learning_rate": 4.540833408391987e-05, + "loss": 0.8098, + "step": 2689 + }, + { + "epoch": 0.19793055874913773, + "grad_norm": 0.84765625, + "learning_rate": 4.540498842108495e-05, + "loss": 0.7311, + "step": 2690 + }, + { + "epoch": 0.19800413888250173, + "grad_norm": 1.4453125, + "learning_rate": 4.540164166315284e-05, + "loss": 0.8703, + "step": 2691 + }, + { + "epoch": 0.1980777190158657, + "grad_norm": 0.88671875, + "learning_rate": 4.539829381030316e-05, + "loss": 1.0887, + "step": 2692 + }, + { + "epoch": 0.19815129914922971, + "grad_norm": 1.15625, + "learning_rate": 4.539494486271558e-05, + "loss": 0.9607, + "step": 2693 + }, + { + "epoch": 0.1982248792825937, + "grad_norm": 0.90234375, + "learning_rate": 4.5391594820569827e-05, + "loss": 0.8763, + "step": 2694 + }, + { + "epoch": 0.1982984594159577, + "grad_norm": 1.0546875, + "learning_rate": 4.538824368404569e-05, + "loss": 1.3082, + "step": 2695 + }, + { + "epoch": 0.19837203954932167, + "grad_norm": 0.8046875, + "learning_rate": 4.538489145332303e-05, + "loss": 0.8263, + "step": 2696 + }, + { + "epoch": 0.19844561968268568, + "grad_norm": 0.91015625, + "learning_rate": 4.5381538128581746e-05, + "loss": 0.9347, + "step": 2697 + }, + { + "epoch": 0.19851919981604968, + "grad_norm": 0.84375, + "learning_rate": 4.5378183710001806e-05, + "loss": 0.7509, + "step": 2698 + }, + { + "epoch": 0.19859277994941366, + "grad_norm": 0.9921875, + "learning_rate": 4.537482819776323e-05, + "loss": 1.0527, + "step": 2699 + }, + { + "epoch": 0.19866636008277766, + "grad_norm": 0.828125, + "learning_rate": 4.537147159204611e-05, + "loss": 0.7255, + "step": 2700 + }, + { + "epoch": 0.19873994021614164, + "grad_norm": 0.9453125, + "learning_rate": 4.536811389303058e-05, + "loss": 0.7585, + "step": 2701 + }, + { + "epoch": 0.19881352034950564, + "grad_norm": 0.96875, + "learning_rate": 4.5364755100896864e-05, + "loss": 0.7841, + "step": 2702 + }, + { + "epoch": 0.19888710048286962, + "grad_norm": 0.8671875, + "learning_rate": 4.536139521582519e-05, + "loss": 0.8066, + "step": 2703 + }, + { + "epoch": 0.19896068061623362, + "grad_norm": 1.0703125, + "learning_rate": 4.53580342379959e-05, + "loss": 1.1482, + "step": 2704 + }, + { + "epoch": 0.1990342607495976, + "grad_norm": 0.796875, + "learning_rate": 4.535467216758936e-05, + "loss": 0.6796, + "step": 2705 + }, + { + "epoch": 0.1991078408829616, + "grad_norm": 0.8046875, + "learning_rate": 4.5351309004786e-05, + "loss": 0.7754, + "step": 2706 + }, + { + "epoch": 0.1991814210163256, + "grad_norm": 0.9140625, + "learning_rate": 4.534794474976634e-05, + "loss": 1.2938, + "step": 2707 + }, + { + "epoch": 0.19925500114968958, + "grad_norm": 1.015625, + "learning_rate": 4.534457940271091e-05, + "loss": 1.0347, + "step": 2708 + }, + { + "epoch": 0.19932858128305359, + "grad_norm": 0.6640625, + "learning_rate": 4.534121296380033e-05, + "loss": 0.6704, + "step": 2709 + }, + { + "epoch": 0.19940216141641756, + "grad_norm": 0.90625, + "learning_rate": 4.5337845433215266e-05, + "loss": 0.8379, + "step": 2710 + }, + { + "epoch": 0.19947574154978157, + "grad_norm": 0.85546875, + "learning_rate": 4.5334476811136455e-05, + "loss": 0.8135, + "step": 2711 + }, + { + "epoch": 0.19954932168314554, + "grad_norm": 0.97265625, + "learning_rate": 4.533110709774468e-05, + "loss": 1.1644, + "step": 2712 + }, + { + "epoch": 0.19962290181650955, + "grad_norm": 0.8828125, + "learning_rate": 4.532773629322078e-05, + "loss": 0.8303, + "step": 2713 + }, + { + "epoch": 0.19969648194987352, + "grad_norm": 0.9453125, + "learning_rate": 4.532436439774567e-05, + "loss": 0.8286, + "step": 2714 + }, + { + "epoch": 0.19977006208323753, + "grad_norm": 0.85546875, + "learning_rate": 4.532099141150031e-05, + "loss": 0.8525, + "step": 2715 + }, + { + "epoch": 0.19984364221660153, + "grad_norm": 0.8046875, + "learning_rate": 4.531761733466573e-05, + "loss": 0.8361, + "step": 2716 + }, + { + "epoch": 0.1999172223499655, + "grad_norm": 0.82421875, + "learning_rate": 4.5314242167422996e-05, + "loss": 0.7961, + "step": 2717 + }, + { + "epoch": 0.1999908024833295, + "grad_norm": 1.0625, + "learning_rate": 4.531086590995325e-05, + "loss": 1.0901, + "step": 2718 + }, + { + "epoch": 0.2000643826166935, + "grad_norm": 0.85546875, + "learning_rate": 4.53074885624377e-05, + "loss": 0.9963, + "step": 2719 + }, + { + "epoch": 0.2001379627500575, + "grad_norm": 0.8984375, + "learning_rate": 4.5304110125057584e-05, + "loss": 1.4383, + "step": 2720 + }, + { + "epoch": 0.20021154288342147, + "grad_norm": 0.91015625, + "learning_rate": 4.5300730597994236e-05, + "loss": 0.8235, + "step": 2721 + }, + { + "epoch": 0.20028512301678547, + "grad_norm": 0.6875, + "learning_rate": 4.5297349981429014e-05, + "loss": 0.6616, + "step": 2722 + }, + { + "epoch": 0.20035870315014945, + "grad_norm": 1.03125, + "learning_rate": 4.529396827554335e-05, + "loss": 1.1743, + "step": 2723 + }, + { + "epoch": 0.20043228328351345, + "grad_norm": 0.95703125, + "learning_rate": 4.529058548051875e-05, + "loss": 1.0245, + "step": 2724 + }, + { + "epoch": 0.20050586341687746, + "grad_norm": 1.28125, + "learning_rate": 4.528720159653674e-05, + "loss": 1.3024, + "step": 2725 + }, + { + "epoch": 0.20057944355024143, + "grad_norm": 0.6953125, + "learning_rate": 4.528381662377895e-05, + "loss": 0.6621, + "step": 2726 + }, + { + "epoch": 0.20065302368360544, + "grad_norm": 0.85546875, + "learning_rate": 4.528043056242702e-05, + "loss": 1.2441, + "step": 2727 + }, + { + "epoch": 0.2007266038169694, + "grad_norm": 0.83203125, + "learning_rate": 4.527704341266269e-05, + "loss": 1.2101, + "step": 2728 + }, + { + "epoch": 0.20080018395033342, + "grad_norm": 0.90234375, + "learning_rate": 4.5273655174667745e-05, + "loss": 0.9185, + "step": 2729 + }, + { + "epoch": 0.2008737640836974, + "grad_norm": 0.7578125, + "learning_rate": 4.5270265848624004e-05, + "loss": 0.8128, + "step": 2730 + }, + { + "epoch": 0.2009473442170614, + "grad_norm": 0.91015625, + "learning_rate": 4.52668754347134e-05, + "loss": 1.1705, + "step": 2731 + }, + { + "epoch": 0.20102092435042537, + "grad_norm": 0.72265625, + "learning_rate": 4.526348393311786e-05, + "loss": 0.8057, + "step": 2732 + }, + { + "epoch": 0.20109450448378938, + "grad_norm": 0.92578125, + "learning_rate": 4.52600913440194e-05, + "loss": 1.1357, + "step": 2733 + }, + { + "epoch": 0.20116808461715338, + "grad_norm": 0.75, + "learning_rate": 4.525669766760011e-05, + "loss": 0.6603, + "step": 2734 + }, + { + "epoch": 0.20124166475051736, + "grad_norm": 0.80859375, + "learning_rate": 4.525330290404212e-05, + "loss": 0.6845, + "step": 2735 + }, + { + "epoch": 0.20131524488388136, + "grad_norm": 1.2265625, + "learning_rate": 4.524990705352761e-05, + "loss": 1.2872, + "step": 2736 + }, + { + "epoch": 0.20138882501724534, + "grad_norm": 0.94921875, + "learning_rate": 4.524651011623884e-05, + "loss": 0.9801, + "step": 2737 + }, + { + "epoch": 0.20146240515060934, + "grad_norm": 0.9296875, + "learning_rate": 4.524311209235811e-05, + "loss": 0.8503, + "step": 2738 + }, + { + "epoch": 0.20153598528397332, + "grad_norm": 0.81640625, + "learning_rate": 4.523971298206779e-05, + "loss": 0.925, + "step": 2739 + }, + { + "epoch": 0.20160956541733732, + "grad_norm": 0.99609375, + "learning_rate": 4.52363127855503e-05, + "loss": 1.5058, + "step": 2740 + }, + { + "epoch": 0.2016831455507013, + "grad_norm": 0.75390625, + "learning_rate": 4.523291150298813e-05, + "loss": 0.7355, + "step": 2741 + }, + { + "epoch": 0.2017567256840653, + "grad_norm": 0.91796875, + "learning_rate": 4.522950913456381e-05, + "loss": 0.9563, + "step": 2742 + }, + { + "epoch": 0.2018303058174293, + "grad_norm": 0.84765625, + "learning_rate": 4.522610568045994e-05, + "loss": 0.7932, + "step": 2743 + }, + { + "epoch": 0.20190388595079328, + "grad_norm": 0.8203125, + "learning_rate": 4.522270114085917e-05, + "loss": 0.8767, + "step": 2744 + }, + { + "epoch": 0.2019774660841573, + "grad_norm": 1.0859375, + "learning_rate": 4.5219295515944244e-05, + "loss": 0.9709, + "step": 2745 + }, + { + "epoch": 0.20205104621752126, + "grad_norm": 0.890625, + "learning_rate": 4.5215888805897906e-05, + "loss": 0.8323, + "step": 2746 + }, + { + "epoch": 0.20212462635088527, + "grad_norm": 0.92578125, + "learning_rate": 4.5212481010903e-05, + "loss": 0.9393, + "step": 2747 + }, + { + "epoch": 0.20219820648424924, + "grad_norm": 0.90625, + "learning_rate": 4.5209072131142415e-05, + "loss": 1.0489, + "step": 2748 + }, + { + "epoch": 0.20227178661761325, + "grad_norm": 0.84375, + "learning_rate": 4.52056621667991e-05, + "loss": 0.7525, + "step": 2749 + }, + { + "epoch": 0.20234536675097722, + "grad_norm": 0.84765625, + "learning_rate": 4.5202251118056056e-05, + "loss": 1.0862, + "step": 2750 + }, + { + "epoch": 0.20241894688434123, + "grad_norm": 1.2109375, + "learning_rate": 4.519883898509635e-05, + "loss": 1.1756, + "step": 2751 + }, + { + "epoch": 0.20249252701770523, + "grad_norm": 0.89453125, + "learning_rate": 4.519542576810311e-05, + "loss": 0.9582, + "step": 2752 + }, + { + "epoch": 0.2025661071510692, + "grad_norm": 1.0625, + "learning_rate": 4.519201146725951e-05, + "loss": 1.0155, + "step": 2753 + }, + { + "epoch": 0.2026396872844332, + "grad_norm": 1.3046875, + "learning_rate": 4.518859608274879e-05, + "loss": 1.1281, + "step": 2754 + }, + { + "epoch": 0.2027132674177972, + "grad_norm": 1.0859375, + "learning_rate": 4.518517961475426e-05, + "loss": 1.1438, + "step": 2755 + }, + { + "epoch": 0.2027868475511612, + "grad_norm": 1.171875, + "learning_rate": 4.518176206345925e-05, + "loss": 1.2908, + "step": 2756 + }, + { + "epoch": 0.20286042768452517, + "grad_norm": 1.0390625, + "learning_rate": 4.51783434290472e-05, + "loss": 0.9362, + "step": 2757 + }, + { + "epoch": 0.20293400781788917, + "grad_norm": 0.91796875, + "learning_rate": 4.517492371170156e-05, + "loss": 1.2026, + "step": 2758 + }, + { + "epoch": 0.20300758795125315, + "grad_norm": 0.71875, + "learning_rate": 4.517150291160588e-05, + "loss": 0.8406, + "step": 2759 + }, + { + "epoch": 0.20308116808461715, + "grad_norm": 0.78125, + "learning_rate": 4.5168081028943726e-05, + "loss": 0.7653, + "step": 2760 + }, + { + "epoch": 0.20315474821798116, + "grad_norm": 0.77734375, + "learning_rate": 4.516465806389876e-05, + "loss": 0.8063, + "step": 2761 + }, + { + "epoch": 0.20322832835134513, + "grad_norm": 0.90234375, + "learning_rate": 4.5161234016654684e-05, + "loss": 0.928, + "step": 2762 + }, + { + "epoch": 0.20330190848470914, + "grad_norm": 1.0, + "learning_rate": 4.515780888739525e-05, + "loss": 0.7376, + "step": 2763 + }, + { + "epoch": 0.2033754886180731, + "grad_norm": 1.3125, + "learning_rate": 4.5154382676304295e-05, + "loss": 1.3377, + "step": 2764 + }, + { + "epoch": 0.20344906875143712, + "grad_norm": 0.7890625, + "learning_rate": 4.515095538356568e-05, + "loss": 1.0057, + "step": 2765 + }, + { + "epoch": 0.2035226488848011, + "grad_norm": 0.9609375, + "learning_rate": 4.5147527009363354e-05, + "loss": 0.9021, + "step": 2766 + }, + { + "epoch": 0.2035962290181651, + "grad_norm": 0.79296875, + "learning_rate": 4.51440975538813e-05, + "loss": 0.7933, + "step": 2767 + }, + { + "epoch": 0.20366980915152907, + "grad_norm": 0.90234375, + "learning_rate": 4.51406670173036e-05, + "loss": 0.995, + "step": 2768 + }, + { + "epoch": 0.20374338928489308, + "grad_norm": 0.82421875, + "learning_rate": 4.513723539981432e-05, + "loss": 1.0368, + "step": 2769 + }, + { + "epoch": 0.20381696941825708, + "grad_norm": 0.79296875, + "learning_rate": 4.513380270159765e-05, + "loss": 0.7137, + "step": 2770 + }, + { + "epoch": 0.20389054955162106, + "grad_norm": 0.98828125, + "learning_rate": 4.513036892283782e-05, + "loss": 0.912, + "step": 2771 + }, + { + "epoch": 0.20396412968498506, + "grad_norm": 1.015625, + "learning_rate": 4.5126934063719113e-05, + "loss": 1.8423, + "step": 2772 + }, + { + "epoch": 0.20403770981834904, + "grad_norm": 0.92578125, + "learning_rate": 4.5123498124425864e-05, + "loss": 0.9329, + "step": 2773 + }, + { + "epoch": 0.20411128995171304, + "grad_norm": 0.85546875, + "learning_rate": 4.5120061105142486e-05, + "loss": 0.9621, + "step": 2774 + }, + { + "epoch": 0.20418487008507702, + "grad_norm": 0.99609375, + "learning_rate": 4.511662300605343e-05, + "loss": 1.0549, + "step": 2775 + }, + { + "epoch": 0.20425845021844102, + "grad_norm": 1.1015625, + "learning_rate": 4.5113183827343206e-05, + "loss": 1.184, + "step": 2776 + }, + { + "epoch": 0.204332030351805, + "grad_norm": 0.93359375, + "learning_rate": 4.510974356919639e-05, + "loss": 1.4337, + "step": 2777 + }, + { + "epoch": 0.204405610485169, + "grad_norm": 0.8984375, + "learning_rate": 4.5106302231797624e-05, + "loss": 1.0132, + "step": 2778 + }, + { + "epoch": 0.204479190618533, + "grad_norm": 0.765625, + "learning_rate": 4.51028598153316e-05, + "loss": 0.9231, + "step": 2779 + }, + { + "epoch": 0.20455277075189698, + "grad_norm": 0.859375, + "learning_rate": 4.509941631998305e-05, + "loss": 0.8495, + "step": 2780 + }, + { + "epoch": 0.204626350885261, + "grad_norm": 0.9609375, + "learning_rate": 4.509597174593679e-05, + "loss": 0.8408, + "step": 2781 + }, + { + "epoch": 0.20469993101862496, + "grad_norm": 0.66015625, + "learning_rate": 4.5092526093377685e-05, + "loss": 0.7082, + "step": 2782 + }, + { + "epoch": 0.20477351115198897, + "grad_norm": 0.93359375, + "learning_rate": 4.508907936249065e-05, + "loss": 1.2779, + "step": 2783 + }, + { + "epoch": 0.20484709128535294, + "grad_norm": 0.76953125, + "learning_rate": 4.508563155346067e-05, + "loss": 0.6749, + "step": 2784 + }, + { + "epoch": 0.20492067141871695, + "grad_norm": 1.0078125, + "learning_rate": 4.508218266647278e-05, + "loss": 1.1346, + "step": 2785 + }, + { + "epoch": 0.20499425155208092, + "grad_norm": 1.109375, + "learning_rate": 4.507873270171208e-05, + "loss": 1.1635, + "step": 2786 + }, + { + "epoch": 0.20506783168544493, + "grad_norm": 1.1171875, + "learning_rate": 4.507528165936372e-05, + "loss": 1.3338, + "step": 2787 + }, + { + "epoch": 0.20514141181880893, + "grad_norm": 0.8515625, + "learning_rate": 4.50718295396129e-05, + "loss": 1.0394, + "step": 2788 + }, + { + "epoch": 0.2052149919521729, + "grad_norm": 0.83984375, + "learning_rate": 4.506837634264492e-05, + "loss": 1.0295, + "step": 2789 + }, + { + "epoch": 0.2052885720855369, + "grad_norm": 1.1015625, + "learning_rate": 4.5064922068645064e-05, + "loss": 1.2822, + "step": 2790 + }, + { + "epoch": 0.2053621522189009, + "grad_norm": 0.765625, + "learning_rate": 4.506146671779874e-05, + "loss": 0.6555, + "step": 2791 + }, + { + "epoch": 0.2054357323522649, + "grad_norm": 0.6953125, + "learning_rate": 4.50580102902914e-05, + "loss": 0.721, + "step": 2792 + }, + { + "epoch": 0.20550931248562887, + "grad_norm": 0.89453125, + "learning_rate": 4.505455278630852e-05, + "loss": 0.8942, + "step": 2793 + }, + { + "epoch": 0.20558289261899287, + "grad_norm": 1.03125, + "learning_rate": 4.505109420603568e-05, + "loss": 0.8244, + "step": 2794 + }, + { + "epoch": 0.20565647275235685, + "grad_norm": 0.8984375, + "learning_rate": 4.5047634549658475e-05, + "loss": 0.8666, + "step": 2795 + }, + { + "epoch": 0.20573005288572085, + "grad_norm": 2.109375, + "learning_rate": 4.5044173817362594e-05, + "loss": 1.1106, + "step": 2796 + }, + { + "epoch": 0.20580363301908486, + "grad_norm": 0.7421875, + "learning_rate": 4.5040712009333764e-05, + "loss": 0.9823, + "step": 2797 + }, + { + "epoch": 0.20587721315244883, + "grad_norm": 0.8828125, + "learning_rate": 4.5037249125757766e-05, + "loss": 1.3864, + "step": 2798 + }, + { + "epoch": 0.20595079328581284, + "grad_norm": 0.75390625, + "learning_rate": 4.503378516682046e-05, + "loss": 1.0485, + "step": 2799 + }, + { + "epoch": 0.20602437341917682, + "grad_norm": 0.8671875, + "learning_rate": 4.503032013270774e-05, + "loss": 0.8076, + "step": 2800 + }, + { + "epoch": 0.20609795355254082, + "grad_norm": 0.8203125, + "learning_rate": 4.502685402360556e-05, + "loss": 0.8173, + "step": 2801 + }, + { + "epoch": 0.2061715336859048, + "grad_norm": 0.71875, + "learning_rate": 4.502338683969997e-05, + "loss": 1.0205, + "step": 2802 + }, + { + "epoch": 0.2062451138192688, + "grad_norm": 1.0078125, + "learning_rate": 4.5019918581177015e-05, + "loss": 0.8801, + "step": 2803 + }, + { + "epoch": 0.20631869395263278, + "grad_norm": 0.921875, + "learning_rate": 4.5016449248222835e-05, + "loss": 0.8997, + "step": 2804 + }, + { + "epoch": 0.20639227408599678, + "grad_norm": 1.0859375, + "learning_rate": 4.501297884102363e-05, + "loss": 0.9744, + "step": 2805 + }, + { + "epoch": 0.20646585421936078, + "grad_norm": 0.98828125, + "learning_rate": 4.5009507359765666e-05, + "loss": 0.967, + "step": 2806 + }, + { + "epoch": 0.20653943435272476, + "grad_norm": 0.796875, + "learning_rate": 4.500603480463523e-05, + "loss": 0.8408, + "step": 2807 + }, + { + "epoch": 0.20661301448608876, + "grad_norm": 0.98828125, + "learning_rate": 4.500256117581868e-05, + "loss": 1.3726, + "step": 2808 + }, + { + "epoch": 0.20668659461945274, + "grad_norm": 0.97265625, + "learning_rate": 4.499908647350246e-05, + "loss": 1.1918, + "step": 2809 + }, + { + "epoch": 0.20676017475281674, + "grad_norm": 1.0703125, + "learning_rate": 4.499561069787305e-05, + "loss": 1.2052, + "step": 2810 + }, + { + "epoch": 0.20683375488618072, + "grad_norm": 0.7734375, + "learning_rate": 4.499213384911696e-05, + "loss": 0.7237, + "step": 2811 + }, + { + "epoch": 0.20690733501954472, + "grad_norm": 0.8984375, + "learning_rate": 4.498865592742082e-05, + "loss": 0.76, + "step": 2812 + }, + { + "epoch": 0.2069809151529087, + "grad_norm": 0.89453125, + "learning_rate": 4.498517693297127e-05, + "loss": 0.9004, + "step": 2813 + }, + { + "epoch": 0.2070544952862727, + "grad_norm": 1.015625, + "learning_rate": 4.498169686595501e-05, + "loss": 0.9867, + "step": 2814 + }, + { + "epoch": 0.2071280754196367, + "grad_norm": 0.73828125, + "learning_rate": 4.497821572655883e-05, + "loss": 0.9329, + "step": 2815 + }, + { + "epoch": 0.20720165555300069, + "grad_norm": 0.8046875, + "learning_rate": 4.497473351496955e-05, + "loss": 0.8902, + "step": 2816 + }, + { + "epoch": 0.2072752356863647, + "grad_norm": 0.71484375, + "learning_rate": 4.497125023137403e-05, + "loss": 0.6786, + "step": 2817 + }, + { + "epoch": 0.20734881581972867, + "grad_norm": 0.7421875, + "learning_rate": 4.496776587595924e-05, + "loss": 0.7063, + "step": 2818 + }, + { + "epoch": 0.20742239595309267, + "grad_norm": 0.828125, + "learning_rate": 4.496428044891218e-05, + "loss": 1.05, + "step": 2819 + }, + { + "epoch": 0.20749597608645665, + "grad_norm": 0.96875, + "learning_rate": 4.4960793950419884e-05, + "loss": 1.2781, + "step": 2820 + }, + { + "epoch": 0.20756955621982065, + "grad_norm": 0.7265625, + "learning_rate": 4.4957306380669475e-05, + "loss": 0.8069, + "step": 2821 + }, + { + "epoch": 0.20764313635318463, + "grad_norm": 0.78515625, + "learning_rate": 4.4953817739848134e-05, + "loss": 0.6579, + "step": 2822 + }, + { + "epoch": 0.20771671648654863, + "grad_norm": 0.71484375, + "learning_rate": 4.495032802814308e-05, + "loss": 1.2859, + "step": 2823 + }, + { + "epoch": 0.20779029661991263, + "grad_norm": 1.0703125, + "learning_rate": 4.49468372457416e-05, + "loss": 1.3279, + "step": 2824 + }, + { + "epoch": 0.2078638767532766, + "grad_norm": 0.72265625, + "learning_rate": 4.494334539283104e-05, + "loss": 0.6983, + "step": 2825 + }, + { + "epoch": 0.20793745688664061, + "grad_norm": 1.21875, + "learning_rate": 4.49398524695988e-05, + "loss": 0.9231, + "step": 2826 + }, + { + "epoch": 0.2080110370200046, + "grad_norm": 0.93359375, + "learning_rate": 4.4936358476232346e-05, + "loss": 0.8578, + "step": 2827 + }, + { + "epoch": 0.2080846171533686, + "grad_norm": 0.9921875, + "learning_rate": 4.493286341291918e-05, + "loss": 1.246, + "step": 2828 + }, + { + "epoch": 0.20815819728673257, + "grad_norm": 0.93359375, + "learning_rate": 4.492936727984688e-05, + "loss": 0.9413, + "step": 2829 + }, + { + "epoch": 0.20823177742009658, + "grad_norm": 0.90234375, + "learning_rate": 4.492587007720308e-05, + "loss": 0.9834, + "step": 2830 + }, + { + "epoch": 0.20830535755346058, + "grad_norm": 1.0546875, + "learning_rate": 4.4922371805175475e-05, + "loss": 1.1971, + "step": 2831 + }, + { + "epoch": 0.20837893768682456, + "grad_norm": 1.015625, + "learning_rate": 4.491887246395179e-05, + "loss": 1.4261, + "step": 2832 + }, + { + "epoch": 0.20845251782018856, + "grad_norm": 0.90625, + "learning_rate": 4.4915372053719856e-05, + "loss": 1.1543, + "step": 2833 + }, + { + "epoch": 0.20852609795355254, + "grad_norm": 0.921875, + "learning_rate": 4.4911870574667515e-05, + "loss": 0.7451, + "step": 2834 + }, + { + "epoch": 0.20859967808691654, + "grad_norm": 0.93359375, + "learning_rate": 4.4908368026982686e-05, + "loss": 0.7919, + "step": 2835 + }, + { + "epoch": 0.20867325822028052, + "grad_norm": 1.28125, + "learning_rate": 4.4904864410853344e-05, + "loss": 1.1645, + "step": 2836 + }, + { + "epoch": 0.20874683835364452, + "grad_norm": 1.109375, + "learning_rate": 4.490135972646752e-05, + "loss": 1.2171, + "step": 2837 + }, + { + "epoch": 0.2088204184870085, + "grad_norm": 0.86328125, + "learning_rate": 4.489785397401332e-05, + "loss": 0.8273, + "step": 2838 + }, + { + "epoch": 0.2088939986203725, + "grad_norm": 0.98828125, + "learning_rate": 4.489434715367887e-05, + "loss": 1.4749, + "step": 2839 + }, + { + "epoch": 0.2089675787537365, + "grad_norm": 0.90625, + "learning_rate": 4.489083926565238e-05, + "loss": 1.0795, + "step": 2840 + }, + { + "epoch": 0.20904115888710048, + "grad_norm": 0.859375, + "learning_rate": 4.488733031012213e-05, + "loss": 0.9256, + "step": 2841 + }, + { + "epoch": 0.20911473902046449, + "grad_norm": 0.87890625, + "learning_rate": 4.4883820287276415e-05, + "loss": 0.688, + "step": 2842 + }, + { + "epoch": 0.20918831915382846, + "grad_norm": 0.99609375, + "learning_rate": 4.4880309197303615e-05, + "loss": 0.8873, + "step": 2843 + }, + { + "epoch": 0.20926189928719247, + "grad_norm": 0.8515625, + "learning_rate": 4.4876797040392185e-05, + "loss": 0.98, + "step": 2844 + }, + { + "epoch": 0.20933547942055644, + "grad_norm": 1.0078125, + "learning_rate": 4.4873283816730584e-05, + "loss": 0.9738, + "step": 2845 + }, + { + "epoch": 0.20940905955392045, + "grad_norm": 1.1171875, + "learning_rate": 4.4869769526507376e-05, + "loss": 1.3381, + "step": 2846 + }, + { + "epoch": 0.20948263968728442, + "grad_norm": 0.98828125, + "learning_rate": 4.486625416991118e-05, + "loss": 1.1353, + "step": 2847 + }, + { + "epoch": 0.20955621982064843, + "grad_norm": 0.89453125, + "learning_rate": 4.486273774713064e-05, + "loss": 1.043, + "step": 2848 + }, + { + "epoch": 0.20962979995401243, + "grad_norm": 0.74609375, + "learning_rate": 4.4859220258354475e-05, + "loss": 0.7113, + "step": 2849 + }, + { + "epoch": 0.2097033800873764, + "grad_norm": 0.83203125, + "learning_rate": 4.485570170377146e-05, + "loss": 0.8909, + "step": 2850 + }, + { + "epoch": 0.2097769602207404, + "grad_norm": 0.953125, + "learning_rate": 4.485218208357045e-05, + "loss": 0.8449, + "step": 2851 + }, + { + "epoch": 0.2098505403541044, + "grad_norm": 0.8046875, + "learning_rate": 4.484866139794032e-05, + "loss": 1.045, + "step": 2852 + }, + { + "epoch": 0.2099241204874684, + "grad_norm": 0.8984375, + "learning_rate": 4.484513964707002e-05, + "loss": 0.9923, + "step": 2853 + }, + { + "epoch": 0.20999770062083237, + "grad_norm": 0.88671875, + "learning_rate": 4.484161683114856e-05, + "loss": 1.2173, + "step": 2854 + }, + { + "epoch": 0.21007128075419637, + "grad_norm": 0.83984375, + "learning_rate": 4.4838092950364995e-05, + "loss": 0.6753, + "step": 2855 + }, + { + "epoch": 0.21014486088756035, + "grad_norm": 1.0390625, + "learning_rate": 4.483456800490845e-05, + "loss": 0.8739, + "step": 2856 + }, + { + "epoch": 0.21021844102092435, + "grad_norm": 0.80859375, + "learning_rate": 4.483104199496811e-05, + "loss": 0.8887, + "step": 2857 + }, + { + "epoch": 0.21029202115428836, + "grad_norm": 0.8671875, + "learning_rate": 4.482751492073319e-05, + "loss": 0.9067, + "step": 2858 + }, + { + "epoch": 0.21036560128765233, + "grad_norm": 1.046875, + "learning_rate": 4.4823986782393e-05, + "loss": 1.1571, + "step": 2859 + }, + { + "epoch": 0.21043918142101634, + "grad_norm": 1.1015625, + "learning_rate": 4.482045758013689e-05, + "loss": 1.4446, + "step": 2860 + }, + { + "epoch": 0.2105127615543803, + "grad_norm": 0.8359375, + "learning_rate": 4.481692731415424e-05, + "loss": 1.0334, + "step": 2861 + }, + { + "epoch": 0.21058634168774432, + "grad_norm": 0.74609375, + "learning_rate": 4.481339598463454e-05, + "loss": 0.8166, + "step": 2862 + }, + { + "epoch": 0.2106599218211083, + "grad_norm": 0.90625, + "learning_rate": 4.48098635917673e-05, + "loss": 0.9122, + "step": 2863 + }, + { + "epoch": 0.2107335019544723, + "grad_norm": 0.96875, + "learning_rate": 4.4806330135742106e-05, + "loss": 1.5158, + "step": 2864 + }, + { + "epoch": 0.21080708208783627, + "grad_norm": 0.7109375, + "learning_rate": 4.480279561674856e-05, + "loss": 0.8485, + "step": 2865 + }, + { + "epoch": 0.21088066222120028, + "grad_norm": 0.875, + "learning_rate": 4.479926003497639e-05, + "loss": 0.7725, + "step": 2866 + }, + { + "epoch": 0.21095424235456428, + "grad_norm": 0.86328125, + "learning_rate": 4.479572339061533e-05, + "loss": 0.9324, + "step": 2867 + }, + { + "epoch": 0.21102782248792826, + "grad_norm": 0.875, + "learning_rate": 4.479218568385518e-05, + "loss": 1.1622, + "step": 2868 + }, + { + "epoch": 0.21110140262129226, + "grad_norm": 1.0390625, + "learning_rate": 4.47886469148858e-05, + "loss": 1.1396, + "step": 2869 + }, + { + "epoch": 0.21117498275465624, + "grad_norm": 0.84375, + "learning_rate": 4.478510708389713e-05, + "loss": 0.841, + "step": 2870 + }, + { + "epoch": 0.21124856288802024, + "grad_norm": 0.7734375, + "learning_rate": 4.478156619107912e-05, + "loss": 0.8657, + "step": 2871 + }, + { + "epoch": 0.21132214302138422, + "grad_norm": 1.0, + "learning_rate": 4.477802423662182e-05, + "loss": 1.0363, + "step": 2872 + }, + { + "epoch": 0.21139572315474822, + "grad_norm": 0.94921875, + "learning_rate": 4.4774481220715317e-05, + "loss": 1.0469, + "step": 2873 + }, + { + "epoch": 0.2114693032881122, + "grad_norm": 0.77734375, + "learning_rate": 4.477093714354975e-05, + "loss": 0.857, + "step": 2874 + }, + { + "epoch": 0.2115428834214762, + "grad_norm": 0.8125, + "learning_rate": 4.4767392005315334e-05, + "loss": 0.7133, + "step": 2875 + }, + { + "epoch": 0.2116164635548402, + "grad_norm": 0.78515625, + "learning_rate": 4.4763845806202323e-05, + "loss": 0.9766, + "step": 2876 + }, + { + "epoch": 0.21169004368820418, + "grad_norm": 0.87890625, + "learning_rate": 4.4760298546401026e-05, + "loss": 1.3768, + "step": 2877 + }, + { + "epoch": 0.2117636238215682, + "grad_norm": 0.66015625, + "learning_rate": 4.475675022610184e-05, + "loss": 0.6088, + "step": 2878 + }, + { + "epoch": 0.21183720395493216, + "grad_norm": 0.8046875, + "learning_rate": 4.475320084549518e-05, + "loss": 0.9634, + "step": 2879 + }, + { + "epoch": 0.21191078408829617, + "grad_norm": 0.8046875, + "learning_rate": 4.474965040477154e-05, + "loss": 0.8918, + "step": 2880 + }, + { + "epoch": 0.21198436422166014, + "grad_norm": 0.8046875, + "learning_rate": 4.4746098904121467e-05, + "loss": 1.0243, + "step": 2881 + }, + { + "epoch": 0.21205794435502415, + "grad_norm": 11.0625, + "learning_rate": 4.474254634373556e-05, + "loss": 1.5296, + "step": 2882 + }, + { + "epoch": 0.21213152448838812, + "grad_norm": 0.98828125, + "learning_rate": 4.473899272380447e-05, + "loss": 1.1332, + "step": 2883 + }, + { + "epoch": 0.21220510462175213, + "grad_norm": 1.03125, + "learning_rate": 4.473543804451893e-05, + "loss": 1.2872, + "step": 2884 + }, + { + "epoch": 0.21227868475511613, + "grad_norm": 0.96875, + "learning_rate": 4.4731882306069706e-05, + "loss": 0.9887, + "step": 2885 + }, + { + "epoch": 0.2123522648884801, + "grad_norm": 0.66796875, + "learning_rate": 4.472832550864763e-05, + "loss": 0.6461, + "step": 2886 + }, + { + "epoch": 0.2124258450218441, + "grad_norm": 0.765625, + "learning_rate": 4.472476765244358e-05, + "loss": 0.9668, + "step": 2887 + }, + { + "epoch": 0.2124994251552081, + "grad_norm": 0.953125, + "learning_rate": 4.47212087376485e-05, + "loss": 0.9317, + "step": 2888 + }, + { + "epoch": 0.2125730052885721, + "grad_norm": 0.796875, + "learning_rate": 4.47176487644534e-05, + "loss": 0.6655, + "step": 2889 + }, + { + "epoch": 0.21264658542193607, + "grad_norm": 0.70703125, + "learning_rate": 4.471408773304934e-05, + "loss": 0.7572, + "step": 2890 + }, + { + "epoch": 0.21272016555530007, + "grad_norm": 0.75, + "learning_rate": 4.471052564362742e-05, + "loss": 0.7705, + "step": 2891 + }, + { + "epoch": 0.21279374568866405, + "grad_norm": 0.8359375, + "learning_rate": 4.470696249637881e-05, + "loss": 0.7093, + "step": 2892 + }, + { + "epoch": 0.21286732582202805, + "grad_norm": 0.66796875, + "learning_rate": 4.4703398291494745e-05, + "loss": 0.6706, + "step": 2893 + }, + { + "epoch": 0.21294090595539206, + "grad_norm": 0.98046875, + "learning_rate": 4.4699833029166514e-05, + "loss": 1.0228, + "step": 2894 + }, + { + "epoch": 0.21301448608875603, + "grad_norm": 0.8984375, + "learning_rate": 4.4696266709585454e-05, + "loss": 0.9742, + "step": 2895 + }, + { + "epoch": 0.21308806622212004, + "grad_norm": 1.046875, + "learning_rate": 4.469269933294296e-05, + "loss": 0.7757, + "step": 2896 + }, + { + "epoch": 0.213161646355484, + "grad_norm": 0.9296875, + "learning_rate": 4.468913089943049e-05, + "loss": 0.9448, + "step": 2897 + }, + { + "epoch": 0.21323522648884802, + "grad_norm": 0.796875, + "learning_rate": 4.468556140923954e-05, + "loss": 0.964, + "step": 2898 + }, + { + "epoch": 0.213308806622212, + "grad_norm": 1.1171875, + "learning_rate": 4.468199086256169e-05, + "loss": 0.8315, + "step": 2899 + }, + { + "epoch": 0.213382386755576, + "grad_norm": 0.921875, + "learning_rate": 4.4678419259588576e-05, + "loss": 0.873, + "step": 2900 + }, + { + "epoch": 0.21345596688893997, + "grad_norm": 1.03125, + "learning_rate": 4.467484660051187e-05, + "loss": 0.943, + "step": 2901 + }, + { + "epoch": 0.21352954702230398, + "grad_norm": 0.8125, + "learning_rate": 4.4671272885523294e-05, + "loss": 0.8535, + "step": 2902 + }, + { + "epoch": 0.21360312715566798, + "grad_norm": 0.79296875, + "learning_rate": 4.466769811481466e-05, + "loss": 1.138, + "step": 2903 + }, + { + "epoch": 0.21367670728903196, + "grad_norm": 0.83984375, + "learning_rate": 4.466412228857782e-05, + "loss": 1.1713, + "step": 2904 + }, + { + "epoch": 0.21375028742239596, + "grad_norm": 0.9609375, + "learning_rate": 4.466054540700467e-05, + "loss": 0.9006, + "step": 2905 + }, + { + "epoch": 0.21382386755575994, + "grad_norm": 0.8671875, + "learning_rate": 4.465696747028719e-05, + "loss": 1.2365, + "step": 2906 + }, + { + "epoch": 0.21389744768912394, + "grad_norm": 0.73046875, + "learning_rate": 4.4653388478617385e-05, + "loss": 0.6855, + "step": 2907 + }, + { + "epoch": 0.21397102782248792, + "grad_norm": 0.84765625, + "learning_rate": 4.464980843218734e-05, + "loss": 0.9935, + "step": 2908 + }, + { + "epoch": 0.21404460795585192, + "grad_norm": 0.85546875, + "learning_rate": 4.46462273311892e-05, + "loss": 1.0645, + "step": 2909 + }, + { + "epoch": 0.2141181880892159, + "grad_norm": 0.75, + "learning_rate": 4.464264517581514e-05, + "loss": 0.875, + "step": 2910 + }, + { + "epoch": 0.2141917682225799, + "grad_norm": 1.0234375, + "learning_rate": 4.463906196625741e-05, + "loss": 1.2615, + "step": 2911 + }, + { + "epoch": 0.2142653483559439, + "grad_norm": 0.9453125, + "learning_rate": 4.463547770270832e-05, + "loss": 1.1156, + "step": 2912 + }, + { + "epoch": 0.21433892848930788, + "grad_norm": 0.83984375, + "learning_rate": 4.463189238536023e-05, + "loss": 0.681, + "step": 2913 + }, + { + "epoch": 0.2144125086226719, + "grad_norm": 0.7890625, + "learning_rate": 4.4628306014405554e-05, + "loss": 0.8803, + "step": 2914 + }, + { + "epoch": 0.21448608875603586, + "grad_norm": 0.87890625, + "learning_rate": 4.4624718590036763e-05, + "loss": 0.7558, + "step": 2915 + }, + { + "epoch": 0.21455966888939987, + "grad_norm": 0.8359375, + "learning_rate": 4.462113011244639e-05, + "loss": 0.7771, + "step": 2916 + }, + { + "epoch": 0.21463324902276384, + "grad_norm": 0.83203125, + "learning_rate": 4.461754058182703e-05, + "loss": 0.7916, + "step": 2917 + }, + { + "epoch": 0.21470682915612785, + "grad_norm": 0.671875, + "learning_rate": 4.461394999837131e-05, + "loss": 0.5213, + "step": 2918 + }, + { + "epoch": 0.21478040928949182, + "grad_norm": 0.87109375, + "learning_rate": 4.461035836227194e-05, + "loss": 1.3971, + "step": 2919 + }, + { + "epoch": 0.21485398942285583, + "grad_norm": 0.66015625, + "learning_rate": 4.4606765673721684e-05, + "loss": 0.6099, + "step": 2920 + }, + { + "epoch": 0.21492756955621983, + "grad_norm": 0.890625, + "learning_rate": 4.460317193291335e-05, + "loss": 0.868, + "step": 2921 + }, + { + "epoch": 0.2150011496895838, + "grad_norm": 0.8359375, + "learning_rate": 4.459957714003979e-05, + "loss": 1.1265, + "step": 2922 + }, + { + "epoch": 0.2150747298229478, + "grad_norm": 0.63671875, + "learning_rate": 4.459598129529395e-05, + "loss": 0.8591, + "step": 2923 + }, + { + "epoch": 0.2151483099563118, + "grad_norm": 0.81640625, + "learning_rate": 4.4592384398868804e-05, + "loss": 1.0894, + "step": 2924 + }, + { + "epoch": 0.2152218900896758, + "grad_norm": 0.87890625, + "learning_rate": 4.4588786450957384e-05, + "loss": 1.0027, + "step": 2925 + }, + { + "epoch": 0.21529547022303977, + "grad_norm": 0.80078125, + "learning_rate": 4.458518745175281e-05, + "loss": 0.9333, + "step": 2926 + }, + { + "epoch": 0.21536905035640377, + "grad_norm": 0.92578125, + "learning_rate": 4.458158740144821e-05, + "loss": 1.0506, + "step": 2927 + }, + { + "epoch": 0.21544263048976775, + "grad_norm": 0.77734375, + "learning_rate": 4.45779863002368e-05, + "loss": 0.8345, + "step": 2928 + }, + { + "epoch": 0.21551621062313175, + "grad_norm": 1.125, + "learning_rate": 4.457438414831183e-05, + "loss": 1.1072, + "step": 2929 + }, + { + "epoch": 0.21558979075649576, + "grad_norm": 0.96875, + "learning_rate": 4.4570780945866644e-05, + "loss": 0.861, + "step": 2930 + }, + { + "epoch": 0.21566337088985973, + "grad_norm": 1.1171875, + "learning_rate": 4.4567176693094606e-05, + "loss": 1.2429, + "step": 2931 + }, + { + "epoch": 0.21573695102322374, + "grad_norm": 0.93359375, + "learning_rate": 4.456357139018915e-05, + "loss": 0.7611, + "step": 2932 + }, + { + "epoch": 0.21581053115658771, + "grad_norm": 1.0859375, + "learning_rate": 4.4559965037343776e-05, + "loss": 0.9303, + "step": 2933 + }, + { + "epoch": 0.21588411128995172, + "grad_norm": 0.9921875, + "learning_rate": 4.455635763475202e-05, + "loss": 0.8904, + "step": 2934 + }, + { + "epoch": 0.2159576914233157, + "grad_norm": 0.890625, + "learning_rate": 4.455274918260748e-05, + "loss": 0.9982, + "step": 2935 + }, + { + "epoch": 0.2160312715566797, + "grad_norm": 1.2890625, + "learning_rate": 4.4549139681103825e-05, + "loss": 1.307, + "step": 2936 + }, + { + "epoch": 0.21610485169004368, + "grad_norm": 0.89453125, + "learning_rate": 4.454552913043477e-05, + "loss": 0.918, + "step": 2937 + }, + { + "epoch": 0.21617843182340768, + "grad_norm": 0.87109375, + "learning_rate": 4.454191753079408e-05, + "loss": 0.8802, + "step": 2938 + }, + { + "epoch": 0.21625201195677168, + "grad_norm": 0.9140625, + "learning_rate": 4.4538304882375584e-05, + "loss": 1.3401, + "step": 2939 + }, + { + "epoch": 0.21632559209013566, + "grad_norm": 0.8046875, + "learning_rate": 4.453469118537317e-05, + "loss": 0.817, + "step": 2940 + }, + { + "epoch": 0.21639917222349966, + "grad_norm": 0.99609375, + "learning_rate": 4.453107643998077e-05, + "loss": 1.4482, + "step": 2941 + }, + { + "epoch": 0.21647275235686364, + "grad_norm": 0.97265625, + "learning_rate": 4.452746064639239e-05, + "loss": 0.9268, + "step": 2942 + }, + { + "epoch": 0.21654633249022764, + "grad_norm": 0.95703125, + "learning_rate": 4.452384380480208e-05, + "loss": 1.0061, + "step": 2943 + }, + { + "epoch": 0.21661991262359162, + "grad_norm": 1.15625, + "learning_rate": 4.4520225915403945e-05, + "loss": 1.6814, + "step": 2944 + }, + { + "epoch": 0.21669349275695562, + "grad_norm": 0.734375, + "learning_rate": 4.451660697839216e-05, + "loss": 0.5645, + "step": 2945 + }, + { + "epoch": 0.2167670728903196, + "grad_norm": 1.03125, + "learning_rate": 4.4512986993960936e-05, + "loss": 1.4558, + "step": 2946 + }, + { + "epoch": 0.2168406530236836, + "grad_norm": 0.95703125, + "learning_rate": 4.450936596230456e-05, + "loss": 0.9441, + "step": 2947 + }, + { + "epoch": 0.2169142331570476, + "grad_norm": 1.0, + "learning_rate": 4.450574388361735e-05, + "loss": 1.1417, + "step": 2948 + }, + { + "epoch": 0.21698781329041159, + "grad_norm": 0.80859375, + "learning_rate": 4.450212075809371e-05, + "loss": 1.1416, + "step": 2949 + }, + { + "epoch": 0.2170613934237756, + "grad_norm": 0.92578125, + "learning_rate": 4.449849658592809e-05, + "loss": 1.0104, + "step": 2950 + }, + { + "epoch": 0.21713497355713957, + "grad_norm": 0.96484375, + "learning_rate": 4.4494871367314984e-05, + "loss": 1.3781, + "step": 2951 + }, + { + "epoch": 0.21720855369050357, + "grad_norm": 0.828125, + "learning_rate": 4.4491245102448955e-05, + "loss": 0.6403, + "step": 2952 + }, + { + "epoch": 0.21728213382386755, + "grad_norm": 0.80859375, + "learning_rate": 4.4487617791524605e-05, + "loss": 1.4903, + "step": 2953 + }, + { + "epoch": 0.21735571395723155, + "grad_norm": 0.77734375, + "learning_rate": 4.4483989434736624e-05, + "loss": 0.6208, + "step": 2954 + }, + { + "epoch": 0.21742929409059553, + "grad_norm": 0.8203125, + "learning_rate": 4.448036003227972e-05, + "loss": 0.8564, + "step": 2955 + }, + { + "epoch": 0.21750287422395953, + "grad_norm": 0.703125, + "learning_rate": 4.44767295843487e-05, + "loss": 0.7602, + "step": 2956 + }, + { + "epoch": 0.21757645435732353, + "grad_norm": 0.7890625, + "learning_rate": 4.4473098091138374e-05, + "loss": 0.9807, + "step": 2957 + }, + { + "epoch": 0.2176500344906875, + "grad_norm": 0.7421875, + "learning_rate": 4.446946555284366e-05, + "loss": 0.705, + "step": 2958 + }, + { + "epoch": 0.21772361462405151, + "grad_norm": 0.9140625, + "learning_rate": 4.446583196965951e-05, + "loss": 1.282, + "step": 2959 + }, + { + "epoch": 0.2177971947574155, + "grad_norm": 0.890625, + "learning_rate": 4.446219734178092e-05, + "loss": 0.9672, + "step": 2960 + }, + { + "epoch": 0.2178707748907795, + "grad_norm": 1.1015625, + "learning_rate": 4.445856166940296e-05, + "loss": 0.8693, + "step": 2961 + }, + { + "epoch": 0.21794435502414347, + "grad_norm": 0.9921875, + "learning_rate": 4.445492495272073e-05, + "loss": 1.3718, + "step": 2962 + }, + { + "epoch": 0.21801793515750748, + "grad_norm": 0.83984375, + "learning_rate": 4.4451287191929436e-05, + "loss": 1.0069, + "step": 2963 + }, + { + "epoch": 0.21809151529087145, + "grad_norm": 0.93359375, + "learning_rate": 4.444764838722429e-05, + "loss": 1.0451, + "step": 2964 + }, + { + "epoch": 0.21816509542423546, + "grad_norm": 0.7734375, + "learning_rate": 4.4444008538800604e-05, + "loss": 1.0163, + "step": 2965 + }, + { + "epoch": 0.21823867555759946, + "grad_norm": 0.86328125, + "learning_rate": 4.444036764685368e-05, + "loss": 1.4415, + "step": 2966 + }, + { + "epoch": 0.21831225569096344, + "grad_norm": 0.79296875, + "learning_rate": 4.4436725711578965e-05, + "loss": 1.0115, + "step": 2967 + }, + { + "epoch": 0.21838583582432744, + "grad_norm": 0.8125, + "learning_rate": 4.443308273317188e-05, + "loss": 0.8376, + "step": 2968 + }, + { + "epoch": 0.21845941595769142, + "grad_norm": 1.171875, + "learning_rate": 4.442943871182795e-05, + "loss": 1.134, + "step": 2969 + }, + { + "epoch": 0.21853299609105542, + "grad_norm": 0.90625, + "learning_rate": 4.442579364774274e-05, + "loss": 1.2769, + "step": 2970 + }, + { + "epoch": 0.2186065762244194, + "grad_norm": 0.7421875, + "learning_rate": 4.442214754111188e-05, + "loss": 1.0365, + "step": 2971 + }, + { + "epoch": 0.2186801563577834, + "grad_norm": 0.9140625, + "learning_rate": 4.441850039213104e-05, + "loss": 1.0726, + "step": 2972 + }, + { + "epoch": 0.21875373649114738, + "grad_norm": 1.046875, + "learning_rate": 4.441485220099596e-05, + "loss": 1.5218, + "step": 2973 + }, + { + "epoch": 0.21882731662451138, + "grad_norm": 0.7734375, + "learning_rate": 4.441120296790243e-05, + "loss": 0.7332, + "step": 2974 + }, + { + "epoch": 0.21890089675787539, + "grad_norm": 0.8828125, + "learning_rate": 4.440755269304631e-05, + "loss": 0.9733, + "step": 2975 + }, + { + "epoch": 0.21897447689123936, + "grad_norm": 0.984375, + "learning_rate": 4.440390137662348e-05, + "loss": 1.2675, + "step": 2976 + }, + { + "epoch": 0.21904805702460337, + "grad_norm": 1.03125, + "learning_rate": 4.440024901882992e-05, + "loss": 1.324, + "step": 2977 + }, + { + "epoch": 0.21912163715796734, + "grad_norm": 1.578125, + "learning_rate": 4.439659561986164e-05, + "loss": 0.7713, + "step": 2978 + }, + { + "epoch": 0.21919521729133135, + "grad_norm": 0.98828125, + "learning_rate": 4.4392941179914696e-05, + "loss": 1.0236, + "step": 2979 + }, + { + "epoch": 0.21926879742469532, + "grad_norm": 0.91015625, + "learning_rate": 4.4389285699185235e-05, + "loss": 1.0041, + "step": 2980 + }, + { + "epoch": 0.21934237755805933, + "grad_norm": 0.86328125, + "learning_rate": 4.438562917786943e-05, + "loss": 0.9832, + "step": 2981 + }, + { + "epoch": 0.2194159576914233, + "grad_norm": 0.97265625, + "learning_rate": 4.438197161616352e-05, + "loss": 1.1787, + "step": 2982 + }, + { + "epoch": 0.2194895378247873, + "grad_norm": 0.69921875, + "learning_rate": 4.43783130142638e-05, + "loss": 0.8349, + "step": 2983 + }, + { + "epoch": 0.2195631179581513, + "grad_norm": 0.87890625, + "learning_rate": 4.437465337236662e-05, + "loss": 0.8361, + "step": 2984 + }, + { + "epoch": 0.2196366980915153, + "grad_norm": 0.8046875, + "learning_rate": 4.437099269066839e-05, + "loss": 0.7906, + "step": 2985 + }, + { + "epoch": 0.2197102782248793, + "grad_norm": 1.015625, + "learning_rate": 4.436733096936557e-05, + "loss": 1.1253, + "step": 2986 + }, + { + "epoch": 0.21978385835824327, + "grad_norm": 0.8203125, + "learning_rate": 4.436366820865468e-05, + "loss": 0.9108, + "step": 2987 + }, + { + "epoch": 0.21985743849160727, + "grad_norm": 0.8203125, + "learning_rate": 4.436000440873228e-05, + "loss": 0.8993, + "step": 2988 + }, + { + "epoch": 0.21993101862497125, + "grad_norm": 0.8203125, + "learning_rate": 4.435633956979501e-05, + "loss": 0.9804, + "step": 2989 + }, + { + "epoch": 0.22000459875833525, + "grad_norm": 0.734375, + "learning_rate": 4.4352673692039564e-05, + "loss": 0.8479, + "step": 2990 + }, + { + "epoch": 0.22007817889169923, + "grad_norm": 1.09375, + "learning_rate": 4.4349006775662664e-05, + "loss": 1.5212, + "step": 2991 + }, + { + "epoch": 0.22015175902506323, + "grad_norm": 0.86328125, + "learning_rate": 4.434533882086112e-05, + "loss": 0.8776, + "step": 2992 + }, + { + "epoch": 0.22022533915842724, + "grad_norm": 0.9453125, + "learning_rate": 4.434166982783178e-05, + "loss": 0.7635, + "step": 2993 + }, + { + "epoch": 0.2202989192917912, + "grad_norm": 0.7890625, + "learning_rate": 4.433799979677155e-05, + "loss": 0.8248, + "step": 2994 + }, + { + "epoch": 0.22037249942515522, + "grad_norm": 0.73046875, + "learning_rate": 4.43343287278774e-05, + "loss": 0.987, + "step": 2995 + }, + { + "epoch": 0.2204460795585192, + "grad_norm": 0.859375, + "learning_rate": 4.4330656621346336e-05, + "loss": 0.9917, + "step": 2996 + }, + { + "epoch": 0.2205196596918832, + "grad_norm": 0.77734375, + "learning_rate": 4.432698347737545e-05, + "loss": 0.8746, + "step": 2997 + }, + { + "epoch": 0.22059323982524717, + "grad_norm": 0.74609375, + "learning_rate": 4.432330929616185e-05, + "loss": 0.8718, + "step": 2998 + }, + { + "epoch": 0.22066681995861118, + "grad_norm": 1.0, + "learning_rate": 4.4319634077902746e-05, + "loss": 1.2857, + "step": 2999 + }, + { + "epoch": 0.22074040009197515, + "grad_norm": 1.015625, + "learning_rate": 4.4315957822795374e-05, + "loss": 0.9954, + "step": 3000 + }, + { + "epoch": 0.22081398022533916, + "grad_norm": 0.61328125, + "learning_rate": 4.4312280531037025e-05, + "loss": 0.7495, + "step": 3001 + }, + { + "epoch": 0.22088756035870316, + "grad_norm": 0.78515625, + "learning_rate": 4.430860220282506e-05, + "loss": 0.7469, + "step": 3002 + }, + { + "epoch": 0.22096114049206714, + "grad_norm": 0.88671875, + "learning_rate": 4.430492283835688e-05, + "loss": 0.7839, + "step": 3003 + }, + { + "epoch": 0.22103472062543114, + "grad_norm": 1.1953125, + "learning_rate": 4.430124243782995e-05, + "loss": 1.5534, + "step": 3004 + }, + { + "epoch": 0.22110830075879512, + "grad_norm": 0.8359375, + "learning_rate": 4.429756100144179e-05, + "loss": 0.8677, + "step": 3005 + }, + { + "epoch": 0.22118188089215912, + "grad_norm": 0.71875, + "learning_rate": 4.429387852938999e-05, + "loss": 0.7379, + "step": 3006 + }, + { + "epoch": 0.2212554610255231, + "grad_norm": 0.80078125, + "learning_rate": 4.4290195021872164e-05, + "loss": 0.9089, + "step": 3007 + }, + { + "epoch": 0.2213290411588871, + "grad_norm": 0.92578125, + "learning_rate": 4.428651047908601e-05, + "loss": 1.0036, + "step": 3008 + }, + { + "epoch": 0.22140262129225108, + "grad_norm": 0.73828125, + "learning_rate": 4.428282490122926e-05, + "loss": 0.7464, + "step": 3009 + }, + { + "epoch": 0.22147620142561508, + "grad_norm": 0.8828125, + "learning_rate": 4.4279138288499725e-05, + "loss": 0.7255, + "step": 3010 + }, + { + "epoch": 0.2215497815589791, + "grad_norm": 0.66015625, + "learning_rate": 4.427545064109524e-05, + "loss": 0.6455, + "step": 3011 + }, + { + "epoch": 0.22162336169234306, + "grad_norm": 0.62890625, + "learning_rate": 4.427176195921373e-05, + "loss": 0.6021, + "step": 3012 + }, + { + "epoch": 0.22169694182570707, + "grad_norm": 0.8046875, + "learning_rate": 4.426807224305316e-05, + "loss": 1.094, + "step": 3013 + }, + { + "epoch": 0.22177052195907104, + "grad_norm": 0.8203125, + "learning_rate": 4.426438149281154e-05, + "loss": 1.2464, + "step": 3014 + }, + { + "epoch": 0.22184410209243505, + "grad_norm": 0.875, + "learning_rate": 4.4260689708686945e-05, + "loss": 1.2478, + "step": 3015 + }, + { + "epoch": 0.22191768222579902, + "grad_norm": 0.95703125, + "learning_rate": 4.425699689087752e-05, + "loss": 0.8136, + "step": 3016 + }, + { + "epoch": 0.22199126235916303, + "grad_norm": 0.8125, + "learning_rate": 4.4253303039581436e-05, + "loss": 1.1139, + "step": 3017 + }, + { + "epoch": 0.22206484249252703, + "grad_norm": 0.765625, + "learning_rate": 4.424960815499695e-05, + "loss": 0.7272, + "step": 3018 + }, + { + "epoch": 0.222138422625891, + "grad_norm": 0.66015625, + "learning_rate": 4.424591223732235e-05, + "loss": 0.525, + "step": 3019 + }, + { + "epoch": 0.222212002759255, + "grad_norm": 0.67578125, + "learning_rate": 4.4242215286755987e-05, + "loss": 0.6138, + "step": 3020 + }, + { + "epoch": 0.222285582892619, + "grad_norm": 0.9921875, + "learning_rate": 4.4238517303496276e-05, + "loss": 0.9417, + "step": 3021 + }, + { + "epoch": 0.222359163025983, + "grad_norm": 1.2265625, + "learning_rate": 4.423481828774168e-05, + "loss": 1.2537, + "step": 3022 + }, + { + "epoch": 0.22243274315934697, + "grad_norm": 0.80078125, + "learning_rate": 4.4231118239690714e-05, + "loss": 0.7667, + "step": 3023 + }, + { + "epoch": 0.22250632329271097, + "grad_norm": 0.76953125, + "learning_rate": 4.422741715954195e-05, + "loss": 0.8733, + "step": 3024 + }, + { + "epoch": 0.22257990342607495, + "grad_norm": 0.88671875, + "learning_rate": 4.422371504749403e-05, + "loss": 0.9773, + "step": 3025 + }, + { + "epoch": 0.22265348355943895, + "grad_norm": 1.171875, + "learning_rate": 4.422001190374563e-05, + "loss": 1.087, + "step": 3026 + }, + { + "epoch": 0.22272706369280296, + "grad_norm": 0.68359375, + "learning_rate": 4.421630772849549e-05, + "loss": 0.5611, + "step": 3027 + }, + { + "epoch": 0.22280064382616693, + "grad_norm": 0.875, + "learning_rate": 4.42126025219424e-05, + "loss": 1.0163, + "step": 3028 + }, + { + "epoch": 0.22287422395953094, + "grad_norm": 0.90234375, + "learning_rate": 4.4208896284285235e-05, + "loss": 1.0159, + "step": 3029 + }, + { + "epoch": 0.2229478040928949, + "grad_norm": 0.8671875, + "learning_rate": 4.420518901572288e-05, + "loss": 1.0498, + "step": 3030 + }, + { + "epoch": 0.22302138422625892, + "grad_norm": 1.09375, + "learning_rate": 4.42014807164543e-05, + "loss": 1.3386, + "step": 3031 + }, + { + "epoch": 0.2230949643596229, + "grad_norm": 0.59765625, + "learning_rate": 4.419777138667853e-05, + "loss": 0.7489, + "step": 3032 + }, + { + "epoch": 0.2231685444929869, + "grad_norm": 0.88671875, + "learning_rate": 4.4194061026594616e-05, + "loss": 1.1325, + "step": 3033 + }, + { + "epoch": 0.22324212462635087, + "grad_norm": 0.98828125, + "learning_rate": 4.419034963640171e-05, + "loss": 1.0251, + "step": 3034 + }, + { + "epoch": 0.22331570475971488, + "grad_norm": 1.125, + "learning_rate": 4.418663721629897e-05, + "loss": 1.1174, + "step": 3035 + }, + { + "epoch": 0.22338928489307888, + "grad_norm": 0.8359375, + "learning_rate": 4.418292376648566e-05, + "loss": 0.9524, + "step": 3036 + }, + { + "epoch": 0.22346286502644286, + "grad_norm": 0.97265625, + "learning_rate": 4.417920928716106e-05, + "loss": 1.2242, + "step": 3037 + }, + { + "epoch": 0.22353644515980686, + "grad_norm": 0.8515625, + "learning_rate": 4.417549377852452e-05, + "loss": 0.919, + "step": 3038 + }, + { + "epoch": 0.22361002529317084, + "grad_norm": 0.92578125, + "learning_rate": 4.4171777240775444e-05, + "loss": 1.1967, + "step": 3039 + }, + { + "epoch": 0.22368360542653484, + "grad_norm": 0.84765625, + "learning_rate": 4.41680596741133e-05, + "loss": 0.9151, + "step": 3040 + }, + { + "epoch": 0.22375718555989882, + "grad_norm": 0.84765625, + "learning_rate": 4.41643410787376e-05, + "loss": 0.7744, + "step": 3041 + }, + { + "epoch": 0.22383076569326282, + "grad_norm": 0.73828125, + "learning_rate": 4.416062145484791e-05, + "loss": 0.6924, + "step": 3042 + }, + { + "epoch": 0.2239043458266268, + "grad_norm": 1.1328125, + "learning_rate": 4.4156900802643844e-05, + "loss": 1.6845, + "step": 3043 + }, + { + "epoch": 0.2239779259599908, + "grad_norm": 0.76171875, + "learning_rate": 4.41531791223251e-05, + "loss": 0.8964, + "step": 3044 + }, + { + "epoch": 0.2240515060933548, + "grad_norm": 0.72265625, + "learning_rate": 4.4149456414091404e-05, + "loss": 1.0024, + "step": 3045 + }, + { + "epoch": 0.22412508622671878, + "grad_norm": 0.6875, + "learning_rate": 4.414573267814256e-05, + "loss": 0.8576, + "step": 3046 + }, + { + "epoch": 0.2241986663600828, + "grad_norm": 0.86328125, + "learning_rate": 4.41420079146784e-05, + "loss": 1.1391, + "step": 3047 + }, + { + "epoch": 0.22427224649344676, + "grad_norm": 0.71875, + "learning_rate": 4.413828212389883e-05, + "loss": 0.7299, + "step": 3048 + }, + { + "epoch": 0.22434582662681077, + "grad_norm": 0.84375, + "learning_rate": 4.4134555306003804e-05, + "loss": 0.9945, + "step": 3049 + }, + { + "epoch": 0.22441940676017474, + "grad_norm": 1.1015625, + "learning_rate": 4.4130827461193334e-05, + "loss": 1.0733, + "step": 3050 + }, + { + "epoch": 0.22449298689353875, + "grad_norm": 0.77734375, + "learning_rate": 4.412709858966749e-05, + "loss": 0.7298, + "step": 3051 + }, + { + "epoch": 0.22456656702690272, + "grad_norm": 1.0078125, + "learning_rate": 4.412336869162639e-05, + "loss": 0.8285, + "step": 3052 + }, + { + "epoch": 0.22464014716026673, + "grad_norm": 0.90234375, + "learning_rate": 4.4119637767270204e-05, + "loss": 0.9029, + "step": 3053 + }, + { + "epoch": 0.22471372729363073, + "grad_norm": 1.109375, + "learning_rate": 4.4115905816799186e-05, + "loss": 1.1086, + "step": 3054 + }, + { + "epoch": 0.2247873074269947, + "grad_norm": 0.88671875, + "learning_rate": 4.411217284041359e-05, + "loss": 0.9996, + "step": 3055 + }, + { + "epoch": 0.2248608875603587, + "grad_norm": 0.82421875, + "learning_rate": 4.410843883831379e-05, + "loss": 0.8473, + "step": 3056 + }, + { + "epoch": 0.2249344676937227, + "grad_norm": 0.76953125, + "learning_rate": 4.4104703810700167e-05, + "loss": 0.9594, + "step": 3057 + }, + { + "epoch": 0.2250080478270867, + "grad_norm": 0.84765625, + "learning_rate": 4.410096775777316e-05, + "loss": 0.7604, + "step": 3058 + }, + { + "epoch": 0.22508162796045067, + "grad_norm": 0.953125, + "learning_rate": 4.4097230679733305e-05, + "loss": 0.959, + "step": 3059 + }, + { + "epoch": 0.22515520809381467, + "grad_norm": 0.765625, + "learning_rate": 4.409349257678114e-05, + "loss": 0.6229, + "step": 3060 + }, + { + "epoch": 0.22522878822717865, + "grad_norm": 1.0, + "learning_rate": 4.4089753449117296e-05, + "loss": 0.9904, + "step": 3061 + }, + { + "epoch": 0.22530236836054265, + "grad_norm": 0.765625, + "learning_rate": 4.408601329694244e-05, + "loss": 0.8742, + "step": 3062 + }, + { + "epoch": 0.22537594849390666, + "grad_norm": 0.8359375, + "learning_rate": 4.4082272120457294e-05, + "loss": 0.9492, + "step": 3063 + }, + { + "epoch": 0.22544952862727063, + "grad_norm": 0.98046875, + "learning_rate": 4.407852991986265e-05, + "loss": 1.6065, + "step": 3064 + }, + { + "epoch": 0.22552310876063464, + "grad_norm": 0.7734375, + "learning_rate": 4.407478669535934e-05, + "loss": 0.7719, + "step": 3065 + }, + { + "epoch": 0.22559668889399861, + "grad_norm": 0.8828125, + "learning_rate": 4.4071042447148246e-05, + "loss": 1.1843, + "step": 3066 + }, + { + "epoch": 0.22567026902736262, + "grad_norm": 0.828125, + "learning_rate": 4.406729717543033e-05, + "loss": 0.9189, + "step": 3067 + }, + { + "epoch": 0.2257438491607266, + "grad_norm": 0.9140625, + "learning_rate": 4.406355088040659e-05, + "loss": 1.0585, + "step": 3068 + }, + { + "epoch": 0.2258174292940906, + "grad_norm": 0.85546875, + "learning_rate": 4.405980356227808e-05, + "loss": 1.1171, + "step": 3069 + }, + { + "epoch": 0.22589100942745458, + "grad_norm": 0.875, + "learning_rate": 4.4056055221245904e-05, + "loss": 1.3748, + "step": 3070 + }, + { + "epoch": 0.22596458956081858, + "grad_norm": 0.93359375, + "learning_rate": 4.4052305857511245e-05, + "loss": 0.9345, + "step": 3071 + }, + { + "epoch": 0.22603816969418258, + "grad_norm": 1.046875, + "learning_rate": 4.404855547127531e-05, + "loss": 1.442, + "step": 3072 + }, + { + "epoch": 0.22611174982754656, + "grad_norm": 1.09375, + "learning_rate": 4.4044804062739385e-05, + "loss": 1.0865, + "step": 3073 + }, + { + "epoch": 0.22618532996091056, + "grad_norm": 0.73046875, + "learning_rate": 4.4041051632104795e-05, + "loss": 0.6178, + "step": 3074 + }, + { + "epoch": 0.22625891009427454, + "grad_norm": 0.99609375, + "learning_rate": 4.403729817957293e-05, + "loss": 0.952, + "step": 3075 + }, + { + "epoch": 0.22633249022763854, + "grad_norm": 0.921875, + "learning_rate": 4.403354370534522e-05, + "loss": 1.2966, + "step": 3076 + }, + { + "epoch": 0.22640607036100252, + "grad_norm": 0.88671875, + "learning_rate": 4.402978820962317e-05, + "loss": 1.0168, + "step": 3077 + }, + { + "epoch": 0.22647965049436652, + "grad_norm": 0.890625, + "learning_rate": 4.402603169260834e-05, + "loss": 1.0222, + "step": 3078 + }, + { + "epoch": 0.2265532306277305, + "grad_norm": 0.86328125, + "learning_rate": 4.402227415450231e-05, + "loss": 1.119, + "step": 3079 + }, + { + "epoch": 0.2266268107610945, + "grad_norm": 0.9296875, + "learning_rate": 4.401851559550676e-05, + "loss": 0.9735, + "step": 3080 + }, + { + "epoch": 0.2267003908944585, + "grad_norm": 1.1640625, + "learning_rate": 4.40147560158234e-05, + "loss": 1.0884, + "step": 3081 + }, + { + "epoch": 0.22677397102782249, + "grad_norm": 0.921875, + "learning_rate": 4.4010995415654e-05, + "loss": 1.2867, + "step": 3082 + }, + { + "epoch": 0.2268475511611865, + "grad_norm": 1.171875, + "learning_rate": 4.400723379520038e-05, + "loss": 1.1063, + "step": 3083 + }, + { + "epoch": 0.22692113129455047, + "grad_norm": 1.1640625, + "learning_rate": 4.400347115466442e-05, + "loss": 1.4118, + "step": 3084 + }, + { + "epoch": 0.22699471142791447, + "grad_norm": 0.81640625, + "learning_rate": 4.399970749424805e-05, + "loss": 0.6913, + "step": 3085 + }, + { + "epoch": 0.22706829156127845, + "grad_norm": 0.90625, + "learning_rate": 4.399594281415328e-05, + "loss": 1.0776, + "step": 3086 + }, + { + "epoch": 0.22714187169464245, + "grad_norm": 1.0078125, + "learning_rate": 4.3992177114582124e-05, + "loss": 1.0784, + "step": 3087 + }, + { + "epoch": 0.22721545182800643, + "grad_norm": 0.859375, + "learning_rate": 4.39884103957367e-05, + "loss": 0.9732, + "step": 3088 + }, + { + "epoch": 0.22728903196137043, + "grad_norm": 0.8828125, + "learning_rate": 4.398464265781915e-05, + "loss": 0.836, + "step": 3089 + }, + { + "epoch": 0.22736261209473443, + "grad_norm": 0.828125, + "learning_rate": 4.3980873901031695e-05, + "loss": 0.8301, + "step": 3090 + }, + { + "epoch": 0.2274361922280984, + "grad_norm": 0.66796875, + "learning_rate": 4.397710412557657e-05, + "loss": 0.7434, + "step": 3091 + }, + { + "epoch": 0.22750977236146241, + "grad_norm": 0.75390625, + "learning_rate": 4.397333333165613e-05, + "loss": 1.0613, + "step": 3092 + }, + { + "epoch": 0.2275833524948264, + "grad_norm": 0.81640625, + "learning_rate": 4.3969561519472716e-05, + "loss": 0.7699, + "step": 3093 + }, + { + "epoch": 0.2276569326281904, + "grad_norm": 0.73828125, + "learning_rate": 4.3965788689228757e-05, + "loss": 0.7306, + "step": 3094 + }, + { + "epoch": 0.22773051276155437, + "grad_norm": 0.9140625, + "learning_rate": 4.3962014841126744e-05, + "loss": 1.2517, + "step": 3095 + }, + { + "epoch": 0.22780409289491838, + "grad_norm": 1.0546875, + "learning_rate": 4.3958239975369215e-05, + "loss": 1.1137, + "step": 3096 + }, + { + "epoch": 0.22787767302828235, + "grad_norm": 0.76171875, + "learning_rate": 4.3954464092158745e-05, + "loss": 0.8204, + "step": 3097 + }, + { + "epoch": 0.22795125316164636, + "grad_norm": 0.96484375, + "learning_rate": 4.395068719169799e-05, + "loss": 0.8983, + "step": 3098 + }, + { + "epoch": 0.22802483329501036, + "grad_norm": 0.8203125, + "learning_rate": 4.394690927418965e-05, + "loss": 0.8394, + "step": 3099 + }, + { + "epoch": 0.22809841342837434, + "grad_norm": 0.99609375, + "learning_rate": 4.394313033983648e-05, + "loss": 1.228, + "step": 3100 + }, + { + "epoch": 0.22817199356173834, + "grad_norm": 1.859375, + "learning_rate": 4.3939350388841274e-05, + "loss": 1.1662, + "step": 3101 + }, + { + "epoch": 0.22824557369510232, + "grad_norm": 0.8984375, + "learning_rate": 4.39355694214069e-05, + "loss": 0.8935, + "step": 3102 + }, + { + "epoch": 0.22831915382846632, + "grad_norm": 0.75390625, + "learning_rate": 4.393178743773629e-05, + "loss": 0.7445, + "step": 3103 + }, + { + "epoch": 0.2283927339618303, + "grad_norm": 0.9140625, + "learning_rate": 4.39280044380324e-05, + "loss": 1.2452, + "step": 3104 + }, + { + "epoch": 0.2284663140951943, + "grad_norm": 0.7578125, + "learning_rate": 4.392422042249826e-05, + "loss": 0.8325, + "step": 3105 + }, + { + "epoch": 0.22853989422855828, + "grad_norm": 0.953125, + "learning_rate": 4.3920435391336956e-05, + "loss": 1.4204, + "step": 3106 + }, + { + "epoch": 0.22861347436192228, + "grad_norm": 0.80859375, + "learning_rate": 4.391664934475162e-05, + "loss": 0.7727, + "step": 3107 + }, + { + "epoch": 0.22868705449528628, + "grad_norm": 0.91796875, + "learning_rate": 4.391286228294544e-05, + "loss": 1.6442, + "step": 3108 + }, + { + "epoch": 0.22876063462865026, + "grad_norm": 0.75, + "learning_rate": 4.390907420612166e-05, + "loss": 0.7756, + "step": 3109 + }, + { + "epoch": 0.22883421476201427, + "grad_norm": 1.03125, + "learning_rate": 4.3905285114483584e-05, + "loss": 1.0006, + "step": 3110 + }, + { + "epoch": 0.22890779489537824, + "grad_norm": 0.84765625, + "learning_rate": 4.390149500823457e-05, + "loss": 0.6596, + "step": 3111 + }, + { + "epoch": 0.22898137502874225, + "grad_norm": 0.9375, + "learning_rate": 4.3897703887578015e-05, + "loss": 0.9563, + "step": 3112 + }, + { + "epoch": 0.22905495516210622, + "grad_norm": 0.8046875, + "learning_rate": 4.3893911752717374e-05, + "loss": 0.9856, + "step": 3113 + }, + { + "epoch": 0.22912853529547023, + "grad_norm": 0.93359375, + "learning_rate": 4.389011860385619e-05, + "loss": 1.0432, + "step": 3114 + }, + { + "epoch": 0.2292021154288342, + "grad_norm": 1.078125, + "learning_rate": 4.3886324441198015e-05, + "loss": 1.5668, + "step": 3115 + }, + { + "epoch": 0.2292756955621982, + "grad_norm": 0.86328125, + "learning_rate": 4.388252926494647e-05, + "loss": 0.9112, + "step": 3116 + }, + { + "epoch": 0.2293492756955622, + "grad_norm": 0.88671875, + "learning_rate": 4.387873307530525e-05, + "loss": 0.948, + "step": 3117 + }, + { + "epoch": 0.2294228558289262, + "grad_norm": 0.89453125, + "learning_rate": 4.3874935872478086e-05, + "loss": 1.1635, + "step": 3118 + }, + { + "epoch": 0.2294964359622902, + "grad_norm": 0.85546875, + "learning_rate": 4.387113765666876e-05, + "loss": 0.953, + "step": 3119 + }, + { + "epoch": 0.22957001609565417, + "grad_norm": 1.1796875, + "learning_rate": 4.386733842808112e-05, + "loss": 1.118, + "step": 3120 + }, + { + "epoch": 0.22964359622901817, + "grad_norm": 1.2109375, + "learning_rate": 4.386353818691906e-05, + "loss": 0.6992, + "step": 3121 + }, + { + "epoch": 0.22971717636238215, + "grad_norm": 0.85546875, + "learning_rate": 4.385973693338653e-05, + "loss": 1.0678, + "step": 3122 + }, + { + "epoch": 0.22979075649574615, + "grad_norm": 1.046875, + "learning_rate": 4.3855934667687545e-05, + "loss": 1.6797, + "step": 3123 + }, + { + "epoch": 0.22986433662911013, + "grad_norm": 0.75, + "learning_rate": 4.385213139002616e-05, + "loss": 0.8481, + "step": 3124 + }, + { + "epoch": 0.22993791676247413, + "grad_norm": 0.80078125, + "learning_rate": 4.3848327100606486e-05, + "loss": 0.7289, + "step": 3125 + }, + { + "epoch": 0.23001149689583814, + "grad_norm": 1.0, + "learning_rate": 4.38445217996327e-05, + "loss": 1.4812, + "step": 3126 + }, + { + "epoch": 0.2300850770292021, + "grad_norm": 0.8671875, + "learning_rate": 4.3840715487309015e-05, + "loss": 1.2339, + "step": 3127 + }, + { + "epoch": 0.23015865716256612, + "grad_norm": 0.7109375, + "learning_rate": 4.3836908163839715e-05, + "loss": 0.7561, + "step": 3128 + }, + { + "epoch": 0.2302322372959301, + "grad_norm": 0.7109375, + "learning_rate": 4.383309982942914e-05, + "loss": 0.6729, + "step": 3129 + }, + { + "epoch": 0.2303058174292941, + "grad_norm": 0.9921875, + "learning_rate": 4.382929048428167e-05, + "loss": 0.9063, + "step": 3130 + }, + { + "epoch": 0.23037939756265807, + "grad_norm": 1.09375, + "learning_rate": 4.3825480128601734e-05, + "loss": 1.3051, + "step": 3131 + }, + { + "epoch": 0.23045297769602208, + "grad_norm": 0.79296875, + "learning_rate": 4.382166876259384e-05, + "loss": 1.0217, + "step": 3132 + }, + { + "epoch": 0.23052655782938605, + "grad_norm": 1.0, + "learning_rate": 4.381785638646253e-05, + "loss": 1.2486, + "step": 3133 + }, + { + "epoch": 0.23060013796275006, + "grad_norm": 0.703125, + "learning_rate": 4.3814043000412405e-05, + "loss": 0.6965, + "step": 3134 + }, + { + "epoch": 0.23067371809611406, + "grad_norm": 0.73046875, + "learning_rate": 4.381022860464814e-05, + "loss": 0.8494, + "step": 3135 + }, + { + "epoch": 0.23074729822947804, + "grad_norm": 0.75390625, + "learning_rate": 4.380641319937442e-05, + "loss": 0.9846, + "step": 3136 + }, + { + "epoch": 0.23082087836284204, + "grad_norm": 0.72265625, + "learning_rate": 4.380259678479604e-05, + "loss": 0.9081, + "step": 3137 + }, + { + "epoch": 0.23089445849620602, + "grad_norm": 0.88671875, + "learning_rate": 4.379877936111779e-05, + "loss": 0.839, + "step": 3138 + }, + { + "epoch": 0.23096803862957002, + "grad_norm": 0.87109375, + "learning_rate": 4.379496092854456e-05, + "loss": 1.0511, + "step": 3139 + }, + { + "epoch": 0.231041618762934, + "grad_norm": 0.8515625, + "learning_rate": 4.379114148728128e-05, + "loss": 0.9411, + "step": 3140 + }, + { + "epoch": 0.231115198896298, + "grad_norm": 1.125, + "learning_rate": 4.378732103753292e-05, + "loss": 1.3649, + "step": 3141 + }, + { + "epoch": 0.23118877902966198, + "grad_norm": 0.9140625, + "learning_rate": 4.378349957950453e-05, + "loss": 0.7724, + "step": 3142 + }, + { + "epoch": 0.23126235916302598, + "grad_norm": 0.9375, + "learning_rate": 4.37796771134012e-05, + "loss": 1.2308, + "step": 3143 + }, + { + "epoch": 0.23133593929639, + "grad_norm": 2.546875, + "learning_rate": 4.377585363942805e-05, + "loss": 1.2, + "step": 3144 + }, + { + "epoch": 0.23140951942975396, + "grad_norm": 0.83984375, + "learning_rate": 4.377202915779032e-05, + "loss": 0.7315, + "step": 3145 + }, + { + "epoch": 0.23148309956311797, + "grad_norm": 0.83984375, + "learning_rate": 4.376820366869323e-05, + "loss": 0.65, + "step": 3146 + }, + { + "epoch": 0.23155667969648194, + "grad_norm": 0.76953125, + "learning_rate": 4.3764377172342095e-05, + "loss": 0.7376, + "step": 3147 + }, + { + "epoch": 0.23163025982984595, + "grad_norm": 1.0078125, + "learning_rate": 4.376054966894229e-05, + "loss": 1.3158, + "step": 3148 + }, + { + "epoch": 0.23170383996320992, + "grad_norm": 0.953125, + "learning_rate": 4.37567211586992e-05, + "loss": 0.972, + "step": 3149 + }, + { + "epoch": 0.23177742009657393, + "grad_norm": 0.9375, + "learning_rate": 4.375289164181832e-05, + "loss": 0.8627, + "step": 3150 + }, + { + "epoch": 0.2318510002299379, + "grad_norm": 0.90625, + "learning_rate": 4.374906111850517e-05, + "loss": 0.9473, + "step": 3151 + }, + { + "epoch": 0.2319245803633019, + "grad_norm": 0.703125, + "learning_rate": 4.374522958896532e-05, + "loss": 0.7579, + "step": 3152 + }, + { + "epoch": 0.2319981604966659, + "grad_norm": 0.87109375, + "learning_rate": 4.37413970534044e-05, + "loss": 0.9142, + "step": 3153 + }, + { + "epoch": 0.2320717406300299, + "grad_norm": 0.77734375, + "learning_rate": 4.373756351202809e-05, + "loss": 0.8533, + "step": 3154 + }, + { + "epoch": 0.2321453207633939, + "grad_norm": 1.0, + "learning_rate": 4.373372896504215e-05, + "loss": 1.1974, + "step": 3155 + }, + { + "epoch": 0.23221890089675787, + "grad_norm": 0.828125, + "learning_rate": 4.372989341265235e-05, + "loss": 0.7317, + "step": 3156 + }, + { + "epoch": 0.23229248103012187, + "grad_norm": 0.75, + "learning_rate": 4.372605685506455e-05, + "loss": 0.8871, + "step": 3157 + }, + { + "epoch": 0.23236606116348585, + "grad_norm": 5.6875, + "learning_rate": 4.372221929248464e-05, + "loss": 1.2479, + "step": 3158 + }, + { + "epoch": 0.23243964129684985, + "grad_norm": 0.7109375, + "learning_rate": 4.37183807251186e-05, + "loss": 0.7314, + "step": 3159 + }, + { + "epoch": 0.23251322143021383, + "grad_norm": 0.8515625, + "learning_rate": 4.3714541153172405e-05, + "loss": 0.7643, + "step": 3160 + }, + { + "epoch": 0.23258680156357783, + "grad_norm": 0.9296875, + "learning_rate": 4.371070057685214e-05, + "loss": 0.7784, + "step": 3161 + }, + { + "epoch": 0.23266038169694184, + "grad_norm": 0.81640625, + "learning_rate": 4.3706858996363906e-05, + "loss": 0.7401, + "step": 3162 + }, + { + "epoch": 0.2327339618303058, + "grad_norm": 0.9375, + "learning_rate": 4.3703016411913886e-05, + "loss": 0.9218, + "step": 3163 + }, + { + "epoch": 0.23280754196366982, + "grad_norm": 1.046875, + "learning_rate": 4.369917282370831e-05, + "loss": 0.7555, + "step": 3164 + }, + { + "epoch": 0.2328811220970338, + "grad_norm": 0.859375, + "learning_rate": 4.369532823195344e-05, + "loss": 1.0403, + "step": 3165 + }, + { + "epoch": 0.2329547022303978, + "grad_norm": 0.6484375, + "learning_rate": 4.369148263685561e-05, + "loss": 0.6592, + "step": 3166 + }, + { + "epoch": 0.23302828236376177, + "grad_norm": 0.8125, + "learning_rate": 4.36876360386212e-05, + "loss": 1.0878, + "step": 3167 + }, + { + "epoch": 0.23310186249712578, + "grad_norm": 0.60546875, + "learning_rate": 4.368378843745668e-05, + "loss": 0.5583, + "step": 3168 + }, + { + "epoch": 0.23317544263048975, + "grad_norm": 0.8515625, + "learning_rate": 4.367993983356852e-05, + "loss": 0.8004, + "step": 3169 + }, + { + "epoch": 0.23324902276385376, + "grad_norm": 1.0078125, + "learning_rate": 4.367609022716326e-05, + "loss": 1.3917, + "step": 3170 + }, + { + "epoch": 0.23332260289721776, + "grad_norm": 1.125, + "learning_rate": 4.367223961844752e-05, + "loss": 1.025, + "step": 3171 + }, + { + "epoch": 0.23339618303058174, + "grad_norm": 0.75, + "learning_rate": 4.366838800762795e-05, + "loss": 0.8707, + "step": 3172 + }, + { + "epoch": 0.23346976316394574, + "grad_norm": 1.2421875, + "learning_rate": 4.366453539491124e-05, + "loss": 1.419, + "step": 3173 + }, + { + "epoch": 0.23354334329730972, + "grad_norm": 1.15625, + "learning_rate": 4.3660681780504176e-05, + "loss": 1.1744, + "step": 3174 + }, + { + "epoch": 0.23361692343067372, + "grad_norm": 0.86328125, + "learning_rate": 4.365682716461357e-05, + "loss": 0.6458, + "step": 3175 + }, + { + "epoch": 0.2336905035640377, + "grad_norm": 0.85546875, + "learning_rate": 4.365297154744629e-05, + "loss": 0.9822, + "step": 3176 + }, + { + "epoch": 0.2337640836974017, + "grad_norm": 0.85546875, + "learning_rate": 4.3649114929209245e-05, + "loss": 0.9373, + "step": 3177 + }, + { + "epoch": 0.23383766383076568, + "grad_norm": 0.92578125, + "learning_rate": 4.364525731010943e-05, + "loss": 1.223, + "step": 3178 + }, + { + "epoch": 0.23391124396412968, + "grad_norm": 0.80078125, + "learning_rate": 4.3641398690353875e-05, + "loss": 0.9826, + "step": 3179 + }, + { + "epoch": 0.2339848240974937, + "grad_norm": 0.7578125, + "learning_rate": 4.363753907014966e-05, + "loss": 0.9722, + "step": 3180 + }, + { + "epoch": 0.23405840423085766, + "grad_norm": 0.796875, + "learning_rate": 4.363367844970392e-05, + "loss": 0.8164, + "step": 3181 + }, + { + "epoch": 0.23413198436422167, + "grad_norm": 0.81640625, + "learning_rate": 4.362981682922386e-05, + "loss": 1.3447, + "step": 3182 + }, + { + "epoch": 0.23420556449758564, + "grad_norm": 0.82421875, + "learning_rate": 4.3625954208916705e-05, + "loss": 0.7872, + "step": 3183 + }, + { + "epoch": 0.23427914463094965, + "grad_norm": 0.8671875, + "learning_rate": 4.3622090588989784e-05, + "loss": 1.0747, + "step": 3184 + }, + { + "epoch": 0.23435272476431362, + "grad_norm": 0.953125, + "learning_rate": 4.3618225969650416e-05, + "loss": 1.0023, + "step": 3185 + }, + { + "epoch": 0.23442630489767763, + "grad_norm": 1.140625, + "learning_rate": 4.361436035110604e-05, + "loss": 1.1984, + "step": 3186 + }, + { + "epoch": 0.2344998850310416, + "grad_norm": 1.0703125, + "learning_rate": 4.3610493733564095e-05, + "loss": 1.2764, + "step": 3187 + }, + { + "epoch": 0.2345734651644056, + "grad_norm": 0.90234375, + "learning_rate": 4.36066261172321e-05, + "loss": 1.021, + "step": 3188 + }, + { + "epoch": 0.2346470452977696, + "grad_norm": 0.9296875, + "learning_rate": 4.360275750231763e-05, + "loss": 0.7916, + "step": 3189 + }, + { + "epoch": 0.2347206254311336, + "grad_norm": 1.0546875, + "learning_rate": 4.359888788902828e-05, + "loss": 1.236, + "step": 3190 + }, + { + "epoch": 0.2347942055644976, + "grad_norm": 0.86328125, + "learning_rate": 4.3595017277571774e-05, + "loss": 0.7047, + "step": 3191 + }, + { + "epoch": 0.23486778569786157, + "grad_norm": 0.7109375, + "learning_rate": 4.359114566815579e-05, + "loss": 0.4961, + "step": 3192 + }, + { + "epoch": 0.23494136583122557, + "grad_norm": 0.78515625, + "learning_rate": 4.358727306098814e-05, + "loss": 0.7679, + "step": 3193 + }, + { + "epoch": 0.23501494596458955, + "grad_norm": 1.0390625, + "learning_rate": 4.358339945627665e-05, + "loss": 1.1169, + "step": 3194 + }, + { + "epoch": 0.23508852609795355, + "grad_norm": 0.74609375, + "learning_rate": 4.3579524854229215e-05, + "loss": 0.956, + "step": 3195 + }, + { + "epoch": 0.23516210623131756, + "grad_norm": 1.0234375, + "learning_rate": 4.3575649255053774e-05, + "loss": 0.903, + "step": 3196 + }, + { + "epoch": 0.23523568636468153, + "grad_norm": 0.83203125, + "learning_rate": 4.357177265895831e-05, + "loss": 0.9522, + "step": 3197 + }, + { + "epoch": 0.23530926649804554, + "grad_norm": 0.85546875, + "learning_rate": 4.3567895066150894e-05, + "loss": 0.8687, + "step": 3198 + }, + { + "epoch": 0.23538284663140951, + "grad_norm": 0.70703125, + "learning_rate": 4.356401647683962e-05, + "loss": 0.624, + "step": 3199 + }, + { + "epoch": 0.23545642676477352, + "grad_norm": 0.76171875, + "learning_rate": 4.356013689123264e-05, + "loss": 1.0651, + "step": 3200 + }, + { + "epoch": 0.2355300068981375, + "grad_norm": 0.68359375, + "learning_rate": 4.355625630953817e-05, + "loss": 0.7108, + "step": 3201 + }, + { + "epoch": 0.2356035870315015, + "grad_norm": 0.8828125, + "learning_rate": 4.355237473196447e-05, + "loss": 0.7632, + "step": 3202 + }, + { + "epoch": 0.23567716716486548, + "grad_norm": 0.765625, + "learning_rate": 4.354849215871986e-05, + "loss": 0.704, + "step": 3203 + }, + { + "epoch": 0.23575074729822948, + "grad_norm": 0.86328125, + "learning_rate": 4.354460859001272e-05, + "loss": 1.0499, + "step": 3204 + }, + { + "epoch": 0.23582432743159348, + "grad_norm": 0.8046875, + "learning_rate": 4.354072402605144e-05, + "loss": 0.992, + "step": 3205 + }, + { + "epoch": 0.23589790756495746, + "grad_norm": 0.9609375, + "learning_rate": 4.353683846704453e-05, + "loss": 0.788, + "step": 3206 + }, + { + "epoch": 0.23597148769832146, + "grad_norm": 0.65234375, + "learning_rate": 4.3532951913200516e-05, + "loss": 0.6209, + "step": 3207 + }, + { + "epoch": 0.23604506783168544, + "grad_norm": 0.90234375, + "learning_rate": 4.3529064364727974e-05, + "loss": 1.0249, + "step": 3208 + }, + { + "epoch": 0.23611864796504944, + "grad_norm": 0.83984375, + "learning_rate": 4.352517582183554e-05, + "loss": 1.0625, + "step": 3209 + }, + { + "epoch": 0.23619222809841342, + "grad_norm": 0.96875, + "learning_rate": 4.352128628473191e-05, + "loss": 0.8386, + "step": 3210 + }, + { + "epoch": 0.23626580823177742, + "grad_norm": 0.83203125, + "learning_rate": 4.351739575362583e-05, + "loss": 0.976, + "step": 3211 + }, + { + "epoch": 0.2363393883651414, + "grad_norm": 0.875, + "learning_rate": 4.3513504228726096e-05, + "loss": 1.0203, + "step": 3212 + }, + { + "epoch": 0.2364129684985054, + "grad_norm": 0.98828125, + "learning_rate": 4.350961171024155e-05, + "loss": 0.9551, + "step": 3213 + }, + { + "epoch": 0.2364865486318694, + "grad_norm": 1.015625, + "learning_rate": 4.350571819838111e-05, + "loss": 1.2153, + "step": 3214 + }, + { + "epoch": 0.23656012876523339, + "grad_norm": 0.71875, + "learning_rate": 4.350182369335372e-05, + "loss": 0.7942, + "step": 3215 + }, + { + "epoch": 0.2366337088985974, + "grad_norm": 0.8828125, + "learning_rate": 4.3497928195368406e-05, + "loss": 1.0052, + "step": 3216 + }, + { + "epoch": 0.23670728903196137, + "grad_norm": 0.76171875, + "learning_rate": 4.349403170463421e-05, + "loss": 0.6689, + "step": 3217 + }, + { + "epoch": 0.23678086916532537, + "grad_norm": 0.9921875, + "learning_rate": 4.349013422136028e-05, + "loss": 1.0577, + "step": 3218 + }, + { + "epoch": 0.23685444929868935, + "grad_norm": 0.66015625, + "learning_rate": 4.348623574575575e-05, + "loss": 0.651, + "step": 3219 + }, + { + "epoch": 0.23692802943205335, + "grad_norm": 0.85546875, + "learning_rate": 4.348233627802988e-05, + "loss": 0.8211, + "step": 3220 + }, + { + "epoch": 0.23700160956541733, + "grad_norm": 0.7890625, + "learning_rate": 4.347843581839193e-05, + "loss": 1.2313, + "step": 3221 + }, + { + "epoch": 0.23707518969878133, + "grad_norm": 0.72265625, + "learning_rate": 4.3474534367051216e-05, + "loss": 0.632, + "step": 3222 + }, + { + "epoch": 0.23714876983214533, + "grad_norm": 0.984375, + "learning_rate": 4.347063192421714e-05, + "loss": 1.0363, + "step": 3223 + }, + { + "epoch": 0.2372223499655093, + "grad_norm": 0.8984375, + "learning_rate": 4.3466728490099143e-05, + "loss": 1.0407, + "step": 3224 + }, + { + "epoch": 0.23729593009887331, + "grad_norm": 0.78515625, + "learning_rate": 4.3462824064906704e-05, + "loss": 0.8869, + "step": 3225 + }, + { + "epoch": 0.2373695102322373, + "grad_norm": 1.0546875, + "learning_rate": 4.345891864884937e-05, + "loss": 1.3454, + "step": 3226 + }, + { + "epoch": 0.2374430903656013, + "grad_norm": 0.77734375, + "learning_rate": 4.345501224213673e-05, + "loss": 0.8759, + "step": 3227 + }, + { + "epoch": 0.23751667049896527, + "grad_norm": 0.91015625, + "learning_rate": 4.3451104844978444e-05, + "loss": 1.0211, + "step": 3228 + }, + { + "epoch": 0.23759025063232928, + "grad_norm": 0.890625, + "learning_rate": 4.34471964575842e-05, + "loss": 0.7893, + "step": 3229 + }, + { + "epoch": 0.23766383076569325, + "grad_norm": 0.88671875, + "learning_rate": 4.3443287080163777e-05, + "loss": 1.0153, + "step": 3230 + }, + { + "epoch": 0.23773741089905726, + "grad_norm": 0.828125, + "learning_rate": 4.343937671292697e-05, + "loss": 1.2079, + "step": 3231 + }, + { + "epoch": 0.23781099103242126, + "grad_norm": 1.1796875, + "learning_rate": 4.343546535608364e-05, + "loss": 1.1658, + "step": 3232 + }, + { + "epoch": 0.23788457116578524, + "grad_norm": 0.765625, + "learning_rate": 4.343155300984369e-05, + "loss": 0.6722, + "step": 3233 + }, + { + "epoch": 0.23795815129914924, + "grad_norm": 1.03125, + "learning_rate": 4.342763967441712e-05, + "loss": 1.0483, + "step": 3234 + }, + { + "epoch": 0.23803173143251322, + "grad_norm": 0.90625, + "learning_rate": 4.342372535001393e-05, + "loss": 0.8137, + "step": 3235 + }, + { + "epoch": 0.23810531156587722, + "grad_norm": 0.875, + "learning_rate": 4.34198100368442e-05, + "loss": 1.3357, + "step": 3236 + }, + { + "epoch": 0.2381788916992412, + "grad_norm": 0.8359375, + "learning_rate": 4.341589373511805e-05, + "loss": 0.7589, + "step": 3237 + }, + { + "epoch": 0.2382524718326052, + "grad_norm": 0.87890625, + "learning_rate": 4.3411976445045664e-05, + "loss": 0.9209, + "step": 3238 + }, + { + "epoch": 0.23832605196596918, + "grad_norm": 0.94140625, + "learning_rate": 4.340805816683728e-05, + "loss": 1.4987, + "step": 3239 + }, + { + "epoch": 0.23839963209933318, + "grad_norm": 0.73046875, + "learning_rate": 4.340413890070318e-05, + "loss": 0.5365, + "step": 3240 + }, + { + "epoch": 0.23847321223269718, + "grad_norm": 1.0703125, + "learning_rate": 4.340021864685371e-05, + "loss": 1.1462, + "step": 3241 + }, + { + "epoch": 0.23854679236606116, + "grad_norm": 0.8515625, + "learning_rate": 4.339629740549926e-05, + "loss": 0.7768, + "step": 3242 + }, + { + "epoch": 0.23862037249942517, + "grad_norm": 0.6875, + "learning_rate": 4.339237517685027e-05, + "loss": 0.7225, + "step": 3243 + }, + { + "epoch": 0.23869395263278914, + "grad_norm": 0.96484375, + "learning_rate": 4.3388451961117246e-05, + "loss": 0.7952, + "step": 3244 + }, + { + "epoch": 0.23876753276615315, + "grad_norm": 0.91015625, + "learning_rate": 4.338452775851073e-05, + "loss": 0.8929, + "step": 3245 + }, + { + "epoch": 0.23884111289951712, + "grad_norm": 0.83203125, + "learning_rate": 4.338060256924134e-05, + "loss": 0.9899, + "step": 3246 + }, + { + "epoch": 0.23891469303288113, + "grad_norm": 0.8125, + "learning_rate": 4.337667639351972e-05, + "loss": 1.0291, + "step": 3247 + }, + { + "epoch": 0.2389882731662451, + "grad_norm": 0.76953125, + "learning_rate": 4.337274923155659e-05, + "loss": 0.6663, + "step": 3248 + }, + { + "epoch": 0.2390618532996091, + "grad_norm": 0.90234375, + "learning_rate": 4.33688210835627e-05, + "loss": 1.2437, + "step": 3249 + }, + { + "epoch": 0.2391354334329731, + "grad_norm": 0.78125, + "learning_rate": 4.336489194974889e-05, + "loss": 0.7705, + "step": 3250 + }, + { + "epoch": 0.2392090135663371, + "grad_norm": 0.94921875, + "learning_rate": 4.336096183032601e-05, + "loss": 1.0608, + "step": 3251 + }, + { + "epoch": 0.2392825936997011, + "grad_norm": 0.7734375, + "learning_rate": 4.335703072550499e-05, + "loss": 0.6806, + "step": 3252 + }, + { + "epoch": 0.23935617383306507, + "grad_norm": 0.65625, + "learning_rate": 4.33530986354968e-05, + "loss": 0.68, + "step": 3253 + }, + { + "epoch": 0.23942975396642907, + "grad_norm": 1.015625, + "learning_rate": 4.334916556051247e-05, + "loss": 1.1363, + "step": 3254 + }, + { + "epoch": 0.23950333409979305, + "grad_norm": 0.69921875, + "learning_rate": 4.3345231500763084e-05, + "loss": 0.7675, + "step": 3255 + }, + { + "epoch": 0.23957691423315705, + "grad_norm": 0.94140625, + "learning_rate": 4.334129645645977e-05, + "loss": 1.0675, + "step": 3256 + }, + { + "epoch": 0.23965049436652103, + "grad_norm": 0.7265625, + "learning_rate": 4.333736042781371e-05, + "loss": 0.6276, + "step": 3257 + }, + { + "epoch": 0.23972407449988503, + "grad_norm": 0.96484375, + "learning_rate": 4.333342341503616e-05, + "loss": 0.7948, + "step": 3258 + }, + { + "epoch": 0.23979765463324904, + "grad_norm": 0.99609375, + "learning_rate": 4.3329485418338403e-05, + "loss": 0.8094, + "step": 3259 + }, + { + "epoch": 0.239871234766613, + "grad_norm": 0.92578125, + "learning_rate": 4.332554643793177e-05, + "loss": 0.9658, + "step": 3260 + }, + { + "epoch": 0.23994481489997702, + "grad_norm": 0.92578125, + "learning_rate": 4.332160647402769e-05, + "loss": 0.9935, + "step": 3261 + }, + { + "epoch": 0.240018395033341, + "grad_norm": 0.86328125, + "learning_rate": 4.331766552683758e-05, + "loss": 1.2022, + "step": 3262 + }, + { + "epoch": 0.240091975166705, + "grad_norm": 0.7890625, + "learning_rate": 4.331372359657297e-05, + "loss": 0.731, + "step": 3263 + }, + { + "epoch": 0.24016555530006897, + "grad_norm": 0.8515625, + "learning_rate": 4.33097806834454e-05, + "loss": 0.7802, + "step": 3264 + }, + { + "epoch": 0.24023913543343298, + "grad_norm": 0.76171875, + "learning_rate": 4.3305836787666475e-05, + "loss": 0.6186, + "step": 3265 + }, + { + "epoch": 0.24031271556679695, + "grad_norm": 0.9375, + "learning_rate": 4.330189190944788e-05, + "loss": 1.4881, + "step": 3266 + }, + { + "epoch": 0.24038629570016096, + "grad_norm": 1.015625, + "learning_rate": 4.329794604900131e-05, + "loss": 1.1482, + "step": 3267 + }, + { + "epoch": 0.24045987583352496, + "grad_norm": 0.84375, + "learning_rate": 4.329399920653852e-05, + "loss": 0.8388, + "step": 3268 + }, + { + "epoch": 0.24053345596688894, + "grad_norm": 0.93359375, + "learning_rate": 4.3290051382271356e-05, + "loss": 1.34, + "step": 3269 + }, + { + "epoch": 0.24060703610025294, + "grad_norm": 1.015625, + "learning_rate": 4.328610257641168e-05, + "loss": 1.4536, + "step": 3270 + }, + { + "epoch": 0.24068061623361692, + "grad_norm": 0.94140625, + "learning_rate": 4.3282152789171414e-05, + "loss": 0.9708, + "step": 3271 + }, + { + "epoch": 0.24075419636698092, + "grad_norm": 0.7109375, + "learning_rate": 4.327820202076254e-05, + "loss": 0.7082, + "step": 3272 + }, + { + "epoch": 0.2408277765003449, + "grad_norm": 0.8515625, + "learning_rate": 4.3274250271397085e-05, + "loss": 0.8295, + "step": 3273 + }, + { + "epoch": 0.2409013566337089, + "grad_norm": 0.76953125, + "learning_rate": 4.327029754128714e-05, + "loss": 0.7234, + "step": 3274 + }, + { + "epoch": 0.24097493676707288, + "grad_norm": 1.0390625, + "learning_rate": 4.326634383064482e-05, + "loss": 1.2526, + "step": 3275 + }, + { + "epoch": 0.24104851690043688, + "grad_norm": 0.99609375, + "learning_rate": 4.326238913968234e-05, + "loss": 1.2503, + "step": 3276 + }, + { + "epoch": 0.2411220970338009, + "grad_norm": 0.87109375, + "learning_rate": 4.3258433468611914e-05, + "loss": 0.6983, + "step": 3277 + }, + { + "epoch": 0.24119567716716486, + "grad_norm": 0.8515625, + "learning_rate": 4.325447681764586e-05, + "loss": 0.8075, + "step": 3278 + }, + { + "epoch": 0.24126925730052887, + "grad_norm": 0.875, + "learning_rate": 4.325051918699651e-05, + "loss": 0.8359, + "step": 3279 + }, + { + "epoch": 0.24134283743389284, + "grad_norm": 0.9765625, + "learning_rate": 4.324656057687626e-05, + "loss": 1.1151, + "step": 3280 + }, + { + "epoch": 0.24141641756725685, + "grad_norm": 0.8046875, + "learning_rate": 4.324260098749757e-05, + "loss": 0.6284, + "step": 3281 + }, + { + "epoch": 0.24148999770062082, + "grad_norm": 0.94140625, + "learning_rate": 4.3238640419072945e-05, + "loss": 1.3291, + "step": 3282 + }, + { + "epoch": 0.24156357783398483, + "grad_norm": 0.75, + "learning_rate": 4.323467887181494e-05, + "loss": 0.7874, + "step": 3283 + }, + { + "epoch": 0.2416371579673488, + "grad_norm": 0.84375, + "learning_rate": 4.323071634593615e-05, + "loss": 1.1782, + "step": 3284 + }, + { + "epoch": 0.2417107381007128, + "grad_norm": 0.84765625, + "learning_rate": 4.322675284164925e-05, + "loss": 0.6891, + "step": 3285 + }, + { + "epoch": 0.2417843182340768, + "grad_norm": 0.83984375, + "learning_rate": 4.3222788359166957e-05, + "loss": 0.7173, + "step": 3286 + }, + { + "epoch": 0.2418578983674408, + "grad_norm": 0.78515625, + "learning_rate": 4.321882289870202e-05, + "loss": 1.1282, + "step": 3287 + }, + { + "epoch": 0.2419314785008048, + "grad_norm": 0.69140625, + "learning_rate": 4.321485646046727e-05, + "loss": 0.969, + "step": 3288 + }, + { + "epoch": 0.24200505863416877, + "grad_norm": 0.82421875, + "learning_rate": 4.321088904467559e-05, + "loss": 0.9398, + "step": 3289 + }, + { + "epoch": 0.24207863876753277, + "grad_norm": 0.9375, + "learning_rate": 4.3206920651539873e-05, + "loss": 0.8897, + "step": 3290 + }, + { + "epoch": 0.24215221890089675, + "grad_norm": 0.82421875, + "learning_rate": 4.320295128127312e-05, + "loss": 0.8354, + "step": 3291 + }, + { + "epoch": 0.24222579903426075, + "grad_norm": 0.765625, + "learning_rate": 4.3198980934088354e-05, + "loss": 0.7275, + "step": 3292 + }, + { + "epoch": 0.24229937916762473, + "grad_norm": 0.81640625, + "learning_rate": 4.319500961019865e-05, + "loss": 1.0167, + "step": 3293 + }, + { + "epoch": 0.24237295930098873, + "grad_norm": 0.71484375, + "learning_rate": 4.319103730981715e-05, + "loss": 0.815, + "step": 3294 + }, + { + "epoch": 0.24244653943435274, + "grad_norm": 0.84375, + "learning_rate": 4.318706403315703e-05, + "loss": 0.8111, + "step": 3295 + }, + { + "epoch": 0.2425201195677167, + "grad_norm": 1.0625, + "learning_rate": 4.318308978043154e-05, + "loss": 1.174, + "step": 3296 + }, + { + "epoch": 0.24259369970108072, + "grad_norm": 0.9375, + "learning_rate": 4.317911455185396e-05, + "loss": 1.1623, + "step": 3297 + }, + { + "epoch": 0.2426672798344447, + "grad_norm": 0.8359375, + "learning_rate": 4.3175138347637646e-05, + "loss": 0.6876, + "step": 3298 + }, + { + "epoch": 0.2427408599678087, + "grad_norm": 1.03125, + "learning_rate": 4.317116116799598e-05, + "loss": 1.4496, + "step": 3299 + }, + { + "epoch": 0.24281444010117267, + "grad_norm": 0.80078125, + "learning_rate": 4.316718301314241e-05, + "loss": 1.355, + "step": 3300 + }, + { + "epoch": 0.24288802023453668, + "grad_norm": 0.7890625, + "learning_rate": 4.316320388329044e-05, + "loss": 0.7542, + "step": 3301 + }, + { + "epoch": 0.24296160036790065, + "grad_norm": 0.77734375, + "learning_rate": 4.315922377865363e-05, + "loss": 0.9499, + "step": 3302 + }, + { + "epoch": 0.24303518050126466, + "grad_norm": 0.65234375, + "learning_rate": 4.315524269944557e-05, + "loss": 0.5671, + "step": 3303 + }, + { + "epoch": 0.24310876063462866, + "grad_norm": 0.8046875, + "learning_rate": 4.3151260645879934e-05, + "loss": 0.6535, + "step": 3304 + }, + { + "epoch": 0.24318234076799264, + "grad_norm": 0.91796875, + "learning_rate": 4.3147277618170415e-05, + "loss": 1.1626, + "step": 3305 + }, + { + "epoch": 0.24325592090135664, + "grad_norm": 0.96484375, + "learning_rate": 4.3143293616530784e-05, + "loss": 0.9798, + "step": 3306 + }, + { + "epoch": 0.24332950103472062, + "grad_norm": 0.82421875, + "learning_rate": 4.3139308641174855e-05, + "loss": 0.9162, + "step": 3307 + }, + { + "epoch": 0.24340308116808462, + "grad_norm": 0.75, + "learning_rate": 4.3135322692316484e-05, + "loss": 0.7868, + "step": 3308 + }, + { + "epoch": 0.2434766613014486, + "grad_norm": 1.1484375, + "learning_rate": 4.31313357701696e-05, + "loss": 1.0084, + "step": 3309 + }, + { + "epoch": 0.2435502414348126, + "grad_norm": 0.98046875, + "learning_rate": 4.3127347874948177e-05, + "loss": 1.2432, + "step": 3310 + }, + { + "epoch": 0.24362382156817658, + "grad_norm": 0.7890625, + "learning_rate": 4.312335900686623e-05, + "loss": 0.8526, + "step": 3311 + }, + { + "epoch": 0.24369740170154058, + "grad_norm": 0.8203125, + "learning_rate": 4.311936916613782e-05, + "loss": 0.7488, + "step": 3312 + }, + { + "epoch": 0.2437709818349046, + "grad_norm": 0.9140625, + "learning_rate": 4.311537835297711e-05, + "loss": 0.8447, + "step": 3313 + }, + { + "epoch": 0.24384456196826856, + "grad_norm": 1.046875, + "learning_rate": 4.3111386567598245e-05, + "loss": 1.2009, + "step": 3314 + }, + { + "epoch": 0.24391814210163257, + "grad_norm": 0.796875, + "learning_rate": 4.3107393810215475e-05, + "loss": 0.912, + "step": 3315 + }, + { + "epoch": 0.24399172223499654, + "grad_norm": 1.015625, + "learning_rate": 4.310340008104308e-05, + "loss": 1.3959, + "step": 3316 + }, + { + "epoch": 0.24406530236836055, + "grad_norm": 0.9921875, + "learning_rate": 4.30994053802954e-05, + "loss": 1.2222, + "step": 3317 + }, + { + "epoch": 0.24413888250172452, + "grad_norm": 0.96875, + "learning_rate": 4.3095409708186804e-05, + "loss": 1.1964, + "step": 3318 + }, + { + "epoch": 0.24421246263508853, + "grad_norm": 1.015625, + "learning_rate": 4.309141306493176e-05, + "loss": 1.406, + "step": 3319 + }, + { + "epoch": 0.2442860427684525, + "grad_norm": 0.7578125, + "learning_rate": 4.308741545074474e-05, + "loss": 0.6776, + "step": 3320 + }, + { + "epoch": 0.2443596229018165, + "grad_norm": 0.89453125, + "learning_rate": 4.3083416865840296e-05, + "loss": 0.68, + "step": 3321 + }, + { + "epoch": 0.2444332030351805, + "grad_norm": 0.7890625, + "learning_rate": 4.307941731043302e-05, + "loss": 0.8754, + "step": 3322 + }, + { + "epoch": 0.2445067831685445, + "grad_norm": 0.83984375, + "learning_rate": 4.307541678473758e-05, + "loss": 0.9996, + "step": 3323 + }, + { + "epoch": 0.2445803633019085, + "grad_norm": 0.81640625, + "learning_rate": 4.307141528896864e-05, + "loss": 0.7086, + "step": 3324 + }, + { + "epoch": 0.24465394343527247, + "grad_norm": 0.828125, + "learning_rate": 4.306741282334099e-05, + "loss": 0.9456, + "step": 3325 + }, + { + "epoch": 0.24472752356863647, + "grad_norm": 0.84375, + "learning_rate": 4.306340938806941e-05, + "loss": 1.3534, + "step": 3326 + }, + { + "epoch": 0.24480110370200045, + "grad_norm": 0.80859375, + "learning_rate": 4.305940498336876e-05, + "loss": 0.7491, + "step": 3327 + }, + { + "epoch": 0.24487468383536445, + "grad_norm": 0.8203125, + "learning_rate": 4.305539960945396e-05, + "loss": 0.9569, + "step": 3328 + }, + { + "epoch": 0.24494826396872843, + "grad_norm": 0.76953125, + "learning_rate": 4.3051393266539964e-05, + "loss": 0.8684, + "step": 3329 + }, + { + "epoch": 0.24502184410209243, + "grad_norm": 0.7421875, + "learning_rate": 4.304738595484179e-05, + "loss": 0.9582, + "step": 3330 + }, + { + "epoch": 0.24509542423545644, + "grad_norm": 0.875, + "learning_rate": 4.304337767457449e-05, + "loss": 0.9837, + "step": 3331 + }, + { + "epoch": 0.24516900436882041, + "grad_norm": 0.8203125, + "learning_rate": 4.30393684259532e-05, + "loss": 1.0825, + "step": 3332 + }, + { + "epoch": 0.24524258450218442, + "grad_norm": 0.93359375, + "learning_rate": 4.3035358209193065e-05, + "loss": 1.397, + "step": 3333 + }, + { + "epoch": 0.2453161646355484, + "grad_norm": 0.796875, + "learning_rate": 4.3031347024509336e-05, + "loss": 0.8771, + "step": 3334 + }, + { + "epoch": 0.2453897447689124, + "grad_norm": 0.9375, + "learning_rate": 4.302733487211725e-05, + "loss": 0.9097, + "step": 3335 + }, + { + "epoch": 0.24546332490227638, + "grad_norm": 0.80078125, + "learning_rate": 4.302332175223216e-05, + "loss": 0.9695, + "step": 3336 + }, + { + "epoch": 0.24553690503564038, + "grad_norm": 0.92578125, + "learning_rate": 4.301930766506943e-05, + "loss": 1.063, + "step": 3337 + }, + { + "epoch": 0.24561048516900436, + "grad_norm": 0.73828125, + "learning_rate": 4.301529261084449e-05, + "loss": 0.7604, + "step": 3338 + }, + { + "epoch": 0.24568406530236836, + "grad_norm": 0.8828125, + "learning_rate": 4.301127658977283e-05, + "loss": 0.822, + "step": 3339 + }, + { + "epoch": 0.24575764543573236, + "grad_norm": 0.78515625, + "learning_rate": 4.3007259602069974e-05, + "loss": 0.8355, + "step": 3340 + }, + { + "epoch": 0.24583122556909634, + "grad_norm": 0.765625, + "learning_rate": 4.3003241647951494e-05, + "loss": 0.7786, + "step": 3341 + }, + { + "epoch": 0.24590480570246034, + "grad_norm": 0.9453125, + "learning_rate": 4.299922272763305e-05, + "loss": 1.0491, + "step": 3342 + }, + { + "epoch": 0.24597838583582432, + "grad_norm": 0.953125, + "learning_rate": 4.2995202841330305e-05, + "loss": 0.8744, + "step": 3343 + }, + { + "epoch": 0.24605196596918832, + "grad_norm": 0.92578125, + "learning_rate": 4.299118198925902e-05, + "loss": 1.2905, + "step": 3344 + }, + { + "epoch": 0.2461255461025523, + "grad_norm": 0.73828125, + "learning_rate": 4.2987160171634975e-05, + "loss": 0.8037, + "step": 3345 + }, + { + "epoch": 0.2461991262359163, + "grad_norm": 0.703125, + "learning_rate": 4.2983137388674024e-05, + "loss": 0.855, + "step": 3346 + }, + { + "epoch": 0.24627270636928028, + "grad_norm": 0.87890625, + "learning_rate": 4.297911364059205e-05, + "loss": 0.9314, + "step": 3347 + }, + { + "epoch": 0.24634628650264428, + "grad_norm": 0.92578125, + "learning_rate": 4.2975088927605e-05, + "loss": 0.9432, + "step": 3348 + }, + { + "epoch": 0.2464198666360083, + "grad_norm": 0.9609375, + "learning_rate": 4.297106324992888e-05, + "loss": 1.0491, + "step": 3349 + }, + { + "epoch": 0.24649344676937227, + "grad_norm": 0.796875, + "learning_rate": 4.2967036607779745e-05, + "loss": 1.1294, + "step": 3350 + }, + { + "epoch": 0.24656702690273627, + "grad_norm": 0.87109375, + "learning_rate": 4.2963009001373684e-05, + "loss": 1.3044, + "step": 3351 + }, + { + "epoch": 0.24664060703610025, + "grad_norm": 0.85546875, + "learning_rate": 4.295898043092685e-05, + "loss": 0.7408, + "step": 3352 + }, + { + "epoch": 0.24671418716946425, + "grad_norm": 0.734375, + "learning_rate": 4.2954950896655466e-05, + "loss": 0.8605, + "step": 3353 + }, + { + "epoch": 0.24678776730282823, + "grad_norm": 0.6953125, + "learning_rate": 4.2950920398775783e-05, + "loss": 0.7389, + "step": 3354 + }, + { + "epoch": 0.24686134743619223, + "grad_norm": 1.1328125, + "learning_rate": 4.29468889375041e-05, + "loss": 1.1959, + "step": 3355 + }, + { + "epoch": 0.2469349275695562, + "grad_norm": 0.86328125, + "learning_rate": 4.2942856513056785e-05, + "loss": 1.1662, + "step": 3356 + }, + { + "epoch": 0.2470085077029202, + "grad_norm": 0.90234375, + "learning_rate": 4.293882312565025e-05, + "loss": 0.8959, + "step": 3357 + }, + { + "epoch": 0.24708208783628421, + "grad_norm": 1.015625, + "learning_rate": 4.293478877550096e-05, + "loss": 0.9975, + "step": 3358 + }, + { + "epoch": 0.2471556679696482, + "grad_norm": 1.0078125, + "learning_rate": 4.293075346282543e-05, + "loss": 1.4542, + "step": 3359 + }, + { + "epoch": 0.2472292481030122, + "grad_norm": 0.875, + "learning_rate": 4.2926717187840225e-05, + "loss": 0.6137, + "step": 3360 + }, + { + "epoch": 0.24730282823637617, + "grad_norm": 0.84765625, + "learning_rate": 4.2922679950761975e-05, + "loss": 0.9547, + "step": 3361 + }, + { + "epoch": 0.24737640836974017, + "grad_norm": 0.87890625, + "learning_rate": 4.291864175180734e-05, + "loss": 0.9009, + "step": 3362 + }, + { + "epoch": 0.24744998850310415, + "grad_norm": 0.83984375, + "learning_rate": 4.2914602591193045e-05, + "loss": 0.9671, + "step": 3363 + }, + { + "epoch": 0.24752356863646816, + "grad_norm": 0.94921875, + "learning_rate": 4.2910562469135864e-05, + "loss": 0.9419, + "step": 3364 + }, + { + "epoch": 0.24759714876983213, + "grad_norm": 0.82421875, + "learning_rate": 4.290652138585262e-05, + "loss": 0.8009, + "step": 3365 + }, + { + "epoch": 0.24767072890319614, + "grad_norm": 0.9609375, + "learning_rate": 4.2902479341560195e-05, + "loss": 1.1007, + "step": 3366 + }, + { + "epoch": 0.24774430903656014, + "grad_norm": 1.0625, + "learning_rate": 4.2898436336475525e-05, + "loss": 0.8986, + "step": 3367 + }, + { + "epoch": 0.24781788916992412, + "grad_norm": 0.76953125, + "learning_rate": 4.289439237081557e-05, + "loss": 0.6785, + "step": 3368 + }, + { + "epoch": 0.24789146930328812, + "grad_norm": 1.0390625, + "learning_rate": 4.2890347444797384e-05, + "loss": 1.1315, + "step": 3369 + }, + { + "epoch": 0.2479650494366521, + "grad_norm": 0.79296875, + "learning_rate": 4.2886301558638035e-05, + "loss": 0.7375, + "step": 3370 + }, + { + "epoch": 0.2480386295700161, + "grad_norm": 0.953125, + "learning_rate": 4.288225471255467e-05, + "loss": 0.9889, + "step": 3371 + }, + { + "epoch": 0.24811220970338008, + "grad_norm": 0.83984375, + "learning_rate": 4.2878206906764466e-05, + "loss": 0.8403, + "step": 3372 + }, + { + "epoch": 0.24818578983674408, + "grad_norm": 0.7265625, + "learning_rate": 4.2874158141484665e-05, + "loss": 0.6619, + "step": 3373 + }, + { + "epoch": 0.24825936997010806, + "grad_norm": 0.77734375, + "learning_rate": 4.287010841693255e-05, + "loss": 0.9374, + "step": 3374 + }, + { + "epoch": 0.24833295010347206, + "grad_norm": 0.6953125, + "learning_rate": 4.286605773332548e-05, + "loss": 0.7605, + "step": 3375 + }, + { + "epoch": 0.24840653023683607, + "grad_norm": 0.765625, + "learning_rate": 4.286200609088082e-05, + "loss": 0.8482, + "step": 3376 + }, + { + "epoch": 0.24848011037020004, + "grad_norm": 0.97265625, + "learning_rate": 4.2857953489816046e-05, + "loss": 1.5558, + "step": 3377 + }, + { + "epoch": 0.24855369050356405, + "grad_norm": 0.84765625, + "learning_rate": 4.285389993034863e-05, + "loss": 1.1048, + "step": 3378 + }, + { + "epoch": 0.24862727063692802, + "grad_norm": 0.890625, + "learning_rate": 4.284984541269612e-05, + "loss": 1.2074, + "step": 3379 + }, + { + "epoch": 0.24870085077029203, + "grad_norm": 0.80859375, + "learning_rate": 4.284578993707613e-05, + "loss": 0.8701, + "step": 3380 + }, + { + "epoch": 0.248774430903656, + "grad_norm": 0.7109375, + "learning_rate": 4.284173350370629e-05, + "loss": 0.6837, + "step": 3381 + }, + { + "epoch": 0.24884801103702, + "grad_norm": 1.125, + "learning_rate": 4.2837676112804323e-05, + "loss": 1.0353, + "step": 3382 + }, + { + "epoch": 0.248921591170384, + "grad_norm": 0.8515625, + "learning_rate": 4.283361776458796e-05, + "loss": 0.8353, + "step": 3383 + }, + { + "epoch": 0.248995171303748, + "grad_norm": 0.9140625, + "learning_rate": 4.282955845927502e-05, + "loss": 1.0915, + "step": 3384 + }, + { + "epoch": 0.249068751437112, + "grad_norm": 0.8984375, + "learning_rate": 4.282549819708335e-05, + "loss": 0.8539, + "step": 3385 + }, + { + "epoch": 0.24914233157047597, + "grad_norm": 0.86328125, + "learning_rate": 4.282143697823086e-05, + "loss": 1.1669, + "step": 3386 + }, + { + "epoch": 0.24921591170383997, + "grad_norm": 0.88671875, + "learning_rate": 4.281737480293551e-05, + "loss": 0.9369, + "step": 3387 + }, + { + "epoch": 0.24928949183720395, + "grad_norm": 0.75390625, + "learning_rate": 4.2813311671415305e-05, + "loss": 0.7198, + "step": 3388 + }, + { + "epoch": 0.24936307197056795, + "grad_norm": 0.89453125, + "learning_rate": 4.28092475838883e-05, + "loss": 0.7981, + "step": 3389 + }, + { + "epoch": 0.24943665210393193, + "grad_norm": 0.91015625, + "learning_rate": 4.280518254057262e-05, + "loss": 1.0883, + "step": 3390 + }, + { + "epoch": 0.24951023223729593, + "grad_norm": 1.1796875, + "learning_rate": 4.280111654168642e-05, + "loss": 1.2146, + "step": 3391 + }, + { + "epoch": 0.24958381237065994, + "grad_norm": 0.85546875, + "learning_rate": 4.279704958744792e-05, + "loss": 0.9971, + "step": 3392 + }, + { + "epoch": 0.2496573925040239, + "grad_norm": 0.98828125, + "learning_rate": 4.279298167807538e-05, + "loss": 1.0907, + "step": 3393 + }, + { + "epoch": 0.24973097263738792, + "grad_norm": 0.80078125, + "learning_rate": 4.278891281378713e-05, + "loss": 0.9807, + "step": 3394 + }, + { + "epoch": 0.2498045527707519, + "grad_norm": 0.984375, + "learning_rate": 4.2784842994801517e-05, + "loss": 1.0418, + "step": 3395 + }, + { + "epoch": 0.2498781329041159, + "grad_norm": 0.76953125, + "learning_rate": 4.278077222133697e-05, + "loss": 0.665, + "step": 3396 + }, + { + "epoch": 0.24995171303747987, + "grad_norm": 1.0234375, + "learning_rate": 4.277670049361197e-05, + "loss": 1.1062, + "step": 3397 + }, + { + "epoch": 0.2500252931708439, + "grad_norm": 0.765625, + "learning_rate": 4.277262781184502e-05, + "loss": 0.8011, + "step": 3398 + }, + { + "epoch": 0.2500988733042079, + "grad_norm": 0.78125, + "learning_rate": 4.2768554176254705e-05, + "loss": 0.8366, + "step": 3399 + }, + { + "epoch": 0.25017245343757183, + "grad_norm": 1.0078125, + "learning_rate": 4.2764479587059646e-05, + "loss": 0.9901, + "step": 3400 + }, + { + "epoch": 0.25024603357093583, + "grad_norm": 0.9375, + "learning_rate": 4.276040404447853e-05, + "loss": 0.9289, + "step": 3401 + }, + { + "epoch": 0.25031961370429984, + "grad_norm": 0.83984375, + "learning_rate": 4.275632754873007e-05, + "loss": 1.0131, + "step": 3402 + }, + { + "epoch": 0.25039319383766384, + "grad_norm": 0.9296875, + "learning_rate": 4.275225010003304e-05, + "loss": 0.8746, + "step": 3403 + }, + { + "epoch": 0.25046677397102785, + "grad_norm": 0.91015625, + "learning_rate": 4.274817169860628e-05, + "loss": 1.025, + "step": 3404 + }, + { + "epoch": 0.2505403541043918, + "grad_norm": 0.796875, + "learning_rate": 4.274409234466867e-05, + "loss": 0.9384, + "step": 3405 + }, + { + "epoch": 0.2506139342377558, + "grad_norm": 0.8046875, + "learning_rate": 4.2740012038439135e-05, + "loss": 0.8604, + "step": 3406 + }, + { + "epoch": 0.2506875143711198, + "grad_norm": 0.828125, + "learning_rate": 4.273593078013666e-05, + "loss": 0.8112, + "step": 3407 + }, + { + "epoch": 0.2507610945044838, + "grad_norm": 0.90234375, + "learning_rate": 4.2731848569980275e-05, + "loss": 0.8045, + "step": 3408 + }, + { + "epoch": 0.25083467463784775, + "grad_norm": 0.73828125, + "learning_rate": 4.272776540818907e-05, + "loss": 0.821, + "step": 3409 + }, + { + "epoch": 0.25090825477121176, + "grad_norm": 0.7890625, + "learning_rate": 4.272368129498218e-05, + "loss": 0.8682, + "step": 3410 + }, + { + "epoch": 0.25098183490457576, + "grad_norm": 0.70703125, + "learning_rate": 4.271959623057879e-05, + "loss": 0.5253, + "step": 3411 + }, + { + "epoch": 0.25105541503793977, + "grad_norm": 1.0078125, + "learning_rate": 4.271551021519814e-05, + "loss": 1.1089, + "step": 3412 + }, + { + "epoch": 0.25112899517130377, + "grad_norm": 0.87109375, + "learning_rate": 4.271142324905951e-05, + "loss": 0.8853, + "step": 3413 + }, + { + "epoch": 0.2512025753046677, + "grad_norm": 0.8359375, + "learning_rate": 4.270733533238226e-05, + "loss": 0.9163, + "step": 3414 + }, + { + "epoch": 0.2512761554380317, + "grad_norm": 0.8828125, + "learning_rate": 4.270324646538576e-05, + "loss": 0.8709, + "step": 3415 + }, + { + "epoch": 0.2513497355713957, + "grad_norm": 0.76171875, + "learning_rate": 4.269915664828945e-05, + "loss": 1.1787, + "step": 3416 + }, + { + "epoch": 0.25142331570475973, + "grad_norm": 0.8203125, + "learning_rate": 4.269506588131284e-05, + "loss": 0.8657, + "step": 3417 + }, + { + "epoch": 0.2514968958381237, + "grad_norm": 0.78515625, + "learning_rate": 4.269097416467547e-05, + "loss": 0.9562, + "step": 3418 + }, + { + "epoch": 0.2515704759714877, + "grad_norm": 0.69140625, + "learning_rate": 4.268688149859692e-05, + "loss": 0.8756, + "step": 3419 + }, + { + "epoch": 0.2516440561048517, + "grad_norm": 0.875, + "learning_rate": 4.268278788329685e-05, + "loss": 0.7495, + "step": 3420 + }, + { + "epoch": 0.2517176362382157, + "grad_norm": 0.8828125, + "learning_rate": 4.267869331899495e-05, + "loss": 0.7501, + "step": 3421 + }, + { + "epoch": 0.2517912163715797, + "grad_norm": 0.91796875, + "learning_rate": 4.267459780591097e-05, + "loss": 1.1245, + "step": 3422 + }, + { + "epoch": 0.25186479650494364, + "grad_norm": 1.1796875, + "learning_rate": 4.267050134426471e-05, + "loss": 1.3597, + "step": 3423 + }, + { + "epoch": 0.25193837663830765, + "grad_norm": 0.92578125, + "learning_rate": 4.266640393427602e-05, + "loss": 1.1856, + "step": 3424 + }, + { + "epoch": 0.25201195677167165, + "grad_norm": 1.0546875, + "learning_rate": 4.2662305576164794e-05, + "loss": 1.0716, + "step": 3425 + }, + { + "epoch": 0.25208553690503566, + "grad_norm": 0.80859375, + "learning_rate": 4.265820627015099e-05, + "loss": 1.0572, + "step": 3426 + }, + { + "epoch": 0.2521591170383996, + "grad_norm": 0.7734375, + "learning_rate": 4.265410601645461e-05, + "loss": 0.7815, + "step": 3427 + }, + { + "epoch": 0.2522326971717636, + "grad_norm": 0.953125, + "learning_rate": 4.26500048152957e-05, + "loss": 1.116, + "step": 3428 + }, + { + "epoch": 0.2523062773051276, + "grad_norm": 1.046875, + "learning_rate": 4.2645902666894366e-05, + "loss": 1.1073, + "step": 3429 + }, + { + "epoch": 0.2523798574384916, + "grad_norm": 0.88671875, + "learning_rate": 4.264179957147077e-05, + "loss": 1.2016, + "step": 3430 + }, + { + "epoch": 0.2524534375718556, + "grad_norm": 0.96875, + "learning_rate": 4.2637695529245104e-05, + "loss": 0.8755, + "step": 3431 + }, + { + "epoch": 0.25252701770521957, + "grad_norm": 0.70703125, + "learning_rate": 4.263359054043763e-05, + "loss": 0.6902, + "step": 3432 + }, + { + "epoch": 0.2526005978385836, + "grad_norm": 0.828125, + "learning_rate": 4.262948460526867e-05, + "loss": 0.8044, + "step": 3433 + }, + { + "epoch": 0.2526741779719476, + "grad_norm": 0.80859375, + "learning_rate": 4.262537772395856e-05, + "loss": 0.8182, + "step": 3434 + }, + { + "epoch": 0.2527477581053116, + "grad_norm": 0.80859375, + "learning_rate": 4.262126989672772e-05, + "loss": 0.6779, + "step": 3435 + }, + { + "epoch": 0.25282133823867553, + "grad_norm": 1.2578125, + "learning_rate": 4.26171611237966e-05, + "loss": 0.9907, + "step": 3436 + }, + { + "epoch": 0.25289491837203953, + "grad_norm": 1.078125, + "learning_rate": 4.261305140538572e-05, + "loss": 1.2386, + "step": 3437 + }, + { + "epoch": 0.25296849850540354, + "grad_norm": 0.9921875, + "learning_rate": 4.260894074171564e-05, + "loss": 1.3068, + "step": 3438 + }, + { + "epoch": 0.25304207863876754, + "grad_norm": 0.87890625, + "learning_rate": 4.260482913300697e-05, + "loss": 0.7777, + "step": 3439 + }, + { + "epoch": 0.25311565877213155, + "grad_norm": 0.81640625, + "learning_rate": 4.260071657948036e-05, + "loss": 1.0844, + "step": 3440 + }, + { + "epoch": 0.2531892389054955, + "grad_norm": 0.796875, + "learning_rate": 4.259660308135655e-05, + "loss": 0.8623, + "step": 3441 + }, + { + "epoch": 0.2532628190388595, + "grad_norm": 0.8828125, + "learning_rate": 4.2592488638856274e-05, + "loss": 1.1151, + "step": 3442 + }, + { + "epoch": 0.2533363991722235, + "grad_norm": 0.75390625, + "learning_rate": 4.2588373252200374e-05, + "loss": 0.8084, + "step": 3443 + }, + { + "epoch": 0.2534099793055875, + "grad_norm": 0.76953125, + "learning_rate": 4.258425692160969e-05, + "loss": 0.9513, + "step": 3444 + }, + { + "epoch": 0.25348355943895146, + "grad_norm": 0.85546875, + "learning_rate": 4.258013964730515e-05, + "loss": 0.8542, + "step": 3445 + }, + { + "epoch": 0.25355713957231546, + "grad_norm": 0.84375, + "learning_rate": 4.257602142950773e-05, + "loss": 1.0133, + "step": 3446 + }, + { + "epoch": 0.25363071970567946, + "grad_norm": 0.7734375, + "learning_rate": 4.2571902268438426e-05, + "loss": 0.8895, + "step": 3447 + }, + { + "epoch": 0.25370429983904347, + "grad_norm": 0.80078125, + "learning_rate": 4.256778216431832e-05, + "loss": 1.0018, + "step": 3448 + }, + { + "epoch": 0.25377787997240747, + "grad_norm": 0.76171875, + "learning_rate": 4.2563661117368524e-05, + "loss": 0.938, + "step": 3449 + }, + { + "epoch": 0.2538514601057714, + "grad_norm": 1.1875, + "learning_rate": 4.255953912781021e-05, + "loss": 1.0347, + "step": 3450 + }, + { + "epoch": 0.2539250402391354, + "grad_norm": 0.7890625, + "learning_rate": 4.255541619586459e-05, + "loss": 0.8427, + "step": 3451 + }, + { + "epoch": 0.25399862037249943, + "grad_norm": 0.953125, + "learning_rate": 4.255129232175295e-05, + "loss": 1.2341, + "step": 3452 + }, + { + "epoch": 0.25407220050586343, + "grad_norm": 0.8203125, + "learning_rate": 4.25471675056966e-05, + "loss": 0.7688, + "step": 3453 + }, + { + "epoch": 0.2541457806392274, + "grad_norm": 0.9921875, + "learning_rate": 4.254304174791691e-05, + "loss": 1.219, + "step": 3454 + }, + { + "epoch": 0.2542193607725914, + "grad_norm": 0.7109375, + "learning_rate": 4.25389150486353e-05, + "loss": 0.6127, + "step": 3455 + }, + { + "epoch": 0.2542929409059554, + "grad_norm": 0.91796875, + "learning_rate": 4.253478740807325e-05, + "loss": 1.0402, + "step": 3456 + }, + { + "epoch": 0.2543665210393194, + "grad_norm": 0.9609375, + "learning_rate": 4.253065882645228e-05, + "loss": 0.9826, + "step": 3457 + }, + { + "epoch": 0.2544401011726834, + "grad_norm": 0.84765625, + "learning_rate": 4.252652930399395e-05, + "loss": 0.9286, + "step": 3458 + }, + { + "epoch": 0.25451368130604735, + "grad_norm": 0.7421875, + "learning_rate": 4.252239884091991e-05, + "loss": 0.6667, + "step": 3459 + }, + { + "epoch": 0.25458726143941135, + "grad_norm": 0.68359375, + "learning_rate": 4.251826743745181e-05, + "loss": 0.6704, + "step": 3460 + }, + { + "epoch": 0.25466084157277535, + "grad_norm": 1.03125, + "learning_rate": 4.251413509381138e-05, + "loss": 1.0261, + "step": 3461 + }, + { + "epoch": 0.25473442170613936, + "grad_norm": 0.7109375, + "learning_rate": 4.2510001810220415e-05, + "loss": 0.7498, + "step": 3462 + }, + { + "epoch": 0.2548080018395033, + "grad_norm": 0.8515625, + "learning_rate": 4.2505867586900705e-05, + "loss": 0.8054, + "step": 3463 + }, + { + "epoch": 0.2548815819728673, + "grad_norm": 0.5234375, + "learning_rate": 4.250173242407416e-05, + "loss": 0.4933, + "step": 3464 + }, + { + "epoch": 0.2549551621062313, + "grad_norm": 0.97265625, + "learning_rate": 4.2497596321962685e-05, + "loss": 1.0394, + "step": 3465 + }, + { + "epoch": 0.2550287422395953, + "grad_norm": 0.99609375, + "learning_rate": 4.249345928078827e-05, + "loss": 0.8976, + "step": 3466 + }, + { + "epoch": 0.2551023223729593, + "grad_norm": 0.93359375, + "learning_rate": 4.2489321300772927e-05, + "loss": 1.02, + "step": 3467 + }, + { + "epoch": 0.25517590250632327, + "grad_norm": 0.8984375, + "learning_rate": 4.248518238213875e-05, + "loss": 1.0248, + "step": 3468 + }, + { + "epoch": 0.2552494826396873, + "grad_norm": 1.09375, + "learning_rate": 4.2481042525107854e-05, + "loss": 1.3725, + "step": 3469 + }, + { + "epoch": 0.2553230627730513, + "grad_norm": 0.97265625, + "learning_rate": 4.247690172990242e-05, + "loss": 1.2737, + "step": 3470 + }, + { + "epoch": 0.2553966429064153, + "grad_norm": 0.9140625, + "learning_rate": 4.247275999674469e-05, + "loss": 0.8972, + "step": 3471 + }, + { + "epoch": 0.2554702230397793, + "grad_norm": 0.890625, + "learning_rate": 4.2468617325856924e-05, + "loss": 0.8607, + "step": 3472 + }, + { + "epoch": 0.25554380317314324, + "grad_norm": 0.77734375, + "learning_rate": 4.246447371746147e-05, + "loss": 0.7903, + "step": 3473 + }, + { + "epoch": 0.25561738330650724, + "grad_norm": 0.765625, + "learning_rate": 4.2460329171780684e-05, + "loss": 0.7285, + "step": 3474 + }, + { + "epoch": 0.25569096343987124, + "grad_norm": 1.0703125, + "learning_rate": 4.245618368903702e-05, + "loss": 1.1474, + "step": 3475 + }, + { + "epoch": 0.25576454357323525, + "grad_norm": 0.81640625, + "learning_rate": 4.245203726945294e-05, + "loss": 1.3079, + "step": 3476 + }, + { + "epoch": 0.2558381237065992, + "grad_norm": 1.0, + "learning_rate": 4.244788991325098e-05, + "loss": 1.0751, + "step": 3477 + }, + { + "epoch": 0.2559117038399632, + "grad_norm": 0.92578125, + "learning_rate": 4.244374162065373e-05, + "loss": 1.1544, + "step": 3478 + }, + { + "epoch": 0.2559852839733272, + "grad_norm": 0.82421875, + "learning_rate": 4.243959239188381e-05, + "loss": 1.2168, + "step": 3479 + }, + { + "epoch": 0.2560588641066912, + "grad_norm": 0.703125, + "learning_rate": 4.243544222716391e-05, + "loss": 0.9321, + "step": 3480 + }, + { + "epoch": 0.2561324442400552, + "grad_norm": 0.9296875, + "learning_rate": 4.243129112671674e-05, + "loss": 0.6829, + "step": 3481 + }, + { + "epoch": 0.25620602437341916, + "grad_norm": 0.83984375, + "learning_rate": 4.242713909076512e-05, + "loss": 0.7281, + "step": 3482 + }, + { + "epoch": 0.25627960450678317, + "grad_norm": 1.0078125, + "learning_rate": 4.242298611953185e-05, + "loss": 1.1156, + "step": 3483 + }, + { + "epoch": 0.25635318464014717, + "grad_norm": 0.95703125, + "learning_rate": 4.241883221323982e-05, + "loss": 0.9548, + "step": 3484 + }, + { + "epoch": 0.2564267647735112, + "grad_norm": 0.90234375, + "learning_rate": 4.241467737211197e-05, + "loss": 1.0641, + "step": 3485 + }, + { + "epoch": 0.2565003449068751, + "grad_norm": 0.70703125, + "learning_rate": 4.241052159637128e-05, + "loss": 0.8736, + "step": 3486 + }, + { + "epoch": 0.2565739250402391, + "grad_norm": 0.796875, + "learning_rate": 4.240636488624077e-05, + "loss": 0.9449, + "step": 3487 + }, + { + "epoch": 0.25664750517360313, + "grad_norm": 0.80078125, + "learning_rate": 4.2402207241943534e-05, + "loss": 0.5731, + "step": 3488 + }, + { + "epoch": 0.25672108530696713, + "grad_norm": 0.86328125, + "learning_rate": 4.239804866370271e-05, + "loss": 0.7721, + "step": 3489 + }, + { + "epoch": 0.25679466544033114, + "grad_norm": 0.90234375, + "learning_rate": 4.239388915174147e-05, + "loss": 1.068, + "step": 3490 + }, + { + "epoch": 0.2568682455736951, + "grad_norm": 0.8515625, + "learning_rate": 4.2389728706283056e-05, + "loss": 0.6954, + "step": 3491 + }, + { + "epoch": 0.2569418257070591, + "grad_norm": 0.71484375, + "learning_rate": 4.2385567327550743e-05, + "loss": 0.7014, + "step": 3492 + }, + { + "epoch": 0.2570154058404231, + "grad_norm": 0.7734375, + "learning_rate": 4.238140501576786e-05, + "loss": 0.9295, + "step": 3493 + }, + { + "epoch": 0.2570889859737871, + "grad_norm": 0.77734375, + "learning_rate": 4.237724177115781e-05, + "loss": 0.8264, + "step": 3494 + }, + { + "epoch": 0.25716256610715105, + "grad_norm": 0.8203125, + "learning_rate": 4.237307759394401e-05, + "loss": 1.0209, + "step": 3495 + }, + { + "epoch": 0.25723614624051505, + "grad_norm": 1.0390625, + "learning_rate": 4.236891248434995e-05, + "loss": 1.5645, + "step": 3496 + }, + { + "epoch": 0.25730972637387906, + "grad_norm": 0.8046875, + "learning_rate": 4.236474644259917e-05, + "loss": 1.1092, + "step": 3497 + }, + { + "epoch": 0.25738330650724306, + "grad_norm": 0.63671875, + "learning_rate": 4.236057946891524e-05, + "loss": 0.7523, + "step": 3498 + }, + { + "epoch": 0.25745688664060706, + "grad_norm": 0.6875, + "learning_rate": 4.2356411563521794e-05, + "loss": 0.7251, + "step": 3499 + }, + { + "epoch": 0.257530466773971, + "grad_norm": 0.8671875, + "learning_rate": 4.235224272664253e-05, + "loss": 0.7612, + "step": 3500 + }, + { + "epoch": 0.257604046907335, + "grad_norm": 0.7578125, + "learning_rate": 4.234807295850117e-05, + "loss": 0.7898, + "step": 3501 + }, + { + "epoch": 0.257677627040699, + "grad_norm": 1.1796875, + "learning_rate": 4.234390225932149e-05, + "loss": 1.2333, + "step": 3502 + }, + { + "epoch": 0.257751207174063, + "grad_norm": 0.78515625, + "learning_rate": 4.2339730629327346e-05, + "loss": 0.9314, + "step": 3503 + }, + { + "epoch": 0.257824787307427, + "grad_norm": 0.98828125, + "learning_rate": 4.233555806874261e-05, + "loss": 1.0388, + "step": 3504 + }, + { + "epoch": 0.257898367440791, + "grad_norm": 0.9375, + "learning_rate": 4.233138457779121e-05, + "loss": 0.9291, + "step": 3505 + }, + { + "epoch": 0.257971947574155, + "grad_norm": 0.890625, + "learning_rate": 4.232721015669713e-05, + "loss": 0.8482, + "step": 3506 + }, + { + "epoch": 0.258045527707519, + "grad_norm": 1.171875, + "learning_rate": 4.232303480568441e-05, + "loss": 0.7691, + "step": 3507 + }, + { + "epoch": 0.258119107840883, + "grad_norm": 0.74609375, + "learning_rate": 4.2318858524977134e-05, + "loss": 0.6183, + "step": 3508 + }, + { + "epoch": 0.25819268797424694, + "grad_norm": 0.890625, + "learning_rate": 4.231468131479943e-05, + "loss": 0.8489, + "step": 3509 + }, + { + "epoch": 0.25826626810761094, + "grad_norm": 1.3203125, + "learning_rate": 4.2310503175375484e-05, + "loss": 1.2199, + "step": 3510 + }, + { + "epoch": 0.25833984824097495, + "grad_norm": 0.69140625, + "learning_rate": 4.230632410692952e-05, + "loss": 0.6541, + "step": 3511 + }, + { + "epoch": 0.25841342837433895, + "grad_norm": 2.375, + "learning_rate": 4.230214410968584e-05, + "loss": 1.0912, + "step": 3512 + }, + { + "epoch": 0.2584870085077029, + "grad_norm": 0.7109375, + "learning_rate": 4.229796318386875e-05, + "loss": 0.8423, + "step": 3513 + }, + { + "epoch": 0.2585605886410669, + "grad_norm": 0.9375, + "learning_rate": 4.229378132970266e-05, + "loss": 1.0736, + "step": 3514 + }, + { + "epoch": 0.2586341687744309, + "grad_norm": 0.8125, + "learning_rate": 4.228959854741198e-05, + "loss": 0.8272, + "step": 3515 + }, + { + "epoch": 0.2587077489077949, + "grad_norm": 0.78515625, + "learning_rate": 4.228541483722121e-05, + "loss": 0.9831, + "step": 3516 + }, + { + "epoch": 0.2587813290411589, + "grad_norm": 0.91015625, + "learning_rate": 4.228123019935487e-05, + "loss": 1.0159, + "step": 3517 + }, + { + "epoch": 0.25885490917452286, + "grad_norm": 0.79296875, + "learning_rate": 4.227704463403754e-05, + "loss": 1.1013, + "step": 3518 + }, + { + "epoch": 0.25892848930788687, + "grad_norm": 0.8203125, + "learning_rate": 4.2272858141493864e-05, + "loss": 0.8541, + "step": 3519 + }, + { + "epoch": 0.25900206944125087, + "grad_norm": 0.73046875, + "learning_rate": 4.226867072194851e-05, + "loss": 0.6593, + "step": 3520 + }, + { + "epoch": 0.2590756495746149, + "grad_norm": 0.94140625, + "learning_rate": 4.226448237562621e-05, + "loss": 0.8965, + "step": 3521 + }, + { + "epoch": 0.2591492297079788, + "grad_norm": 0.74609375, + "learning_rate": 4.226029310275176e-05, + "loss": 0.6424, + "step": 3522 + }, + { + "epoch": 0.2592228098413428, + "grad_norm": 1.125, + "learning_rate": 4.2256102903549976e-05, + "loss": 1.1124, + "step": 3523 + }, + { + "epoch": 0.25929638997470683, + "grad_norm": 0.9140625, + "learning_rate": 4.225191177824574e-05, + "loss": 1.2578, + "step": 3524 + }, + { + "epoch": 0.25936997010807084, + "grad_norm": 0.89453125, + "learning_rate": 4.224771972706398e-05, + "loss": 0.7997, + "step": 3525 + }, + { + "epoch": 0.25944355024143484, + "grad_norm": 0.7265625, + "learning_rate": 4.224352675022968e-05, + "loss": 0.7046, + "step": 3526 + }, + { + "epoch": 0.2595171303747988, + "grad_norm": 0.88671875, + "learning_rate": 4.223933284796787e-05, + "loss": 0.974, + "step": 3527 + }, + { + "epoch": 0.2595907105081628, + "grad_norm": 1.0859375, + "learning_rate": 4.223513802050363e-05, + "loss": 0.9775, + "step": 3528 + }, + { + "epoch": 0.2596642906415268, + "grad_norm": 0.84765625, + "learning_rate": 4.223094226806207e-05, + "loss": 1.0559, + "step": 3529 + }, + { + "epoch": 0.2597378707748908, + "grad_norm": 0.83203125, + "learning_rate": 4.22267455908684e-05, + "loss": 0.9653, + "step": 3530 + }, + { + "epoch": 0.25981145090825475, + "grad_norm": 0.859375, + "learning_rate": 4.222254798914782e-05, + "loss": 1.349, + "step": 3531 + }, + { + "epoch": 0.25988503104161875, + "grad_norm": 0.87890625, + "learning_rate": 4.221834946312563e-05, + "loss": 0.8148, + "step": 3532 + }, + { + "epoch": 0.25995861117498276, + "grad_norm": 0.75390625, + "learning_rate": 4.221415001302713e-05, + "loss": 0.6854, + "step": 3533 + }, + { + "epoch": 0.26003219130834676, + "grad_norm": 0.70703125, + "learning_rate": 4.220994963907772e-05, + "loss": 0.6915, + "step": 3534 + }, + { + "epoch": 0.26010577144171076, + "grad_norm": 0.9765625, + "learning_rate": 4.220574834150281e-05, + "loss": 1.1557, + "step": 3535 + }, + { + "epoch": 0.2601793515750747, + "grad_norm": 0.89453125, + "learning_rate": 4.22015461205279e-05, + "loss": 0.8733, + "step": 3536 + }, + { + "epoch": 0.2602529317084387, + "grad_norm": 0.96484375, + "learning_rate": 4.219734297637849e-05, + "loss": 1.2008, + "step": 3537 + }, + { + "epoch": 0.2603265118418027, + "grad_norm": 0.9296875, + "learning_rate": 4.2193138909280163e-05, + "loss": 1.0873, + "step": 3538 + }, + { + "epoch": 0.2604000919751667, + "grad_norm": 1.921875, + "learning_rate": 4.218893391945854e-05, + "loss": 0.6881, + "step": 3539 + }, + { + "epoch": 0.2604736721085307, + "grad_norm": 0.7265625, + "learning_rate": 4.218472800713931e-05, + "loss": 0.7883, + "step": 3540 + }, + { + "epoch": 0.2605472522418947, + "grad_norm": 0.8046875, + "learning_rate": 4.218052117254817e-05, + "loss": 0.8724, + "step": 3541 + }, + { + "epoch": 0.2606208323752587, + "grad_norm": 0.9140625, + "learning_rate": 4.217631341591092e-05, + "loss": 0.7856, + "step": 3542 + }, + { + "epoch": 0.2606944125086227, + "grad_norm": 0.80078125, + "learning_rate": 4.2172104737453365e-05, + "loss": 0.8145, + "step": 3543 + }, + { + "epoch": 0.2607679926419867, + "grad_norm": 0.84765625, + "learning_rate": 4.216789513740139e-05, + "loss": 1.2071, + "step": 3544 + }, + { + "epoch": 0.26084157277535064, + "grad_norm": 1.078125, + "learning_rate": 4.21636846159809e-05, + "loss": 1.4071, + "step": 3545 + }, + { + "epoch": 0.26091515290871464, + "grad_norm": 0.7578125, + "learning_rate": 4.2159473173417875e-05, + "loss": 0.6581, + "step": 3546 + }, + { + "epoch": 0.26098873304207865, + "grad_norm": 0.71875, + "learning_rate": 4.2155260809938335e-05, + "loss": 0.8168, + "step": 3547 + }, + { + "epoch": 0.26106231317544265, + "grad_norm": 1.234375, + "learning_rate": 4.215104752576835e-05, + "loss": 0.8958, + "step": 3548 + }, + { + "epoch": 0.2611358933088066, + "grad_norm": 0.79296875, + "learning_rate": 4.214683332113404e-05, + "loss": 0.8052, + "step": 3549 + }, + { + "epoch": 0.2612094734421706, + "grad_norm": 0.90234375, + "learning_rate": 4.214261819626156e-05, + "loss": 0.6614, + "step": 3550 + }, + { + "epoch": 0.2612830535755346, + "grad_norm": 0.890625, + "learning_rate": 4.2138402151377156e-05, + "loss": 0.9995, + "step": 3551 + }, + { + "epoch": 0.2613566337088986, + "grad_norm": 0.7265625, + "learning_rate": 4.213418518670707e-05, + "loss": 0.6394, + "step": 3552 + }, + { + "epoch": 0.2614302138422626, + "grad_norm": 0.78515625, + "learning_rate": 4.212996730247763e-05, + "loss": 0.6211, + "step": 3553 + }, + { + "epoch": 0.26150379397562656, + "grad_norm": 1.1015625, + "learning_rate": 4.212574849891519e-05, + "loss": 0.799, + "step": 3554 + }, + { + "epoch": 0.26157737410899057, + "grad_norm": 0.734375, + "learning_rate": 4.212152877624619e-05, + "loss": 0.8535, + "step": 3555 + }, + { + "epoch": 0.26165095424235457, + "grad_norm": 0.91796875, + "learning_rate": 4.211730813469706e-05, + "loss": 1.134, + "step": 3556 + }, + { + "epoch": 0.2617245343757186, + "grad_norm": 0.8828125, + "learning_rate": 4.211308657449434e-05, + "loss": 1.0679, + "step": 3557 + }, + { + "epoch": 0.2617981145090825, + "grad_norm": 0.65234375, + "learning_rate": 4.2108864095864595e-05, + "loss": 0.6499, + "step": 3558 + }, + { + "epoch": 0.26187169464244653, + "grad_norm": 0.69921875, + "learning_rate": 4.210464069903442e-05, + "loss": 0.9486, + "step": 3559 + }, + { + "epoch": 0.26194527477581053, + "grad_norm": 0.72265625, + "learning_rate": 4.210041638423049e-05, + "loss": 0.9701, + "step": 3560 + }, + { + "epoch": 0.26201885490917454, + "grad_norm": 0.9140625, + "learning_rate": 4.20961911516795e-05, + "loss": 1.1911, + "step": 3561 + }, + { + "epoch": 0.26209243504253854, + "grad_norm": 0.84765625, + "learning_rate": 4.209196500160823e-05, + "loss": 0.8494, + "step": 3562 + }, + { + "epoch": 0.2621660151759025, + "grad_norm": 0.9921875, + "learning_rate": 4.2087737934243475e-05, + "loss": 0.8744, + "step": 3563 + }, + { + "epoch": 0.2622395953092665, + "grad_norm": 0.87890625, + "learning_rate": 4.2083509949812104e-05, + "loss": 0.8249, + "step": 3564 + }, + { + "epoch": 0.2623131754426305, + "grad_norm": 0.828125, + "learning_rate": 4.2079281048541016e-05, + "loss": 0.9578, + "step": 3565 + }, + { + "epoch": 0.2623867555759945, + "grad_norm": 0.8671875, + "learning_rate": 4.207505123065717e-05, + "loss": 1.1456, + "step": 3566 + }, + { + "epoch": 0.26246033570935845, + "grad_norm": 0.953125, + "learning_rate": 4.2070820496387584e-05, + "loss": 0.8844, + "step": 3567 + }, + { + "epoch": 0.26253391584272245, + "grad_norm": 0.8359375, + "learning_rate": 4.20665888459593e-05, + "loss": 1.0938, + "step": 3568 + }, + { + "epoch": 0.26260749597608646, + "grad_norm": 0.75390625, + "learning_rate": 4.206235627959942e-05, + "loss": 0.656, + "step": 3569 + }, + { + "epoch": 0.26268107610945046, + "grad_norm": 0.72265625, + "learning_rate": 4.2058122797535117e-05, + "loss": 0.7738, + "step": 3570 + }, + { + "epoch": 0.26275465624281447, + "grad_norm": 0.79296875, + "learning_rate": 4.205388839999358e-05, + "loss": 1.1333, + "step": 3571 + }, + { + "epoch": 0.2628282363761784, + "grad_norm": 1.0703125, + "learning_rate": 4.2049653087202045e-05, + "loss": 1.2255, + "step": 3572 + }, + { + "epoch": 0.2629018165095424, + "grad_norm": 0.98828125, + "learning_rate": 4.2045416859387844e-05, + "loss": 1.0472, + "step": 3573 + }, + { + "epoch": 0.2629753966429064, + "grad_norm": 0.91015625, + "learning_rate": 4.2041179716778324e-05, + "loss": 1.0815, + "step": 3574 + }, + { + "epoch": 0.2630489767762704, + "grad_norm": 0.80078125, + "learning_rate": 4.2036941659600856e-05, + "loss": 0.9453, + "step": 3575 + }, + { + "epoch": 0.2631225569096344, + "grad_norm": 0.7421875, + "learning_rate": 4.203270268808292e-05, + "loss": 0.9314, + "step": 3576 + }, + { + "epoch": 0.2631961370429984, + "grad_norm": 0.87890625, + "learning_rate": 4.202846280245199e-05, + "loss": 0.8123, + "step": 3577 + }, + { + "epoch": 0.2632697171763624, + "grad_norm": 0.96875, + "learning_rate": 4.2024222002935644e-05, + "loss": 1.0372, + "step": 3578 + }, + { + "epoch": 0.2633432973097264, + "grad_norm": 1.203125, + "learning_rate": 4.201998028976144e-05, + "loss": 1.0443, + "step": 3579 + }, + { + "epoch": 0.2634168774430904, + "grad_norm": 0.7109375, + "learning_rate": 4.201573766315704e-05, + "loss": 0.843, + "step": 3580 + }, + { + "epoch": 0.26349045757645434, + "grad_norm": 0.80859375, + "learning_rate": 4.201149412335015e-05, + "loss": 1.1496, + "step": 3581 + }, + { + "epoch": 0.26356403770981834, + "grad_norm": 0.73828125, + "learning_rate": 4.200724967056848e-05, + "loss": 0.6851, + "step": 3582 + }, + { + "epoch": 0.26363761784318235, + "grad_norm": 0.9453125, + "learning_rate": 4.200300430503986e-05, + "loss": 1.3379, + "step": 3583 + }, + { + "epoch": 0.26371119797654635, + "grad_norm": 1.1171875, + "learning_rate": 4.199875802699211e-05, + "loss": 1.044, + "step": 3584 + }, + { + "epoch": 0.2637847781099103, + "grad_norm": 0.92578125, + "learning_rate": 4.199451083665312e-05, + "loss": 1.3473, + "step": 3585 + }, + { + "epoch": 0.2638583582432743, + "grad_norm": 1.0078125, + "learning_rate": 4.199026273425083e-05, + "loss": 1.007, + "step": 3586 + }, + { + "epoch": 0.2639319383766383, + "grad_norm": 0.84765625, + "learning_rate": 4.198601372001323e-05, + "loss": 0.936, + "step": 3587 + }, + { + "epoch": 0.2640055185100023, + "grad_norm": 0.9609375, + "learning_rate": 4.198176379416836e-05, + "loss": 0.9573, + "step": 3588 + }, + { + "epoch": 0.2640790986433663, + "grad_norm": 0.6484375, + "learning_rate": 4.19775129569443e-05, + "loss": 0.6779, + "step": 3589 + }, + { + "epoch": 0.26415267877673027, + "grad_norm": 0.9140625, + "learning_rate": 4.197326120856918e-05, + "loss": 0.9555, + "step": 3590 + }, + { + "epoch": 0.26422625891009427, + "grad_norm": 0.65625, + "learning_rate": 4.196900854927119e-05, + "loss": 0.6246, + "step": 3591 + }, + { + "epoch": 0.2642998390434583, + "grad_norm": 0.71875, + "learning_rate": 4.1964754979278564e-05, + "loss": 0.7344, + "step": 3592 + }, + { + "epoch": 0.2643734191768223, + "grad_norm": 0.98046875, + "learning_rate": 4.1960500498819586e-05, + "loss": 0.6947, + "step": 3593 + }, + { + "epoch": 0.2644469993101862, + "grad_norm": 0.84375, + "learning_rate": 4.195624510812257e-05, + "loss": 0.9936, + "step": 3594 + }, + { + "epoch": 0.26452057944355023, + "grad_norm": 0.73828125, + "learning_rate": 4.195198880741591e-05, + "loss": 0.8097, + "step": 3595 + }, + { + "epoch": 0.26459415957691423, + "grad_norm": 0.71875, + "learning_rate": 4.194773159692803e-05, + "loss": 0.9053, + "step": 3596 + }, + { + "epoch": 0.26466773971027824, + "grad_norm": 0.87890625, + "learning_rate": 4.19434734768874e-05, + "loss": 1.2433, + "step": 3597 + }, + { + "epoch": 0.26474131984364224, + "grad_norm": 0.953125, + "learning_rate": 4.193921444752255e-05, + "loss": 1.0103, + "step": 3598 + }, + { + "epoch": 0.2648148999770062, + "grad_norm": 0.75390625, + "learning_rate": 4.1934954509062056e-05, + "loss": 0.9681, + "step": 3599 + }, + { + "epoch": 0.2648884801103702, + "grad_norm": 0.8203125, + "learning_rate": 4.1930693661734545e-05, + "loss": 0.8749, + "step": 3600 + }, + { + "epoch": 0.2649620602437342, + "grad_norm": 0.87890625, + "learning_rate": 4.1926431905768675e-05, + "loss": 1.3861, + "step": 3601 + }, + { + "epoch": 0.2650356403770982, + "grad_norm": 0.796875, + "learning_rate": 4.1922169241393174e-05, + "loss": 1.0495, + "step": 3602 + }, + { + "epoch": 0.26510922051046215, + "grad_norm": 0.73828125, + "learning_rate": 4.191790566883681e-05, + "loss": 0.8725, + "step": 3603 + }, + { + "epoch": 0.26518280064382616, + "grad_norm": 1.09375, + "learning_rate": 4.1913641188328415e-05, + "loss": 1.3962, + "step": 3604 + }, + { + "epoch": 0.26525638077719016, + "grad_norm": 0.98046875, + "learning_rate": 4.1909375800096826e-05, + "loss": 0.8629, + "step": 3605 + }, + { + "epoch": 0.26532996091055416, + "grad_norm": 0.87109375, + "learning_rate": 4.190510950437099e-05, + "loss": 0.7154, + "step": 3606 + }, + { + "epoch": 0.26540354104391817, + "grad_norm": 0.74609375, + "learning_rate": 4.190084230137984e-05, + "loss": 0.84, + "step": 3607 + }, + { + "epoch": 0.2654771211772821, + "grad_norm": 1.0390625, + "learning_rate": 4.1896574191352424e-05, + "loss": 1.0756, + "step": 3608 + }, + { + "epoch": 0.2655507013106461, + "grad_norm": 0.86328125, + "learning_rate": 4.189230517451777e-05, + "loss": 0.8495, + "step": 3609 + }, + { + "epoch": 0.2656242814440101, + "grad_norm": 0.91015625, + "learning_rate": 4.188803525110501e-05, + "loss": 1.0097, + "step": 3610 + }, + { + "epoch": 0.26569786157737413, + "grad_norm": 0.88671875, + "learning_rate": 4.188376442134329e-05, + "loss": 0.9431, + "step": 3611 + }, + { + "epoch": 0.2657714417107381, + "grad_norm": 0.64453125, + "learning_rate": 4.187949268546182e-05, + "loss": 0.7255, + "step": 3612 + }, + { + "epoch": 0.2658450218441021, + "grad_norm": 0.69921875, + "learning_rate": 4.187522004368986e-05, + "loss": 0.6103, + "step": 3613 + }, + { + "epoch": 0.2659186019774661, + "grad_norm": 0.9765625, + "learning_rate": 4.1870946496256714e-05, + "loss": 0.8542, + "step": 3614 + }, + { + "epoch": 0.2659921821108301, + "grad_norm": 0.9765625, + "learning_rate": 4.186667204339174e-05, + "loss": 1.4835, + "step": 3615 + }, + { + "epoch": 0.2660657622441941, + "grad_norm": 0.99609375, + "learning_rate": 4.1862396685324325e-05, + "loss": 0.961, + "step": 3616 + }, + { + "epoch": 0.26613934237755804, + "grad_norm": 0.828125, + "learning_rate": 4.185812042228393e-05, + "loss": 0.8702, + "step": 3617 + }, + { + "epoch": 0.26621292251092205, + "grad_norm": 1.1484375, + "learning_rate": 4.185384325450006e-05, + "loss": 1.7418, + "step": 3618 + }, + { + "epoch": 0.26628650264428605, + "grad_norm": 0.84765625, + "learning_rate": 4.1849565182202246e-05, + "loss": 0.9595, + "step": 3619 + }, + { + "epoch": 0.26636008277765005, + "grad_norm": 0.8125, + "learning_rate": 4.18452862056201e-05, + "loss": 0.7958, + "step": 3620 + }, + { + "epoch": 0.266433662911014, + "grad_norm": 0.8984375, + "learning_rate": 4.1841006324983256e-05, + "loss": 0.9107, + "step": 3621 + }, + { + "epoch": 0.266507243044378, + "grad_norm": 0.91796875, + "learning_rate": 4.183672554052142e-05, + "loss": 0.8926, + "step": 3622 + }, + { + "epoch": 0.266580823177742, + "grad_norm": 1.109375, + "learning_rate": 4.1832443852464306e-05, + "loss": 1.1984, + "step": 3623 + }, + { + "epoch": 0.266654403311106, + "grad_norm": 0.87109375, + "learning_rate": 4.182816126104173e-05, + "loss": 1.0964, + "step": 3624 + }, + { + "epoch": 0.26672798344447, + "grad_norm": 1.0390625, + "learning_rate": 4.1823877766483524e-05, + "loss": 1.3117, + "step": 3625 + }, + { + "epoch": 0.26680156357783397, + "grad_norm": 1.0078125, + "learning_rate": 4.181959336901958e-05, + "loss": 1.7506, + "step": 3626 + }, + { + "epoch": 0.26687514371119797, + "grad_norm": 0.80078125, + "learning_rate": 4.181530806887982e-05, + "loss": 0.7347, + "step": 3627 + }, + { + "epoch": 0.266948723844562, + "grad_norm": 1.125, + "learning_rate": 4.181102186629424e-05, + "loss": 0.8751, + "step": 3628 + }, + { + "epoch": 0.267022303977926, + "grad_norm": 1.0, + "learning_rate": 4.1806734761492866e-05, + "loss": 1.3935, + "step": 3629 + }, + { + "epoch": 0.2670958841112899, + "grad_norm": 1.0390625, + "learning_rate": 4.1802446754705784e-05, + "loss": 1.4557, + "step": 3630 + }, + { + "epoch": 0.26716946424465393, + "grad_norm": 0.83203125, + "learning_rate": 4.179815784616311e-05, + "loss": 0.8388, + "step": 3631 + }, + { + "epoch": 0.26724304437801794, + "grad_norm": 0.76953125, + "learning_rate": 4.179386803609504e-05, + "loss": 0.806, + "step": 3632 + }, + { + "epoch": 0.26731662451138194, + "grad_norm": 0.82421875, + "learning_rate": 4.1789577324731786e-05, + "loss": 0.8701, + "step": 3633 + }, + { + "epoch": 0.26739020464474594, + "grad_norm": 1.1640625, + "learning_rate": 4.1785285712303636e-05, + "loss": 1.1736, + "step": 3634 + }, + { + "epoch": 0.2674637847781099, + "grad_norm": 0.84375, + "learning_rate": 4.178099319904091e-05, + "loss": 0.6696, + "step": 3635 + }, + { + "epoch": 0.2675373649114739, + "grad_norm": 1.0625, + "learning_rate": 4.1776699785173955e-05, + "loss": 1.1624, + "step": 3636 + }, + { + "epoch": 0.2676109450448379, + "grad_norm": 0.87890625, + "learning_rate": 4.177240547093322e-05, + "loss": 1.2793, + "step": 3637 + }, + { + "epoch": 0.2676845251782019, + "grad_norm": 1.0, + "learning_rate": 4.176811025654917e-05, + "loss": 1.1772, + "step": 3638 + }, + { + "epoch": 0.26775810531156585, + "grad_norm": 1.0859375, + "learning_rate": 4.176381414225229e-05, + "loss": 0.9055, + "step": 3639 + }, + { + "epoch": 0.26783168544492986, + "grad_norm": 0.90234375, + "learning_rate": 4.1759517128273184e-05, + "loss": 0.8579, + "step": 3640 + }, + { + "epoch": 0.26790526557829386, + "grad_norm": 0.77734375, + "learning_rate": 4.175521921484245e-05, + "loss": 0.6146, + "step": 3641 + }, + { + "epoch": 0.26797884571165786, + "grad_norm": 0.6953125, + "learning_rate": 4.175092040219074e-05, + "loss": 0.6374, + "step": 3642 + }, + { + "epoch": 0.26805242584502187, + "grad_norm": 1.0234375, + "learning_rate": 4.174662069054877e-05, + "loss": 1.7015, + "step": 3643 + }, + { + "epoch": 0.2681260059783858, + "grad_norm": 0.7578125, + "learning_rate": 4.1742320080147304e-05, + "loss": 0.7913, + "step": 3644 + }, + { + "epoch": 0.2681995861117498, + "grad_norm": 0.96875, + "learning_rate": 4.173801857121713e-05, + "loss": 0.8472, + "step": 3645 + }, + { + "epoch": 0.2682731662451138, + "grad_norm": 0.75390625, + "learning_rate": 4.173371616398912e-05, + "loss": 0.9246, + "step": 3646 + }, + { + "epoch": 0.26834674637847783, + "grad_norm": 0.984375, + "learning_rate": 4.172941285869417e-05, + "loss": 1.0593, + "step": 3647 + }, + { + "epoch": 0.2684203265118418, + "grad_norm": 0.7265625, + "learning_rate": 4.172510865556323e-05, + "loss": 0.7386, + "step": 3648 + }, + { + "epoch": 0.2684939066452058, + "grad_norm": 0.87890625, + "learning_rate": 4.172080355482729e-05, + "loss": 1.0615, + "step": 3649 + }, + { + "epoch": 0.2685674867785698, + "grad_norm": 0.7265625, + "learning_rate": 4.171649755671741e-05, + "loss": 0.6225, + "step": 3650 + }, + { + "epoch": 0.2686410669119338, + "grad_norm": 0.81640625, + "learning_rate": 4.171219066146468e-05, + "loss": 0.8728, + "step": 3651 + }, + { + "epoch": 0.2687146470452978, + "grad_norm": 0.90234375, + "learning_rate": 4.170788286930024e-05, + "loss": 0.8478, + "step": 3652 + }, + { + "epoch": 0.26878822717866174, + "grad_norm": 0.921875, + "learning_rate": 4.170357418045529e-05, + "loss": 0.8094, + "step": 3653 + }, + { + "epoch": 0.26886180731202575, + "grad_norm": 0.86328125, + "learning_rate": 4.169926459516105e-05, + "loss": 0.8884, + "step": 3654 + }, + { + "epoch": 0.26893538744538975, + "grad_norm": 0.80859375, + "learning_rate": 4.1694954113648823e-05, + "loss": 0.6956, + "step": 3655 + }, + { + "epoch": 0.26900896757875375, + "grad_norm": 0.66015625, + "learning_rate": 4.169064273614995e-05, + "loss": 0.5339, + "step": 3656 + }, + { + "epoch": 0.2690825477121177, + "grad_norm": 1.1015625, + "learning_rate": 4.16863304628958e-05, + "loss": 1.36, + "step": 3657 + }, + { + "epoch": 0.2691561278454817, + "grad_norm": 0.89453125, + "learning_rate": 4.16820172941178e-05, + "loss": 1.2954, + "step": 3658 + }, + { + "epoch": 0.2692297079788457, + "grad_norm": 0.84375, + "learning_rate": 4.167770323004746e-05, + "loss": 1.2144, + "step": 3659 + }, + { + "epoch": 0.2693032881122097, + "grad_norm": 0.9609375, + "learning_rate": 4.167338827091627e-05, + "loss": 1.0776, + "step": 3660 + }, + { + "epoch": 0.2693768682455737, + "grad_norm": 0.8984375, + "learning_rate": 4.166907241695583e-05, + "loss": 1.0608, + "step": 3661 + }, + { + "epoch": 0.26945044837893767, + "grad_norm": 0.82421875, + "learning_rate": 4.1664755668397756e-05, + "loss": 1.0712, + "step": 3662 + }, + { + "epoch": 0.26952402851230167, + "grad_norm": 0.93359375, + "learning_rate": 4.1660438025473725e-05, + "loss": 0.9114, + "step": 3663 + }, + { + "epoch": 0.2695976086456657, + "grad_norm": 0.9765625, + "learning_rate": 4.165611948841545e-05, + "loss": 0.8202, + "step": 3664 + }, + { + "epoch": 0.2696711887790297, + "grad_norm": 0.859375, + "learning_rate": 4.165180005745469e-05, + "loss": 0.7816, + "step": 3665 + }, + { + "epoch": 0.26974476891239363, + "grad_norm": 0.92578125, + "learning_rate": 4.164747973282328e-05, + "loss": 1.0676, + "step": 3666 + }, + { + "epoch": 0.26981834904575763, + "grad_norm": 0.91796875, + "learning_rate": 4.1643158514753076e-05, + "loss": 1.0146, + "step": 3667 + }, + { + "epoch": 0.26989192917912164, + "grad_norm": 0.8046875, + "learning_rate": 4.163883640347599e-05, + "loss": 0.9597, + "step": 3668 + }, + { + "epoch": 0.26996550931248564, + "grad_norm": 0.95703125, + "learning_rate": 4.163451339922399e-05, + "loss": 1.0143, + "step": 3669 + }, + { + "epoch": 0.27003908944584964, + "grad_norm": 1.0703125, + "learning_rate": 4.163018950222905e-05, + "loss": 1.0431, + "step": 3670 + }, + { + "epoch": 0.2701126695792136, + "grad_norm": 0.78125, + "learning_rate": 4.162586471272327e-05, + "loss": 0.7312, + "step": 3671 + }, + { + "epoch": 0.2701862497125776, + "grad_norm": 0.80859375, + "learning_rate": 4.162153903093873e-05, + "loss": 1.2639, + "step": 3672 + }, + { + "epoch": 0.2702598298459416, + "grad_norm": 0.78515625, + "learning_rate": 4.1617212457107576e-05, + "loss": 0.7579, + "step": 3673 + }, + { + "epoch": 0.2703334099793056, + "grad_norm": 0.9140625, + "learning_rate": 4.161288499146202e-05, + "loss": 0.9605, + "step": 3674 + }, + { + "epoch": 0.27040699011266955, + "grad_norm": 0.859375, + "learning_rate": 4.1608556634234296e-05, + "loss": 0.8284, + "step": 3675 + }, + { + "epoch": 0.27048057024603356, + "grad_norm": 1.03125, + "learning_rate": 4.1604227385656714e-05, + "loss": 0.9931, + "step": 3676 + }, + { + "epoch": 0.27055415037939756, + "grad_norm": 0.98046875, + "learning_rate": 4.15998972459616e-05, + "loss": 0.9074, + "step": 3677 + }, + { + "epoch": 0.27062773051276157, + "grad_norm": 0.73046875, + "learning_rate": 4.159556621538137e-05, + "loss": 0.9218, + "step": 3678 + }, + { + "epoch": 0.27070131064612557, + "grad_norm": 0.8203125, + "learning_rate": 4.159123429414844e-05, + "loss": 0.8277, + "step": 3679 + }, + { + "epoch": 0.2707748907794895, + "grad_norm": 0.74609375, + "learning_rate": 4.158690148249529e-05, + "loss": 0.5017, + "step": 3680 + }, + { + "epoch": 0.2708484709128535, + "grad_norm": 0.890625, + "learning_rate": 4.158256778065448e-05, + "loss": 1.0813, + "step": 3681 + }, + { + "epoch": 0.2709220510462175, + "grad_norm": 0.9609375, + "learning_rate": 4.157823318885857e-05, + "loss": 1.0314, + "step": 3682 + }, + { + "epoch": 0.27099563117958153, + "grad_norm": 0.83203125, + "learning_rate": 4.15738977073402e-05, + "loss": 0.676, + "step": 3683 + }, + { + "epoch": 0.2710692113129455, + "grad_norm": 0.8125, + "learning_rate": 4.156956133633204e-05, + "loss": 0.9024, + "step": 3684 + }, + { + "epoch": 0.2711427914463095, + "grad_norm": 0.890625, + "learning_rate": 4.1565224076066824e-05, + "loss": 1.1291, + "step": 3685 + }, + { + "epoch": 0.2712163715796735, + "grad_norm": 0.98046875, + "learning_rate": 4.1560885926777314e-05, + "loss": 0.7727, + "step": 3686 + }, + { + "epoch": 0.2712899517130375, + "grad_norm": 0.93359375, + "learning_rate": 4.1556546888696336e-05, + "loss": 1.0155, + "step": 3687 + }, + { + "epoch": 0.2713635318464015, + "grad_norm": 1.0546875, + "learning_rate": 4.1552206962056764e-05, + "loss": 1.2204, + "step": 3688 + }, + { + "epoch": 0.27143711197976544, + "grad_norm": 0.765625, + "learning_rate": 4.1547866147091496e-05, + "loss": 0.7764, + "step": 3689 + }, + { + "epoch": 0.27151069211312945, + "grad_norm": 0.73828125, + "learning_rate": 4.154352444403351e-05, + "loss": 0.7791, + "step": 3690 + }, + { + "epoch": 0.27158427224649345, + "grad_norm": 0.76171875, + "learning_rate": 4.1539181853115814e-05, + "loss": 0.8875, + "step": 3691 + }, + { + "epoch": 0.27165785237985746, + "grad_norm": 0.875, + "learning_rate": 4.153483837457147e-05, + "loss": 1.0873, + "step": 3692 + }, + { + "epoch": 0.2717314325132214, + "grad_norm": 1.0, + "learning_rate": 4.153049400863358e-05, + "loss": 1.1457, + "step": 3693 + }, + { + "epoch": 0.2718050126465854, + "grad_norm": 0.9140625, + "learning_rate": 4.1526148755535296e-05, + "loss": 0.8534, + "step": 3694 + }, + { + "epoch": 0.2718785927799494, + "grad_norm": 0.8203125, + "learning_rate": 4.152180261550982e-05, + "loss": 0.8841, + "step": 3695 + }, + { + "epoch": 0.2719521729133134, + "grad_norm": 0.7109375, + "learning_rate": 4.1517455588790397e-05, + "loss": 0.9285, + "step": 3696 + }, + { + "epoch": 0.2720257530466774, + "grad_norm": 0.875, + "learning_rate": 4.151310767561034e-05, + "loss": 0.7892, + "step": 3697 + }, + { + "epoch": 0.27209933318004137, + "grad_norm": 0.87890625, + "learning_rate": 4.150875887620298e-05, + "loss": 0.9136, + "step": 3698 + }, + { + "epoch": 0.2721729133134054, + "grad_norm": 1.1875, + "learning_rate": 4.150440919080172e-05, + "loss": 1.0957, + "step": 3699 + }, + { + "epoch": 0.2722464934467694, + "grad_norm": 0.75, + "learning_rate": 4.1500058619639984e-05, + "loss": 0.8967, + "step": 3700 + }, + { + "epoch": 0.2723200735801334, + "grad_norm": 0.80078125, + "learning_rate": 4.149570716295126e-05, + "loss": 1.0409, + "step": 3701 + }, + { + "epoch": 0.27239365371349733, + "grad_norm": 0.98828125, + "learning_rate": 4.1491354820969094e-05, + "loss": 1.4121, + "step": 3702 + }, + { + "epoch": 0.27246723384686133, + "grad_norm": 0.953125, + "learning_rate": 4.1487001593927055e-05, + "loss": 0.7186, + "step": 3703 + }, + { + "epoch": 0.27254081398022534, + "grad_norm": 0.859375, + "learning_rate": 4.1482647482058787e-05, + "loss": 1.3317, + "step": 3704 + }, + { + "epoch": 0.27261439411358934, + "grad_norm": 0.8203125, + "learning_rate": 4.147829248559796e-05, + "loss": 0.6391, + "step": 3705 + }, + { + "epoch": 0.27268797424695335, + "grad_norm": 1.109375, + "learning_rate": 4.147393660477831e-05, + "loss": 1.6144, + "step": 3706 + }, + { + "epoch": 0.2727615543803173, + "grad_norm": 0.8203125, + "learning_rate": 4.1469579839833577e-05, + "loss": 1.0032, + "step": 3707 + }, + { + "epoch": 0.2728351345136813, + "grad_norm": 0.9375, + "learning_rate": 4.146522219099761e-05, + "loss": 0.8316, + "step": 3708 + }, + { + "epoch": 0.2729087146470453, + "grad_norm": 1.0546875, + "learning_rate": 4.146086365850427e-05, + "loss": 1.172, + "step": 3709 + }, + { + "epoch": 0.2729822947804093, + "grad_norm": 0.9921875, + "learning_rate": 4.1456504242587454e-05, + "loss": 1.3697, + "step": 3710 + }, + { + "epoch": 0.27305587491377326, + "grad_norm": 0.7734375, + "learning_rate": 4.145214394348115e-05, + "loss": 1.021, + "step": 3711 + }, + { + "epoch": 0.27312945504713726, + "grad_norm": 0.90625, + "learning_rate": 4.1447782761419354e-05, + "loss": 1.0258, + "step": 3712 + }, + { + "epoch": 0.27320303518050126, + "grad_norm": 0.796875, + "learning_rate": 4.144342069663611e-05, + "loss": 0.861, + "step": 3713 + }, + { + "epoch": 0.27327661531386527, + "grad_norm": 0.75390625, + "learning_rate": 4.143905774936555e-05, + "loss": 0.8782, + "step": 3714 + }, + { + "epoch": 0.27335019544722927, + "grad_norm": 0.921875, + "learning_rate": 4.1434693919841805e-05, + "loss": 1.148, + "step": 3715 + }, + { + "epoch": 0.2734237755805932, + "grad_norm": 0.84765625, + "learning_rate": 4.143032920829907e-05, + "loss": 1.0507, + "step": 3716 + }, + { + "epoch": 0.2734973557139572, + "grad_norm": 0.8984375, + "learning_rate": 4.142596361497161e-05, + "loss": 0.9985, + "step": 3717 + }, + { + "epoch": 0.27357093584732123, + "grad_norm": 0.9609375, + "learning_rate": 4.14215971400937e-05, + "loss": 1.3125, + "step": 3718 + }, + { + "epoch": 0.27364451598068523, + "grad_norm": 0.921875, + "learning_rate": 4.141722978389969e-05, + "loss": 1.0741, + "step": 3719 + }, + { + "epoch": 0.2737180961140492, + "grad_norm": 0.98828125, + "learning_rate": 4.141286154662397e-05, + "loss": 1.418, + "step": 3720 + }, + { + "epoch": 0.2737916762474132, + "grad_norm": 0.8203125, + "learning_rate": 4.140849242850096e-05, + "loss": 0.7892, + "step": 3721 + }, + { + "epoch": 0.2738652563807772, + "grad_norm": 0.76171875, + "learning_rate": 4.140412242976516e-05, + "loss": 1.0057, + "step": 3722 + }, + { + "epoch": 0.2739388365141412, + "grad_norm": 0.83203125, + "learning_rate": 4.1399751550651087e-05, + "loss": 0.7086, + "step": 3723 + }, + { + "epoch": 0.2740124166475052, + "grad_norm": 0.8203125, + "learning_rate": 4.139537979139333e-05, + "loss": 0.9284, + "step": 3724 + }, + { + "epoch": 0.27408599678086915, + "grad_norm": 1.1015625, + "learning_rate": 4.1391007152226494e-05, + "loss": 0.8726, + "step": 3725 + }, + { + "epoch": 0.27415957691423315, + "grad_norm": 0.77734375, + "learning_rate": 4.1386633633385276e-05, + "loss": 0.9388, + "step": 3726 + }, + { + "epoch": 0.27423315704759715, + "grad_norm": 0.90625, + "learning_rate": 4.138225923510437e-05, + "loss": 1.3097, + "step": 3727 + }, + { + "epoch": 0.27430673718096116, + "grad_norm": 0.76953125, + "learning_rate": 4.1377883957618556e-05, + "loss": 0.6689, + "step": 3728 + }, + { + "epoch": 0.2743803173143251, + "grad_norm": 0.91015625, + "learning_rate": 4.137350780116265e-05, + "loss": 1.0086, + "step": 3729 + }, + { + "epoch": 0.2744538974476891, + "grad_norm": 1.0078125, + "learning_rate": 4.13691307659715e-05, + "loss": 1.1734, + "step": 3730 + }, + { + "epoch": 0.2745274775810531, + "grad_norm": 0.79296875, + "learning_rate": 4.136475285228002e-05, + "loss": 0.7806, + "step": 3731 + }, + { + "epoch": 0.2746010577144171, + "grad_norm": 0.734375, + "learning_rate": 4.136037406032316e-05, + "loss": 0.6985, + "step": 3732 + }, + { + "epoch": 0.2746746378477811, + "grad_norm": 0.87109375, + "learning_rate": 4.135599439033593e-05, + "loss": 1.0879, + "step": 3733 + }, + { + "epoch": 0.27474821798114507, + "grad_norm": 0.87890625, + "learning_rate": 4.135161384255337e-05, + "loss": 0.9441, + "step": 3734 + }, + { + "epoch": 0.2748217981145091, + "grad_norm": 1.03125, + "learning_rate": 4.1347232417210575e-05, + "loss": 1.4543, + "step": 3735 + }, + { + "epoch": 0.2748953782478731, + "grad_norm": 0.86328125, + "learning_rate": 4.13428501145427e-05, + "loss": 0.8849, + "step": 3736 + }, + { + "epoch": 0.2749689583812371, + "grad_norm": 0.78515625, + "learning_rate": 4.133846693478492e-05, + "loss": 0.9882, + "step": 3737 + }, + { + "epoch": 0.27504253851460103, + "grad_norm": 0.8125, + "learning_rate": 4.133408287817248e-05, + "loss": 0.9064, + "step": 3738 + }, + { + "epoch": 0.27511611864796504, + "grad_norm": 0.94921875, + "learning_rate": 4.132969794494066e-05, + "loss": 0.6894, + "step": 3739 + }, + { + "epoch": 0.27518969878132904, + "grad_norm": 0.91796875, + "learning_rate": 4.13253121353248e-05, + "loss": 0.8499, + "step": 3740 + }, + { + "epoch": 0.27526327891469304, + "grad_norm": 0.9140625, + "learning_rate": 4.1320925449560265e-05, + "loss": 0.7634, + "step": 3741 + }, + { + "epoch": 0.27533685904805705, + "grad_norm": 0.77734375, + "learning_rate": 4.131653788788249e-05, + "loss": 1.2907, + "step": 3742 + }, + { + "epoch": 0.275410439181421, + "grad_norm": 0.8984375, + "learning_rate": 4.1312149450526946e-05, + "loss": 1.1168, + "step": 3743 + }, + { + "epoch": 0.275484019314785, + "grad_norm": 0.97265625, + "learning_rate": 4.130776013772916e-05, + "loss": 1.072, + "step": 3744 + }, + { + "epoch": 0.275557599448149, + "grad_norm": 0.890625, + "learning_rate": 4.130336994972467e-05, + "loss": 1.2104, + "step": 3745 + }, + { + "epoch": 0.275631179581513, + "grad_norm": 0.9453125, + "learning_rate": 4.1298978886749115e-05, + "loss": 0.7803, + "step": 3746 + }, + { + "epoch": 0.27570475971487696, + "grad_norm": 1.0546875, + "learning_rate": 4.129458694903815e-05, + "loss": 1.5481, + "step": 3747 + }, + { + "epoch": 0.27577833984824096, + "grad_norm": 0.7734375, + "learning_rate": 4.1290194136827476e-05, + "loss": 0.9913, + "step": 3748 + }, + { + "epoch": 0.27585191998160496, + "grad_norm": 0.76953125, + "learning_rate": 4.128580045035285e-05, + "loss": 1.2736, + "step": 3749 + }, + { + "epoch": 0.27592550011496897, + "grad_norm": 0.85546875, + "learning_rate": 4.128140588985008e-05, + "loss": 1.0355, + "step": 3750 + }, + { + "epoch": 0.275999080248333, + "grad_norm": 1.421875, + "learning_rate": 4.1277010455555e-05, + "loss": 1.2461, + "step": 3751 + }, + { + "epoch": 0.2760726603816969, + "grad_norm": 0.9296875, + "learning_rate": 4.127261414770351e-05, + "loss": 0.7185, + "step": 3752 + }, + { + "epoch": 0.2761462405150609, + "grad_norm": 0.8125, + "learning_rate": 4.126821696653156e-05, + "loss": 0.9304, + "step": 3753 + }, + { + "epoch": 0.27621982064842493, + "grad_norm": 0.8515625, + "learning_rate": 4.126381891227513e-05, + "loss": 0.9105, + "step": 3754 + }, + { + "epoch": 0.27629340078178893, + "grad_norm": 0.96484375, + "learning_rate": 4.125941998517026e-05, + "loss": 0.9678, + "step": 3755 + }, + { + "epoch": 0.2763669809151529, + "grad_norm": 0.94140625, + "learning_rate": 4.125502018545302e-05, + "loss": 1.1382, + "step": 3756 + }, + { + "epoch": 0.2764405610485169, + "grad_norm": 0.87890625, + "learning_rate": 4.125061951335956e-05, + "loss": 0.811, + "step": 3757 + }, + { + "epoch": 0.2765141411818809, + "grad_norm": 0.88671875, + "learning_rate": 4.1246217969126036e-05, + "loss": 1.2773, + "step": 3758 + }, + { + "epoch": 0.2765877213152449, + "grad_norm": 1.0390625, + "learning_rate": 4.1241815552988675e-05, + "loss": 1.1922, + "step": 3759 + }, + { + "epoch": 0.2766613014486089, + "grad_norm": 0.80078125, + "learning_rate": 4.123741226518375e-05, + "loss": 0.8484, + "step": 3760 + }, + { + "epoch": 0.27673488158197285, + "grad_norm": 0.8671875, + "learning_rate": 4.1233008105947584e-05, + "loss": 1.1286, + "step": 3761 + }, + { + "epoch": 0.27680846171533685, + "grad_norm": 0.76953125, + "learning_rate": 4.1228603075516526e-05, + "loss": 0.9195, + "step": 3762 + }, + { + "epoch": 0.27688204184870086, + "grad_norm": 1.2109375, + "learning_rate": 4.1224197174127e-05, + "loss": 0.9709, + "step": 3763 + }, + { + "epoch": 0.27695562198206486, + "grad_norm": 0.9609375, + "learning_rate": 4.1219790402015444e-05, + "loss": 1.0277, + "step": 3764 + }, + { + "epoch": 0.2770292021154288, + "grad_norm": 0.890625, + "learning_rate": 4.121538275941839e-05, + "loss": 0.7648, + "step": 3765 + }, + { + "epoch": 0.2771027822487928, + "grad_norm": 0.8125, + "learning_rate": 4.121097424657235e-05, + "loss": 0.6587, + "step": 3766 + }, + { + "epoch": 0.2771763623821568, + "grad_norm": 0.89453125, + "learning_rate": 4.120656486371395e-05, + "loss": 0.8048, + "step": 3767 + }, + { + "epoch": 0.2772499425155208, + "grad_norm": 0.7890625, + "learning_rate": 4.120215461107982e-05, + "loss": 0.8197, + "step": 3768 + }, + { + "epoch": 0.2773235226488848, + "grad_norm": 0.71484375, + "learning_rate": 4.1197743488906656e-05, + "loss": 0.5523, + "step": 3769 + }, + { + "epoch": 0.2773971027822488, + "grad_norm": 0.98828125, + "learning_rate": 4.11933314974312e-05, + "loss": 0.8875, + "step": 3770 + }, + { + "epoch": 0.2774706829156128, + "grad_norm": 1.109375, + "learning_rate": 4.1188918636890216e-05, + "loss": 0.9472, + "step": 3771 + }, + { + "epoch": 0.2775442630489768, + "grad_norm": 0.71484375, + "learning_rate": 4.118450490752055e-05, + "loss": 0.6643, + "step": 3772 + }, + { + "epoch": 0.2776178431823408, + "grad_norm": 1.03125, + "learning_rate": 4.1180090309559075e-05, + "loss": 1.3515, + "step": 3773 + }, + { + "epoch": 0.27769142331570473, + "grad_norm": 0.96484375, + "learning_rate": 4.117567484324271e-05, + "loss": 1.155, + "step": 3774 + }, + { + "epoch": 0.27776500344906874, + "grad_norm": 0.921875, + "learning_rate": 4.117125850880842e-05, + "loss": 0.8974, + "step": 3775 + }, + { + "epoch": 0.27783858358243274, + "grad_norm": 0.72265625, + "learning_rate": 4.116684130649324e-05, + "loss": 0.8932, + "step": 3776 + }, + { + "epoch": 0.27791216371579675, + "grad_norm": 0.80859375, + "learning_rate": 4.116242323653422e-05, + "loss": 1.1929, + "step": 3777 + }, + { + "epoch": 0.27798574384916075, + "grad_norm": 0.890625, + "learning_rate": 4.1158004299168465e-05, + "loss": 0.9176, + "step": 3778 + }, + { + "epoch": 0.2780593239825247, + "grad_norm": 0.73828125, + "learning_rate": 4.1153584494633144e-05, + "loss": 0.672, + "step": 3779 + }, + { + "epoch": 0.2781329041158887, + "grad_norm": 0.99609375, + "learning_rate": 4.114916382316546e-05, + "loss": 1.2671, + "step": 3780 + }, + { + "epoch": 0.2782064842492527, + "grad_norm": 0.90234375, + "learning_rate": 4.114474228500264e-05, + "loss": 1.0137, + "step": 3781 + }, + { + "epoch": 0.2782800643826167, + "grad_norm": 0.64453125, + "learning_rate": 4.1140319880382e-05, + "loss": 0.913, + "step": 3782 + }, + { + "epoch": 0.27835364451598066, + "grad_norm": 0.796875, + "learning_rate": 4.113589660954088e-05, + "loss": 0.8609, + "step": 3783 + }, + { + "epoch": 0.27842722464934466, + "grad_norm": 0.859375, + "learning_rate": 4.113147247271667e-05, + "loss": 0.8475, + "step": 3784 + }, + { + "epoch": 0.27850080478270867, + "grad_norm": 0.8125, + "learning_rate": 4.1127047470146786e-05, + "loss": 1.2032, + "step": 3785 + }, + { + "epoch": 0.27857438491607267, + "grad_norm": 1.0078125, + "learning_rate": 4.112262160206873e-05, + "loss": 1.2808, + "step": 3786 + }, + { + "epoch": 0.2786479650494367, + "grad_norm": 0.859375, + "learning_rate": 4.1118194868720025e-05, + "loss": 0.7413, + "step": 3787 + }, + { + "epoch": 0.2787215451828006, + "grad_norm": 0.8046875, + "learning_rate": 4.111376727033825e-05, + "loss": 0.7744, + "step": 3788 + }, + { + "epoch": 0.2787951253161646, + "grad_norm": 1.0234375, + "learning_rate": 4.110933880716101e-05, + "loss": 1.0279, + "step": 3789 + }, + { + "epoch": 0.27886870544952863, + "grad_norm": 0.6953125, + "learning_rate": 4.110490947942599e-05, + "loss": 0.8174, + "step": 3790 + }, + { + "epoch": 0.27894228558289264, + "grad_norm": 0.89453125, + "learning_rate": 4.1100479287370896e-05, + "loss": 0.9783, + "step": 3791 + }, + { + "epoch": 0.2790158657162566, + "grad_norm": 0.91015625, + "learning_rate": 4.109604823123349e-05, + "loss": 1.1988, + "step": 3792 + }, + { + "epoch": 0.2790894458496206, + "grad_norm": 0.9609375, + "learning_rate": 4.109161631125157e-05, + "loss": 1.1926, + "step": 3793 + }, + { + "epoch": 0.2791630259829846, + "grad_norm": 0.64453125, + "learning_rate": 4.1087183527663e-05, + "loss": 0.7411, + "step": 3794 + }, + { + "epoch": 0.2792366061163486, + "grad_norm": 0.734375, + "learning_rate": 4.108274988070567e-05, + "loss": 0.967, + "step": 3795 + }, + { + "epoch": 0.2793101862497126, + "grad_norm": 0.8203125, + "learning_rate": 4.107831537061753e-05, + "loss": 1.1322, + "step": 3796 + }, + { + "epoch": 0.27938376638307655, + "grad_norm": 1.078125, + "learning_rate": 4.107387999763658e-05, + "loss": 0.9204, + "step": 3797 + }, + { + "epoch": 0.27945734651644055, + "grad_norm": 0.76171875, + "learning_rate": 4.1069443762000835e-05, + "loss": 0.7403, + "step": 3798 + }, + { + "epoch": 0.27953092664980456, + "grad_norm": 0.77734375, + "learning_rate": 4.1065006663948406e-05, + "loss": 0.9051, + "step": 3799 + }, + { + "epoch": 0.27960450678316856, + "grad_norm": 1.1796875, + "learning_rate": 4.106056870371741e-05, + "loss": 1.509, + "step": 3800 + }, + { + "epoch": 0.2796780869165325, + "grad_norm": 0.73828125, + "learning_rate": 4.1056129881546024e-05, + "loss": 0.878, + "step": 3801 + }, + { + "epoch": 0.2797516670498965, + "grad_norm": 0.9609375, + "learning_rate": 4.105169019767248e-05, + "loss": 0.8086, + "step": 3802 + }, + { + "epoch": 0.2798252471832605, + "grad_norm": 1.1796875, + "learning_rate": 4.104724965233503e-05, + "loss": 0.8789, + "step": 3803 + }, + { + "epoch": 0.2798988273166245, + "grad_norm": 0.76953125, + "learning_rate": 4.1042808245772005e-05, + "loss": 1.0014, + "step": 3804 + }, + { + "epoch": 0.2799724074499885, + "grad_norm": 0.83984375, + "learning_rate": 4.103836597822176e-05, + "loss": 0.8212, + "step": 3805 + }, + { + "epoch": 0.2800459875833525, + "grad_norm": 1.109375, + "learning_rate": 4.1033922849922706e-05, + "loss": 1.792, + "step": 3806 + }, + { + "epoch": 0.2801195677167165, + "grad_norm": 0.86328125, + "learning_rate": 4.1029478861113295e-05, + "loss": 0.8233, + "step": 3807 + }, + { + "epoch": 0.2801931478500805, + "grad_norm": 0.66015625, + "learning_rate": 4.102503401203203e-05, + "loss": 0.8307, + "step": 3808 + }, + { + "epoch": 0.2802667279834445, + "grad_norm": 0.62890625, + "learning_rate": 4.102058830291746e-05, + "loss": 0.7511, + "step": 3809 + }, + { + "epoch": 0.28034030811680843, + "grad_norm": 0.74609375, + "learning_rate": 4.1016141734008165e-05, + "loss": 1.0038, + "step": 3810 + }, + { + "epoch": 0.28041388825017244, + "grad_norm": 0.83984375, + "learning_rate": 4.1011694305542805e-05, + "loss": 0.8683, + "step": 3811 + }, + { + "epoch": 0.28048746838353644, + "grad_norm": 1.0078125, + "learning_rate": 4.1007246017760047e-05, + "loss": 1.0173, + "step": 3812 + }, + { + "epoch": 0.28056104851690045, + "grad_norm": 1.1328125, + "learning_rate": 4.100279687089863e-05, + "loss": 1.0265, + "step": 3813 + }, + { + "epoch": 0.28063462865026445, + "grad_norm": 0.80078125, + "learning_rate": 4.099834686519733e-05, + "loss": 0.9441, + "step": 3814 + }, + { + "epoch": 0.2807082087836284, + "grad_norm": 0.703125, + "learning_rate": 4.099389600089497e-05, + "loss": 0.7323, + "step": 3815 + }, + { + "epoch": 0.2807817889169924, + "grad_norm": 0.90234375, + "learning_rate": 4.098944427823041e-05, + "loss": 1.2657, + "step": 3816 + }, + { + "epoch": 0.2808553690503564, + "grad_norm": 0.96484375, + "learning_rate": 4.0984991697442596e-05, + "loss": 0.7879, + "step": 3817 + }, + { + "epoch": 0.2809289491837204, + "grad_norm": 0.80859375, + "learning_rate": 4.098053825877046e-05, + "loss": 0.7809, + "step": 3818 + }, + { + "epoch": 0.28100252931708436, + "grad_norm": 0.765625, + "learning_rate": 4.097608396245301e-05, + "loss": 0.8438, + "step": 3819 + }, + { + "epoch": 0.28107610945044836, + "grad_norm": 0.7421875, + "learning_rate": 4.097162880872932e-05, + "loss": 0.6625, + "step": 3820 + }, + { + "epoch": 0.28114968958381237, + "grad_norm": 0.7890625, + "learning_rate": 4.096717279783847e-05, + "loss": 0.8132, + "step": 3821 + }, + { + "epoch": 0.28122326971717637, + "grad_norm": 1.0234375, + "learning_rate": 4.096271593001962e-05, + "loss": 1.2908, + "step": 3822 + }, + { + "epoch": 0.2812968498505404, + "grad_norm": 1.3359375, + "learning_rate": 4.095825820551195e-05, + "loss": 1.0392, + "step": 3823 + }, + { + "epoch": 0.2813704299839043, + "grad_norm": 0.85546875, + "learning_rate": 4.09537996245547e-05, + "loss": 0.8572, + "step": 3824 + }, + { + "epoch": 0.28144401011726833, + "grad_norm": 0.796875, + "learning_rate": 4.094934018738716e-05, + "loss": 0.856, + "step": 3825 + }, + { + "epoch": 0.28151759025063233, + "grad_norm": 0.71875, + "learning_rate": 4.094487989424866e-05, + "loss": 0.7391, + "step": 3826 + }, + { + "epoch": 0.28159117038399634, + "grad_norm": 1.0, + "learning_rate": 4.094041874537857e-05, + "loss": 1.0149, + "step": 3827 + }, + { + "epoch": 0.2816647505173603, + "grad_norm": 1.2421875, + "learning_rate": 4.0935956741016313e-05, + "loss": 1.3908, + "step": 3828 + }, + { + "epoch": 0.2817383306507243, + "grad_norm": 1.0546875, + "learning_rate": 4.0931493881401364e-05, + "loss": 0.9702, + "step": 3829 + }, + { + "epoch": 0.2818119107840883, + "grad_norm": 0.90234375, + "learning_rate": 4.0927030166773217e-05, + "loss": 1.2533, + "step": 3830 + }, + { + "epoch": 0.2818854909174523, + "grad_norm": 1.0, + "learning_rate": 4.092256559737144e-05, + "loss": 0.9352, + "step": 3831 + }, + { + "epoch": 0.2819590710508163, + "grad_norm": 1.0078125, + "learning_rate": 4.091810017343565e-05, + "loss": 0.9035, + "step": 3832 + }, + { + "epoch": 0.28203265118418025, + "grad_norm": 0.82421875, + "learning_rate": 4.0913633895205484e-05, + "loss": 0.8236, + "step": 3833 + }, + { + "epoch": 0.28210623131754425, + "grad_norm": 0.92578125, + "learning_rate": 4.090916676292065e-05, + "loss": 0.9557, + "step": 3834 + }, + { + "epoch": 0.28217981145090826, + "grad_norm": 0.80859375, + "learning_rate": 4.0904698776820885e-05, + "loss": 0.9953, + "step": 3835 + }, + { + "epoch": 0.28225339158427226, + "grad_norm": 1.2421875, + "learning_rate": 4.090022993714596e-05, + "loss": 1.0776, + "step": 3836 + }, + { + "epoch": 0.28232697171763627, + "grad_norm": 1.0703125, + "learning_rate": 4.089576024413574e-05, + "loss": 1.0095, + "step": 3837 + }, + { + "epoch": 0.2824005518510002, + "grad_norm": 0.85546875, + "learning_rate": 4.089128969803009e-05, + "loss": 0.8806, + "step": 3838 + }, + { + "epoch": 0.2824741319843642, + "grad_norm": 0.93359375, + "learning_rate": 4.088681829906893e-05, + "loss": 0.944, + "step": 3839 + }, + { + "epoch": 0.2825477121177282, + "grad_norm": 0.83203125, + "learning_rate": 4.0882346047492235e-05, + "loss": 0.8527, + "step": 3840 + }, + { + "epoch": 0.2826212922510922, + "grad_norm": 0.875, + "learning_rate": 4.087787294354004e-05, + "loss": 1.0146, + "step": 3841 + }, + { + "epoch": 0.2826948723844562, + "grad_norm": 0.96484375, + "learning_rate": 4.0873398987452384e-05, + "loss": 1.078, + "step": 3842 + }, + { + "epoch": 0.2827684525178202, + "grad_norm": 0.7578125, + "learning_rate": 4.086892417946938e-05, + "loss": 0.8147, + "step": 3843 + }, + { + "epoch": 0.2828420326511842, + "grad_norm": 1.0, + "learning_rate": 4.086444851983119e-05, + "loss": 1.2531, + "step": 3844 + }, + { + "epoch": 0.2829156127845482, + "grad_norm": 0.80078125, + "learning_rate": 4.085997200877803e-05, + "loss": 1.0398, + "step": 3845 + }, + { + "epoch": 0.2829891929179122, + "grad_norm": 0.90625, + "learning_rate": 4.0855494646550094e-05, + "loss": 0.8554, + "step": 3846 + }, + { + "epoch": 0.28306277305127614, + "grad_norm": 0.90625, + "learning_rate": 4.085101643338774e-05, + "loss": 0.8474, + "step": 3847 + }, + { + "epoch": 0.28313635318464014, + "grad_norm": 0.95703125, + "learning_rate": 4.084653736953125e-05, + "loss": 0.7307, + "step": 3848 + }, + { + "epoch": 0.28320993331800415, + "grad_norm": 1.1015625, + "learning_rate": 4.084205745522104e-05, + "loss": 1.3031, + "step": 3849 + }, + { + "epoch": 0.28328351345136815, + "grad_norm": 0.9140625, + "learning_rate": 4.0837576690697524e-05, + "loss": 1.1648, + "step": 3850 + }, + { + "epoch": 0.2833570935847321, + "grad_norm": 0.71484375, + "learning_rate": 4.083309507620118e-05, + "loss": 0.7618, + "step": 3851 + }, + { + "epoch": 0.2834306737180961, + "grad_norm": 0.75390625, + "learning_rate": 4.0828612611972526e-05, + "loss": 0.686, + "step": 3852 + }, + { + "epoch": 0.2835042538514601, + "grad_norm": 0.69140625, + "learning_rate": 4.0824129298252126e-05, + "loss": 0.6035, + "step": 3853 + }, + { + "epoch": 0.2835778339848241, + "grad_norm": 0.88671875, + "learning_rate": 4.08196451352806e-05, + "loss": 1.3134, + "step": 3854 + }, + { + "epoch": 0.2836514141181881, + "grad_norm": 0.6875, + "learning_rate": 4.08151601232986e-05, + "loss": 0.8152, + "step": 3855 + }, + { + "epoch": 0.28372499425155207, + "grad_norm": 0.8671875, + "learning_rate": 4.081067426254682e-05, + "loss": 1.1926, + "step": 3856 + }, + { + "epoch": 0.28379857438491607, + "grad_norm": 0.875, + "learning_rate": 4.0806187553266024e-05, + "loss": 1.1827, + "step": 3857 + }, + { + "epoch": 0.2838721545182801, + "grad_norm": 1.1796875, + "learning_rate": 4.080169999569699e-05, + "loss": 0.921, + "step": 3858 + }, + { + "epoch": 0.2839457346516441, + "grad_norm": 0.7421875, + "learning_rate": 4.079721159008056e-05, + "loss": 0.6629, + "step": 3859 + }, + { + "epoch": 0.284019314785008, + "grad_norm": 0.796875, + "learning_rate": 4.079272233665763e-05, + "loss": 0.7299, + "step": 3860 + }, + { + "epoch": 0.28409289491837203, + "grad_norm": 0.8984375, + "learning_rate": 4.078823223566911e-05, + "loss": 0.9084, + "step": 3861 + }, + { + "epoch": 0.28416647505173603, + "grad_norm": 1.0390625, + "learning_rate": 4.0783741287355994e-05, + "loss": 1.5359, + "step": 3862 + }, + { + "epoch": 0.28424005518510004, + "grad_norm": 0.7109375, + "learning_rate": 4.077924949195929e-05, + "loss": 0.8932, + "step": 3863 + }, + { + "epoch": 0.28431363531846404, + "grad_norm": 0.6796875, + "learning_rate": 4.077475684972006e-05, + "loss": 0.8656, + "step": 3864 + }, + { + "epoch": 0.284387215451828, + "grad_norm": 0.73828125, + "learning_rate": 4.0770263360879435e-05, + "loss": 0.7003, + "step": 3865 + }, + { + "epoch": 0.284460795585192, + "grad_norm": 0.828125, + "learning_rate": 4.076576902567856e-05, + "loss": 1.0429, + "step": 3866 + }, + { + "epoch": 0.284534375718556, + "grad_norm": 1.0859375, + "learning_rate": 4.076127384435864e-05, + "loss": 1.1955, + "step": 3867 + }, + { + "epoch": 0.28460795585192, + "grad_norm": 0.75, + "learning_rate": 4.075677781716092e-05, + "loss": 0.8434, + "step": 3868 + }, + { + "epoch": 0.28468153598528395, + "grad_norm": 1.1328125, + "learning_rate": 4.0752280944326694e-05, + "loss": 0.8366, + "step": 3869 + }, + { + "epoch": 0.28475511611864796, + "grad_norm": 0.8984375, + "learning_rate": 4.07477832260973e-05, + "loss": 0.9159, + "step": 3870 + }, + { + "epoch": 0.28482869625201196, + "grad_norm": 0.890625, + "learning_rate": 4.074328466271413e-05, + "loss": 1.3194, + "step": 3871 + }, + { + "epoch": 0.28490227638537596, + "grad_norm": 0.8828125, + "learning_rate": 4.07387852544186e-05, + "loss": 1.0448, + "step": 3872 + }, + { + "epoch": 0.28497585651873997, + "grad_norm": 1.0625, + "learning_rate": 4.0734285001452194e-05, + "loss": 1.4761, + "step": 3873 + }, + { + "epoch": 0.2850494366521039, + "grad_norm": 1.0390625, + "learning_rate": 4.072978390405643e-05, + "loss": 0.9652, + "step": 3874 + }, + { + "epoch": 0.2851230167854679, + "grad_norm": 1.203125, + "learning_rate": 4.0725281962472875e-05, + "loss": 1.1459, + "step": 3875 + }, + { + "epoch": 0.2851965969188319, + "grad_norm": 0.90234375, + "learning_rate": 4.072077917694314e-05, + "loss": 0.8921, + "step": 3876 + }, + { + "epoch": 0.28527017705219593, + "grad_norm": 0.70703125, + "learning_rate": 4.071627554770887e-05, + "loss": 0.8015, + "step": 3877 + }, + { + "epoch": 0.2853437571855599, + "grad_norm": 0.69921875, + "learning_rate": 4.071177107501178e-05, + "loss": 0.803, + "step": 3878 + }, + { + "epoch": 0.2854173373189239, + "grad_norm": 0.8125, + "learning_rate": 4.070726575909361e-05, + "loss": 1.129, + "step": 3879 + }, + { + "epoch": 0.2854909174522879, + "grad_norm": 0.64453125, + "learning_rate": 4.0702759600196156e-05, + "loss": 0.7708, + "step": 3880 + }, + { + "epoch": 0.2855644975856519, + "grad_norm": 0.80078125, + "learning_rate": 4.069825259856125e-05, + "loss": 0.656, + "step": 3881 + }, + { + "epoch": 0.2856380777190159, + "grad_norm": 0.79296875, + "learning_rate": 4.0693744754430786e-05, + "loss": 0.7339, + "step": 3882 + }, + { + "epoch": 0.28571165785237984, + "grad_norm": 0.95703125, + "learning_rate": 4.0689236068046665e-05, + "loss": 1.2382, + "step": 3883 + }, + { + "epoch": 0.28578523798574385, + "grad_norm": 0.98046875, + "learning_rate": 4.068472653965089e-05, + "loss": 0.8877, + "step": 3884 + }, + { + "epoch": 0.28585881811910785, + "grad_norm": 0.7890625, + "learning_rate": 4.0680216169485466e-05, + "loss": 1.1071, + "step": 3885 + }, + { + "epoch": 0.28593239825247185, + "grad_norm": 1.0234375, + "learning_rate": 4.067570495779245e-05, + "loss": 1.5672, + "step": 3886 + }, + { + "epoch": 0.2860059783858358, + "grad_norm": 0.73046875, + "learning_rate": 4.067119290481396e-05, + "loss": 0.9924, + "step": 3887 + }, + { + "epoch": 0.2860795585191998, + "grad_norm": 0.8125, + "learning_rate": 4.066668001079214e-05, + "loss": 1.0488, + "step": 3888 + }, + { + "epoch": 0.2861531386525638, + "grad_norm": 0.76953125, + "learning_rate": 4.066216627596921e-05, + "loss": 0.7105, + "step": 3889 + }, + { + "epoch": 0.2862267187859278, + "grad_norm": 1.1484375, + "learning_rate": 4.0657651700587376e-05, + "loss": 1.1021, + "step": 3890 + }, + { + "epoch": 0.2863002989192918, + "grad_norm": 0.75390625, + "learning_rate": 4.065313628488896e-05, + "loss": 0.8837, + "step": 3891 + }, + { + "epoch": 0.28637387905265577, + "grad_norm": 0.89453125, + "learning_rate": 4.064862002911628e-05, + "loss": 1.0584, + "step": 3892 + }, + { + "epoch": 0.28644745918601977, + "grad_norm": 0.80859375, + "learning_rate": 4.064410293351172e-05, + "loss": 0.8034, + "step": 3893 + }, + { + "epoch": 0.2865210393193838, + "grad_norm": 1.0078125, + "learning_rate": 4.0639584998317706e-05, + "loss": 1.5604, + "step": 3894 + }, + { + "epoch": 0.2865946194527478, + "grad_norm": 0.7421875, + "learning_rate": 4.0635066223776706e-05, + "loss": 0.8334, + "step": 3895 + }, + { + "epoch": 0.2866681995861117, + "grad_norm": 0.98828125, + "learning_rate": 4.0630546610131224e-05, + "loss": 1.2412, + "step": 3896 + }, + { + "epoch": 0.28674177971947573, + "grad_norm": 0.8046875, + "learning_rate": 4.062602615762384e-05, + "loss": 0.7904, + "step": 3897 + }, + { + "epoch": 0.28681535985283974, + "grad_norm": 0.87890625, + "learning_rate": 4.0621504866497136e-05, + "loss": 0.9662, + "step": 3898 + }, + { + "epoch": 0.28688893998620374, + "grad_norm": 0.94921875, + "learning_rate": 4.061698273699377e-05, + "loss": 1.0871, + "step": 3899 + }, + { + "epoch": 0.28696252011956774, + "grad_norm": 1.0625, + "learning_rate": 4.0612459769356434e-05, + "loss": 1.1765, + "step": 3900 + }, + { + "epoch": 0.2870361002529317, + "grad_norm": 0.93359375, + "learning_rate": 4.060793596382788e-05, + "loss": 0.6849, + "step": 3901 + }, + { + "epoch": 0.2871096803862957, + "grad_norm": 0.84765625, + "learning_rate": 4.060341132065088e-05, + "loss": 1.3514, + "step": 3902 + }, + { + "epoch": 0.2871832605196597, + "grad_norm": 1.03125, + "learning_rate": 4.0598885840068264e-05, + "loss": 1.0643, + "step": 3903 + }, + { + "epoch": 0.2872568406530237, + "grad_norm": 0.80078125, + "learning_rate": 4.05943595223229e-05, + "loss": 0.9268, + "step": 3904 + }, + { + "epoch": 0.28733042078638765, + "grad_norm": 0.796875, + "learning_rate": 4.0589832367657724e-05, + "loss": 1.1582, + "step": 3905 + }, + { + "epoch": 0.28740400091975166, + "grad_norm": 0.77734375, + "learning_rate": 4.058530437631568e-05, + "loss": 0.8383, + "step": 3906 + }, + { + "epoch": 0.28747758105311566, + "grad_norm": 1.0, + "learning_rate": 4.05807755485398e-05, + "loss": 1.1832, + "step": 3907 + }, + { + "epoch": 0.28755116118647966, + "grad_norm": 0.99609375, + "learning_rate": 4.0576245884573105e-05, + "loss": 1.1852, + "step": 3908 + }, + { + "epoch": 0.28762474131984367, + "grad_norm": 1.09375, + "learning_rate": 4.057171538465873e-05, + "loss": 0.9573, + "step": 3909 + }, + { + "epoch": 0.2876983214532076, + "grad_norm": 0.83203125, + "learning_rate": 4.0567184049039796e-05, + "loss": 0.8536, + "step": 3910 + }, + { + "epoch": 0.2877719015865716, + "grad_norm": 1.09375, + "learning_rate": 4.056265187795949e-05, + "loss": 1.3186, + "step": 3911 + }, + { + "epoch": 0.2878454817199356, + "grad_norm": 0.84765625, + "learning_rate": 4.055811887166106e-05, + "loss": 1.015, + "step": 3912 + }, + { + "epoch": 0.28791906185329963, + "grad_norm": 0.67578125, + "learning_rate": 4.055358503038777e-05, + "loss": 0.789, + "step": 3913 + }, + { + "epoch": 0.2879926419866636, + "grad_norm": 0.8203125, + "learning_rate": 4.054905035438295e-05, + "loss": 0.8662, + "step": 3914 + }, + { + "epoch": 0.2880662221200276, + "grad_norm": 0.75, + "learning_rate": 4.054451484388996e-05, + "loss": 0.5611, + "step": 3915 + }, + { + "epoch": 0.2881398022533916, + "grad_norm": 0.984375, + "learning_rate": 4.0539978499152235e-05, + "loss": 1.1378, + "step": 3916 + }, + { + "epoch": 0.2882133823867556, + "grad_norm": 0.734375, + "learning_rate": 4.0535441320413194e-05, + "loss": 0.9651, + "step": 3917 + }, + { + "epoch": 0.2882869625201196, + "grad_norm": 0.80078125, + "learning_rate": 4.053090330791637e-05, + "loss": 1.051, + "step": 3918 + }, + { + "epoch": 0.28836054265348354, + "grad_norm": 0.94921875, + "learning_rate": 4.0526364461905295e-05, + "loss": 1.1585, + "step": 3919 + }, + { + "epoch": 0.28843412278684755, + "grad_norm": 0.86328125, + "learning_rate": 4.052182478262357e-05, + "loss": 1.3894, + "step": 3920 + }, + { + "epoch": 0.28850770292021155, + "grad_norm": 0.703125, + "learning_rate": 4.0517284270314826e-05, + "loss": 0.6976, + "step": 3921 + }, + { + "epoch": 0.28858128305357555, + "grad_norm": 0.96484375, + "learning_rate": 4.051274292522273e-05, + "loss": 0.9478, + "step": 3922 + }, + { + "epoch": 0.2886548631869395, + "grad_norm": 0.78125, + "learning_rate": 4.050820074759104e-05, + "loss": 1.0256, + "step": 3923 + }, + { + "epoch": 0.2887284433203035, + "grad_norm": 0.91796875, + "learning_rate": 4.050365773766349e-05, + "loss": 0.939, + "step": 3924 + }, + { + "epoch": 0.2888020234536675, + "grad_norm": 0.7890625, + "learning_rate": 4.0499113895683927e-05, + "loss": 0.9522, + "step": 3925 + }, + { + "epoch": 0.2888756035870315, + "grad_norm": 0.85546875, + "learning_rate": 4.049456922189618e-05, + "loss": 0.9204, + "step": 3926 + }, + { + "epoch": 0.2889491837203955, + "grad_norm": 0.9609375, + "learning_rate": 4.049002371654418e-05, + "loss": 0.7246, + "step": 3927 + }, + { + "epoch": 0.28902276385375947, + "grad_norm": 1.0546875, + "learning_rate": 4.048547737987185e-05, + "loss": 0.8078, + "step": 3928 + }, + { + "epoch": 0.28909634398712347, + "grad_norm": 1.0859375, + "learning_rate": 4.04809302121232e-05, + "loss": 0.992, + "step": 3929 + }, + { + "epoch": 0.2891699241204875, + "grad_norm": 0.84375, + "learning_rate": 4.047638221354228e-05, + "loss": 0.9145, + "step": 3930 + }, + { + "epoch": 0.2892435042538515, + "grad_norm": 0.91796875, + "learning_rate": 4.047183338437314e-05, + "loss": 0.995, + "step": 3931 + }, + { + "epoch": 0.28931708438721543, + "grad_norm": 1.0859375, + "learning_rate": 4.046728372485994e-05, + "loss": 1.2221, + "step": 3932 + }, + { + "epoch": 0.28939066452057943, + "grad_norm": 0.87890625, + "learning_rate": 4.046273323524682e-05, + "loss": 0.8771, + "step": 3933 + }, + { + "epoch": 0.28946424465394344, + "grad_norm": 0.78125, + "learning_rate": 4.0458181915778026e-05, + "loss": 1.0416, + "step": 3934 + }, + { + "epoch": 0.28953782478730744, + "grad_norm": 0.84375, + "learning_rate": 4.0453629766697796e-05, + "loss": 0.9703, + "step": 3935 + }, + { + "epoch": 0.28961140492067144, + "grad_norm": 0.78125, + "learning_rate": 4.0449076788250446e-05, + "loss": 0.7855, + "step": 3936 + }, + { + "epoch": 0.2896849850540354, + "grad_norm": 1.125, + "learning_rate": 4.044452298068033e-05, + "loss": 1.1892, + "step": 3937 + }, + { + "epoch": 0.2897585651873994, + "grad_norm": 0.73828125, + "learning_rate": 4.043996834423183e-05, + "loss": 1.0772, + "step": 3938 + }, + { + "epoch": 0.2898321453207634, + "grad_norm": 0.6796875, + "learning_rate": 4.043541287914939e-05, + "loss": 0.6697, + "step": 3939 + }, + { + "epoch": 0.2899057254541274, + "grad_norm": 0.83984375, + "learning_rate": 4.043085658567749e-05, + "loss": 0.8478, + "step": 3940 + }, + { + "epoch": 0.28997930558749135, + "grad_norm": 1.0390625, + "learning_rate": 4.042629946406067e-05, + "loss": 1.2608, + "step": 3941 + }, + { + "epoch": 0.29005288572085536, + "grad_norm": 0.80078125, + "learning_rate": 4.042174151454349e-05, + "loss": 0.935, + "step": 3942 + }, + { + "epoch": 0.29012646585421936, + "grad_norm": 0.7890625, + "learning_rate": 4.0417182737370574e-05, + "loss": 0.989, + "step": 3943 + }, + { + "epoch": 0.29020004598758337, + "grad_norm": 1.15625, + "learning_rate": 4.041262313278657e-05, + "loss": 1.1673, + "step": 3944 + }, + { + "epoch": 0.29027362612094737, + "grad_norm": 1.09375, + "learning_rate": 4.040806270103621e-05, + "loss": 1.3329, + "step": 3945 + }, + { + "epoch": 0.2903472062543113, + "grad_norm": 0.95703125, + "learning_rate": 4.0403501442364213e-05, + "loss": 0.9676, + "step": 3946 + }, + { + "epoch": 0.2904207863876753, + "grad_norm": 0.87890625, + "learning_rate": 4.039893935701539e-05, + "loss": 0.8828, + "step": 3947 + }, + { + "epoch": 0.2904943665210393, + "grad_norm": 0.9609375, + "learning_rate": 4.039437644523458e-05, + "loss": 0.9777, + "step": 3948 + }, + { + "epoch": 0.29056794665440333, + "grad_norm": 0.859375, + "learning_rate": 4.038981270726666e-05, + "loss": 0.8939, + "step": 3949 + }, + { + "epoch": 0.2906415267877673, + "grad_norm": 0.79296875, + "learning_rate": 4.038524814335656e-05, + "loss": 0.8317, + "step": 3950 + }, + { + "epoch": 0.2907151069211313, + "grad_norm": 0.875, + "learning_rate": 4.0380682753749245e-05, + "loss": 1.1056, + "step": 3951 + }, + { + "epoch": 0.2907886870544953, + "grad_norm": 0.7109375, + "learning_rate": 4.037611653868974e-05, + "loss": 0.8329, + "step": 3952 + }, + { + "epoch": 0.2908622671878593, + "grad_norm": 0.84765625, + "learning_rate": 4.03715494984231e-05, + "loss": 1.1901, + "step": 3953 + }, + { + "epoch": 0.2909358473212233, + "grad_norm": 0.78125, + "learning_rate": 4.0366981633194434e-05, + "loss": 0.8132, + "step": 3954 + }, + { + "epoch": 0.29100942745458724, + "grad_norm": 1.046875, + "learning_rate": 4.036241294324889e-05, + "loss": 1.1858, + "step": 3955 + }, + { + "epoch": 0.29108300758795125, + "grad_norm": 0.73828125, + "learning_rate": 4.035784342883165e-05, + "loss": 0.6956, + "step": 3956 + }, + { + "epoch": 0.29115658772131525, + "grad_norm": 0.73828125, + "learning_rate": 4.0353273090187974e-05, + "loss": 1.0991, + "step": 3957 + }, + { + "epoch": 0.29123016785467926, + "grad_norm": 0.890625, + "learning_rate": 4.034870192756311e-05, + "loss": 1.155, + "step": 3958 + }, + { + "epoch": 0.2913037479880432, + "grad_norm": 1.2421875, + "learning_rate": 4.034412994120242e-05, + "loss": 1.4814, + "step": 3959 + }, + { + "epoch": 0.2913773281214072, + "grad_norm": 0.77734375, + "learning_rate": 4.033955713135126e-05, + "loss": 0.8542, + "step": 3960 + }, + { + "epoch": 0.2914509082547712, + "grad_norm": 0.75, + "learning_rate": 4.033498349825502e-05, + "loss": 0.8671, + "step": 3961 + }, + { + "epoch": 0.2915244883881352, + "grad_norm": 0.8125, + "learning_rate": 4.03304090421592e-05, + "loss": 0.8192, + "step": 3962 + }, + { + "epoch": 0.2915980685214992, + "grad_norm": 0.7421875, + "learning_rate": 4.032583376330927e-05, + "loss": 0.7636, + "step": 3963 + }, + { + "epoch": 0.29167164865486317, + "grad_norm": 0.796875, + "learning_rate": 4.032125766195079e-05, + "loss": 0.7492, + "step": 3964 + }, + { + "epoch": 0.2917452287882272, + "grad_norm": 0.9921875, + "learning_rate": 4.031668073832935e-05, + "loss": 0.8939, + "step": 3965 + }, + { + "epoch": 0.2918188089215912, + "grad_norm": 0.8671875, + "learning_rate": 4.031210299269059e-05, + "loss": 1.0382, + "step": 3966 + }, + { + "epoch": 0.2918923890549552, + "grad_norm": 0.75390625, + "learning_rate": 4.030752442528017e-05, + "loss": 0.8662, + "step": 3967 + }, + { + "epoch": 0.29196596918831913, + "grad_norm": 0.8125, + "learning_rate": 4.030294503634384e-05, + "loss": 1.2106, + "step": 3968 + }, + { + "epoch": 0.29203954932168313, + "grad_norm": 0.765625, + "learning_rate": 4.029836482612734e-05, + "loss": 0.7495, + "step": 3969 + }, + { + "epoch": 0.29211312945504714, + "grad_norm": 0.71875, + "learning_rate": 4.0293783794876504e-05, + "loss": 0.4663, + "step": 3970 + }, + { + "epoch": 0.29218670958841114, + "grad_norm": 0.72265625, + "learning_rate": 4.0289201942837174e-05, + "loss": 0.9168, + "step": 3971 + }, + { + "epoch": 0.29226028972177515, + "grad_norm": 0.83203125, + "learning_rate": 4.028461927025525e-05, + "loss": 0.6534, + "step": 3972 + }, + { + "epoch": 0.2923338698551391, + "grad_norm": 1.046875, + "learning_rate": 4.028003577737669e-05, + "loss": 0.7976, + "step": 3973 + }, + { + "epoch": 0.2924074499885031, + "grad_norm": 1.0546875, + "learning_rate": 4.0275451464447454e-05, + "loss": 0.9889, + "step": 3974 + }, + { + "epoch": 0.2924810301218671, + "grad_norm": 1.015625, + "learning_rate": 4.02708663317136e-05, + "loss": 1.248, + "step": 3975 + }, + { + "epoch": 0.2925546102552311, + "grad_norm": 1.0625, + "learning_rate": 4.0266280379421195e-05, + "loss": 0.9817, + "step": 3976 + }, + { + "epoch": 0.29262819038859506, + "grad_norm": 0.7734375, + "learning_rate": 4.0261693607816344e-05, + "loss": 0.7109, + "step": 3977 + }, + { + "epoch": 0.29270177052195906, + "grad_norm": 1.046875, + "learning_rate": 4.025710601714523e-05, + "loss": 1.7039, + "step": 3978 + }, + { + "epoch": 0.29277535065532306, + "grad_norm": 0.796875, + "learning_rate": 4.025251760765405e-05, + "loss": 0.9992, + "step": 3979 + }, + { + "epoch": 0.29284893078868707, + "grad_norm": 0.75390625, + "learning_rate": 4.024792837958906e-05, + "loss": 0.6932, + "step": 3980 + }, + { + "epoch": 0.29292251092205107, + "grad_norm": 0.87890625, + "learning_rate": 4.0243338333196556e-05, + "loss": 0.812, + "step": 3981 + }, + { + "epoch": 0.292996091055415, + "grad_norm": 0.6953125, + "learning_rate": 4.023874746872287e-05, + "loss": 0.5889, + "step": 3982 + }, + { + "epoch": 0.293069671188779, + "grad_norm": 1.8671875, + "learning_rate": 4.023415578641438e-05, + "loss": 1.2658, + "step": 3983 + }, + { + "epoch": 0.29314325132214303, + "grad_norm": 0.85546875, + "learning_rate": 4.022956328651754e-05, + "loss": 0.8574, + "step": 3984 + }, + { + "epoch": 0.29321683145550703, + "grad_norm": 0.78515625, + "learning_rate": 4.022496996927879e-05, + "loss": 0.7555, + "step": 3985 + }, + { + "epoch": 0.293290411588871, + "grad_norm": 0.8984375, + "learning_rate": 4.022037583494466e-05, + "loss": 0.921, + "step": 3986 + }, + { + "epoch": 0.293363991722235, + "grad_norm": 0.76171875, + "learning_rate": 4.021578088376171e-05, + "loss": 0.7318, + "step": 3987 + }, + { + "epoch": 0.293437571855599, + "grad_norm": 0.875, + "learning_rate": 4.021118511597654e-05, + "loss": 0.6764, + "step": 3988 + }, + { + "epoch": 0.293511151988963, + "grad_norm": 0.8984375, + "learning_rate": 4.0206588531835795e-05, + "loss": 1.1966, + "step": 3989 + }, + { + "epoch": 0.293584732122327, + "grad_norm": 0.70703125, + "learning_rate": 4.0201991131586156e-05, + "loss": 0.663, + "step": 3990 + }, + { + "epoch": 0.29365831225569095, + "grad_norm": 0.7734375, + "learning_rate": 4.0197392915474375e-05, + "loss": 0.6867, + "step": 3991 + }, + { + "epoch": 0.29373189238905495, + "grad_norm": 0.796875, + "learning_rate": 4.019279388374722e-05, + "loss": 0.9117, + "step": 3992 + }, + { + "epoch": 0.29380547252241895, + "grad_norm": 0.9609375, + "learning_rate": 4.01881940366515e-05, + "loss": 1.0109, + "step": 3993 + }, + { + "epoch": 0.29387905265578296, + "grad_norm": 0.94140625, + "learning_rate": 4.0183593374434106e-05, + "loss": 0.8342, + "step": 3994 + }, + { + "epoch": 0.2939526327891469, + "grad_norm": 0.8515625, + "learning_rate": 4.0178991897341925e-05, + "loss": 1.0372, + "step": 3995 + }, + { + "epoch": 0.2940262129225109, + "grad_norm": 0.6953125, + "learning_rate": 4.017438960562192e-05, + "loss": 0.7008, + "step": 3996 + }, + { + "epoch": 0.2940997930558749, + "grad_norm": 0.71484375, + "learning_rate": 4.0169786499521083e-05, + "loss": 0.7199, + "step": 3997 + }, + { + "epoch": 0.2941733731892389, + "grad_norm": 0.953125, + "learning_rate": 4.0165182579286467e-05, + "loss": 1.1108, + "step": 3998 + }, + { + "epoch": 0.2942469533226029, + "grad_norm": 0.6640625, + "learning_rate": 4.016057784516513e-05, + "loss": 0.6917, + "step": 3999 + }, + { + "epoch": 0.29432053345596687, + "grad_norm": 0.80859375, + "learning_rate": 4.015597229740422e-05, + "loss": 0.8002, + "step": 4000 + }, + { + "epoch": 0.2943941135893309, + "grad_norm": 0.9609375, + "learning_rate": 4.015136593625091e-05, + "loss": 1.214, + "step": 4001 + }, + { + "epoch": 0.2944676937226949, + "grad_norm": 0.74609375, + "learning_rate": 4.0146758761952396e-05, + "loss": 0.6247, + "step": 4002 + }, + { + "epoch": 0.2945412738560589, + "grad_norm": 1.1484375, + "learning_rate": 4.014215077475596e-05, + "loss": 1.5746, + "step": 4003 + }, + { + "epoch": 0.29461485398942283, + "grad_norm": 1.171875, + "learning_rate": 4.013754197490888e-05, + "loss": 1.0741, + "step": 4004 + }, + { + "epoch": 0.29468843412278684, + "grad_norm": 0.89453125, + "learning_rate": 4.0132932362658516e-05, + "loss": 1.278, + "step": 4005 + }, + { + "epoch": 0.29476201425615084, + "grad_norm": 0.9375, + "learning_rate": 4.0128321938252254e-05, + "loss": 1.0738, + "step": 4006 + }, + { + "epoch": 0.29483559438951484, + "grad_norm": 1.828125, + "learning_rate": 4.012371070193753e-05, + "loss": 0.8013, + "step": 4007 + }, + { + "epoch": 0.29490917452287885, + "grad_norm": 0.82421875, + "learning_rate": 4.011909865396181e-05, + "loss": 1.0525, + "step": 4008 + }, + { + "epoch": 0.2949827546562428, + "grad_norm": 0.7890625, + "learning_rate": 4.011448579457263e-05, + "loss": 0.8665, + "step": 4009 + }, + { + "epoch": 0.2950563347896068, + "grad_norm": 1.1328125, + "learning_rate": 4.010987212401754e-05, + "loss": 1.0984, + "step": 4010 + }, + { + "epoch": 0.2951299149229708, + "grad_norm": 0.953125, + "learning_rate": 4.010525764254415e-05, + "loss": 0.7722, + "step": 4011 + }, + { + "epoch": 0.2952034950563348, + "grad_norm": 0.82421875, + "learning_rate": 4.010064235040012e-05, + "loss": 1.1803, + "step": 4012 + }, + { + "epoch": 0.29527707518969876, + "grad_norm": 1.125, + "learning_rate": 4.009602624783312e-05, + "loss": 1.4804, + "step": 4013 + }, + { + "epoch": 0.29535065532306276, + "grad_norm": 0.953125, + "learning_rate": 4.009140933509092e-05, + "loss": 1.0147, + "step": 4014 + }, + { + "epoch": 0.29542423545642676, + "grad_norm": 0.80859375, + "learning_rate": 4.008679161242128e-05, + "loss": 0.826, + "step": 4015 + }, + { + "epoch": 0.29549781558979077, + "grad_norm": 0.67578125, + "learning_rate": 4.008217308007203e-05, + "loss": 0.8106, + "step": 4016 + }, + { + "epoch": 0.2955713957231548, + "grad_norm": 0.796875, + "learning_rate": 4.007755373829103e-05, + "loss": 1.3554, + "step": 4017 + }, + { + "epoch": 0.2956449758565187, + "grad_norm": 0.9375, + "learning_rate": 4.00729335873262e-05, + "loss": 1.0787, + "step": 4018 + }, + { + "epoch": 0.2957185559898827, + "grad_norm": 0.94140625, + "learning_rate": 4.00683126274255e-05, + "loss": 1.0574, + "step": 4019 + }, + { + "epoch": 0.29579213612324673, + "grad_norm": 1.1953125, + "learning_rate": 4.0063690858836914e-05, + "loss": 0.8312, + "step": 4020 + }, + { + "epoch": 0.29586571625661073, + "grad_norm": 0.859375, + "learning_rate": 4.0059068281808495e-05, + "loss": 0.9378, + "step": 4021 + }, + { + "epoch": 0.2959392963899747, + "grad_norm": 0.82421875, + "learning_rate": 4.0054444896588326e-05, + "loss": 0.832, + "step": 4022 + }, + { + "epoch": 0.2960128765233387, + "grad_norm": 0.953125, + "learning_rate": 4.0049820703424534e-05, + "loss": 1.4243, + "step": 4023 + }, + { + "epoch": 0.2960864566567027, + "grad_norm": 1.125, + "learning_rate": 4.0045195702565285e-05, + "loss": 0.8421, + "step": 4024 + }, + { + "epoch": 0.2961600367900667, + "grad_norm": 0.95703125, + "learning_rate": 4.0040569894258794e-05, + "loss": 1.042, + "step": 4025 + }, + { + "epoch": 0.2962336169234307, + "grad_norm": 0.83984375, + "learning_rate": 4.003594327875334e-05, + "loss": 0.8849, + "step": 4026 + }, + { + "epoch": 0.29630719705679465, + "grad_norm": 0.92578125, + "learning_rate": 4.003131585629719e-05, + "loss": 1.2332, + "step": 4027 + }, + { + "epoch": 0.29638077719015865, + "grad_norm": 0.7421875, + "learning_rate": 4.002668762713873e-05, + "loss": 0.7748, + "step": 4028 + }, + { + "epoch": 0.29645435732352265, + "grad_norm": 1.0, + "learning_rate": 4.002205859152631e-05, + "loss": 1.0201, + "step": 4029 + }, + { + "epoch": 0.29652793745688666, + "grad_norm": 0.7421875, + "learning_rate": 4.0017428749708385e-05, + "loss": 0.6225, + "step": 4030 + }, + { + "epoch": 0.2966015175902506, + "grad_norm": 0.98828125, + "learning_rate": 4.0012798101933414e-05, + "loss": 1.0481, + "step": 4031 + }, + { + "epoch": 0.2966750977236146, + "grad_norm": 0.875, + "learning_rate": 4.000816664844993e-05, + "loss": 0.7896, + "step": 4032 + }, + { + "epoch": 0.2967486778569786, + "grad_norm": 0.8046875, + "learning_rate": 4.000353438950649e-05, + "loss": 0.963, + "step": 4033 + }, + { + "epoch": 0.2968222579903426, + "grad_norm": 0.96484375, + "learning_rate": 3.9998901325351694e-05, + "loss": 1.1171, + "step": 4034 + }, + { + "epoch": 0.2968958381237066, + "grad_norm": 0.8828125, + "learning_rate": 3.99942674562342e-05, + "loss": 0.9836, + "step": 4035 + }, + { + "epoch": 0.29696941825707057, + "grad_norm": 0.89453125, + "learning_rate": 3.998963278240268e-05, + "loss": 1.0004, + "step": 4036 + }, + { + "epoch": 0.2970429983904346, + "grad_norm": 1.0390625, + "learning_rate": 3.9984997304105885e-05, + "loss": 1.484, + "step": 4037 + }, + { + "epoch": 0.2971165785237986, + "grad_norm": 1.1171875, + "learning_rate": 3.998036102159259e-05, + "loss": 0.897, + "step": 4038 + }, + { + "epoch": 0.2971901586571626, + "grad_norm": 0.8828125, + "learning_rate": 3.9975723935111614e-05, + "loss": 0.9784, + "step": 4039 + }, + { + "epoch": 0.29726373879052653, + "grad_norm": 0.83984375, + "learning_rate": 3.997108604491182e-05, + "loss": 1.2529, + "step": 4040 + }, + { + "epoch": 0.29733731892389054, + "grad_norm": 0.890625, + "learning_rate": 3.9966447351242106e-05, + "loss": 0.8156, + "step": 4041 + }, + { + "epoch": 0.29741089905725454, + "grad_norm": 0.98828125, + "learning_rate": 3.996180785435144e-05, + "loss": 1.016, + "step": 4042 + }, + { + "epoch": 0.29748447919061854, + "grad_norm": 0.8984375, + "learning_rate": 3.9957167554488795e-05, + "loss": 1.1219, + "step": 4043 + }, + { + "epoch": 0.29755805932398255, + "grad_norm": 1.0625, + "learning_rate": 3.995252645190323e-05, + "loss": 1.1444, + "step": 4044 + }, + { + "epoch": 0.2976316394573465, + "grad_norm": 0.90625, + "learning_rate": 3.99478845468438e-05, + "loss": 0.906, + "step": 4045 + }, + { + "epoch": 0.2977052195907105, + "grad_norm": 0.8046875, + "learning_rate": 3.994324183955964e-05, + "loss": 0.8373, + "step": 4046 + }, + { + "epoch": 0.2977787997240745, + "grad_norm": 1.0625, + "learning_rate": 3.993859833029993e-05, + "loss": 1.2315, + "step": 4047 + }, + { + "epoch": 0.2978523798574385, + "grad_norm": 0.92578125, + "learning_rate": 3.9933954019313844e-05, + "loss": 0.8823, + "step": 4048 + }, + { + "epoch": 0.29792595999080246, + "grad_norm": 0.69140625, + "learning_rate": 3.992930890685066e-05, + "loss": 0.5843, + "step": 4049 + }, + { + "epoch": 0.29799954012416646, + "grad_norm": 0.828125, + "learning_rate": 3.992466299315965e-05, + "loss": 0.8507, + "step": 4050 + }, + { + "epoch": 0.29807312025753047, + "grad_norm": 0.80859375, + "learning_rate": 3.992001627849019e-05, + "loss": 0.9282, + "step": 4051 + }, + { + "epoch": 0.29814670039089447, + "grad_norm": 0.9765625, + "learning_rate": 3.991536876309162e-05, + "loss": 0.8898, + "step": 4052 + }, + { + "epoch": 0.2982202805242585, + "grad_norm": 0.91796875, + "learning_rate": 3.991072044721339e-05, + "loss": 0.8895, + "step": 4053 + }, + { + "epoch": 0.2982938606576224, + "grad_norm": 0.79296875, + "learning_rate": 3.990607133110495e-05, + "loss": 0.909, + "step": 4054 + }, + { + "epoch": 0.2983674407909864, + "grad_norm": 0.953125, + "learning_rate": 3.9901421415015815e-05, + "loss": 1.1622, + "step": 4055 + }, + { + "epoch": 0.29844102092435043, + "grad_norm": 0.85546875, + "learning_rate": 3.989677069919554e-05, + "loss": 0.8779, + "step": 4056 + }, + { + "epoch": 0.29851460105771443, + "grad_norm": 0.921875, + "learning_rate": 3.9892119183893715e-05, + "loss": 0.9261, + "step": 4057 + }, + { + "epoch": 0.2985881811910784, + "grad_norm": 0.87109375, + "learning_rate": 3.988746686935998e-05, + "loss": 0.8476, + "step": 4058 + }, + { + "epoch": 0.2986617613244424, + "grad_norm": 0.91015625, + "learning_rate": 3.9882813755844015e-05, + "loss": 1.0539, + "step": 4059 + }, + { + "epoch": 0.2987353414578064, + "grad_norm": 0.984375, + "learning_rate": 3.9878159843595554e-05, + "loss": 1.2995, + "step": 4060 + }, + { + "epoch": 0.2988089215911704, + "grad_norm": 0.9375, + "learning_rate": 3.987350513286435e-05, + "loss": 0.9445, + "step": 4061 + }, + { + "epoch": 0.2988825017245344, + "grad_norm": 0.83984375, + "learning_rate": 3.986884962390022e-05, + "loss": 1.2249, + "step": 4062 + }, + { + "epoch": 0.29895608185789835, + "grad_norm": 1.0546875, + "learning_rate": 3.986419331695301e-05, + "loss": 0.9489, + "step": 4063 + }, + { + "epoch": 0.29902966199126235, + "grad_norm": 1.3515625, + "learning_rate": 3.985953621227262e-05, + "loss": 0.9554, + "step": 4064 + }, + { + "epoch": 0.29910324212462636, + "grad_norm": 1.0390625, + "learning_rate": 3.985487831010899e-05, + "loss": 1.1644, + "step": 4065 + }, + { + "epoch": 0.29917682225799036, + "grad_norm": 0.875, + "learning_rate": 3.985021961071209e-05, + "loss": 0.7879, + "step": 4066 + }, + { + "epoch": 0.2992504023913543, + "grad_norm": 1.578125, + "learning_rate": 3.984556011433196e-05, + "loss": 1.0147, + "step": 4067 + }, + { + "epoch": 0.2993239825247183, + "grad_norm": 0.80859375, + "learning_rate": 3.984089982121865e-05, + "loss": 0.8522, + "step": 4068 + }, + { + "epoch": 0.2993975626580823, + "grad_norm": 0.69921875, + "learning_rate": 3.983623873162229e-05, + "loss": 0.751, + "step": 4069 + }, + { + "epoch": 0.2994711427914463, + "grad_norm": 0.74609375, + "learning_rate": 3.983157684579301e-05, + "loss": 0.8797, + "step": 4070 + }, + { + "epoch": 0.2995447229248103, + "grad_norm": 0.75, + "learning_rate": 3.982691416398101e-05, + "loss": 0.7336, + "step": 4071 + }, + { + "epoch": 0.2996183030581743, + "grad_norm": 1.109375, + "learning_rate": 3.9822250686436534e-05, + "loss": 1.3268, + "step": 4072 + }, + { + "epoch": 0.2996918831915383, + "grad_norm": 0.765625, + "learning_rate": 3.981758641340986e-05, + "loss": 0.9972, + "step": 4073 + }, + { + "epoch": 0.2997654633249023, + "grad_norm": 1.1015625, + "learning_rate": 3.981292134515131e-05, + "loss": 1.0735, + "step": 4074 + }, + { + "epoch": 0.2998390434582663, + "grad_norm": 0.91796875, + "learning_rate": 3.980825548191125e-05, + "loss": 1.1444, + "step": 4075 + }, + { + "epoch": 0.29991262359163023, + "grad_norm": 0.9140625, + "learning_rate": 3.9803588823940085e-05, + "loss": 0.849, + "step": 4076 + }, + { + "epoch": 0.29998620372499424, + "grad_norm": 0.83203125, + "learning_rate": 3.979892137148827e-05, + "loss": 0.8298, + "step": 4077 + }, + { + "epoch": 0.30005978385835824, + "grad_norm": 0.8515625, + "learning_rate": 3.97942531248063e-05, + "loss": 1.0243, + "step": 4078 + }, + { + "epoch": 0.30013336399172225, + "grad_norm": 1.0, + "learning_rate": 3.97895840841447e-05, + "loss": 1.2506, + "step": 4079 + }, + { + "epoch": 0.30020694412508625, + "grad_norm": 0.78515625, + "learning_rate": 3.978491424975406e-05, + "loss": 0.9184, + "step": 4080 + }, + { + "epoch": 0.3002805242584502, + "grad_norm": 0.9296875, + "learning_rate": 3.9780243621884997e-05, + "loss": 1.4987, + "step": 4081 + }, + { + "epoch": 0.3003541043918142, + "grad_norm": 0.9453125, + "learning_rate": 3.977557220078817e-05, + "loss": 1.0628, + "step": 4082 + }, + { + "epoch": 0.3004276845251782, + "grad_norm": 0.79296875, + "learning_rate": 3.977089998671429e-05, + "loss": 0.6733, + "step": 4083 + }, + { + "epoch": 0.3005012646585422, + "grad_norm": 0.70703125, + "learning_rate": 3.9766226979914104e-05, + "loss": 0.7021, + "step": 4084 + }, + { + "epoch": 0.30057484479190616, + "grad_norm": 0.67578125, + "learning_rate": 3.976155318063841e-05, + "loss": 0.6074, + "step": 4085 + }, + { + "epoch": 0.30064842492527016, + "grad_norm": 0.74609375, + "learning_rate": 3.9756878589138044e-05, + "loss": 0.7702, + "step": 4086 + }, + { + "epoch": 0.30072200505863417, + "grad_norm": 0.66796875, + "learning_rate": 3.9752203205663865e-05, + "loss": 0.6078, + "step": 4087 + }, + { + "epoch": 0.30079558519199817, + "grad_norm": 0.703125, + "learning_rate": 3.9747527030466805e-05, + "loss": 0.6987, + "step": 4088 + }, + { + "epoch": 0.3008691653253622, + "grad_norm": 0.96875, + "learning_rate": 3.974285006379783e-05, + "loss": 1.3702, + "step": 4089 + }, + { + "epoch": 0.3009427454587261, + "grad_norm": 1.0078125, + "learning_rate": 3.9738172305907936e-05, + "loss": 1.4945, + "step": 4090 + }, + { + "epoch": 0.30101632559209013, + "grad_norm": 1.078125, + "learning_rate": 3.973349375704816e-05, + "loss": 1.2491, + "step": 4091 + }, + { + "epoch": 0.30108990572545413, + "grad_norm": 0.8046875, + "learning_rate": 3.972881441746962e-05, + "loss": 0.9268, + "step": 4092 + }, + { + "epoch": 0.30116348585881814, + "grad_norm": 0.89453125, + "learning_rate": 3.9724134287423406e-05, + "loss": 0.7378, + "step": 4093 + }, + { + "epoch": 0.3012370659921821, + "grad_norm": 1.1171875, + "learning_rate": 3.971945336716074e-05, + "loss": 0.9854, + "step": 4094 + }, + { + "epoch": 0.3013106461255461, + "grad_norm": 0.71875, + "learning_rate": 3.971477165693279e-05, + "loss": 0.8025, + "step": 4095 + }, + { + "epoch": 0.3013842262589101, + "grad_norm": 0.99609375, + "learning_rate": 3.9710089156990856e-05, + "loss": 1.0144, + "step": 4096 + }, + { + "epoch": 0.3014578063922741, + "grad_norm": 0.7578125, + "learning_rate": 3.970540586758621e-05, + "loss": 0.9574, + "step": 4097 + }, + { + "epoch": 0.3015313865256381, + "grad_norm": 0.7109375, + "learning_rate": 3.970072178897021e-05, + "loss": 0.7508, + "step": 4098 + }, + { + "epoch": 0.30160496665900205, + "grad_norm": 0.78125, + "learning_rate": 3.969603692139423e-05, + "loss": 1.0062, + "step": 4099 + }, + { + "epoch": 0.30167854679236605, + "grad_norm": 0.99609375, + "learning_rate": 3.969135126510971e-05, + "loss": 0.9119, + "step": 4100 + }, + { + "epoch": 0.30175212692573006, + "grad_norm": 0.89453125, + "learning_rate": 3.968666482036812e-05, + "loss": 1.28, + "step": 4101 + }, + { + "epoch": 0.30182570705909406, + "grad_norm": 0.9453125, + "learning_rate": 3.968197758742096e-05, + "loss": 1.1114, + "step": 4102 + }, + { + "epoch": 0.301899287192458, + "grad_norm": 0.7734375, + "learning_rate": 3.9677289566519796e-05, + "loss": 0.7357, + "step": 4103 + }, + { + "epoch": 0.301972867325822, + "grad_norm": 0.828125, + "learning_rate": 3.967260075791622e-05, + "loss": 0.5691, + "step": 4104 + }, + { + "epoch": 0.302046447459186, + "grad_norm": 0.85546875, + "learning_rate": 3.966791116186188e-05, + "loss": 0.8774, + "step": 4105 + }, + { + "epoch": 0.30212002759255, + "grad_norm": 0.82421875, + "learning_rate": 3.966322077860846e-05, + "loss": 1.015, + "step": 4106 + }, + { + "epoch": 0.302193607725914, + "grad_norm": 0.84375, + "learning_rate": 3.965852960840766e-05, + "loss": 0.8266, + "step": 4107 + }, + { + "epoch": 0.302267187859278, + "grad_norm": 0.875, + "learning_rate": 3.9653837651511266e-05, + "loss": 1.1217, + "step": 4108 + }, + { + "epoch": 0.302340767992642, + "grad_norm": 0.99609375, + "learning_rate": 3.964914490817108e-05, + "loss": 0.9724, + "step": 4109 + }, + { + "epoch": 0.302414348126006, + "grad_norm": 0.8984375, + "learning_rate": 3.9644451378638956e-05, + "loss": 0.853, + "step": 4110 + }, + { + "epoch": 0.30248792825937, + "grad_norm": 0.8515625, + "learning_rate": 3.963975706316679e-05, + "loss": 0.8187, + "step": 4111 + }, + { + "epoch": 0.30256150839273394, + "grad_norm": 0.73046875, + "learning_rate": 3.963506196200651e-05, + "loss": 0.979, + "step": 4112 + }, + { + "epoch": 0.30263508852609794, + "grad_norm": 0.98828125, + "learning_rate": 3.96303660754101e-05, + "loss": 1.1585, + "step": 4113 + }, + { + "epoch": 0.30270866865946194, + "grad_norm": 0.921875, + "learning_rate": 3.9625669403629574e-05, + "loss": 0.923, + "step": 4114 + }, + { + "epoch": 0.30278224879282595, + "grad_norm": 0.84765625, + "learning_rate": 3.9620971946916996e-05, + "loss": 0.8744, + "step": 4115 + }, + { + "epoch": 0.30285582892618995, + "grad_norm": 0.99609375, + "learning_rate": 3.961627370552447e-05, + "loss": 1.3116, + "step": 4116 + }, + { + "epoch": 0.3029294090595539, + "grad_norm": 0.69140625, + "learning_rate": 3.961157467970413e-05, + "loss": 0.8754, + "step": 4117 + }, + { + "epoch": 0.3030029891929179, + "grad_norm": 0.85546875, + "learning_rate": 3.9606874869708186e-05, + "loss": 0.8331, + "step": 4118 + }, + { + "epoch": 0.3030765693262819, + "grad_norm": 1.1875, + "learning_rate": 3.960217427578885e-05, + "loss": 1.3008, + "step": 4119 + }, + { + "epoch": 0.3031501494596459, + "grad_norm": 0.7421875, + "learning_rate": 3.9597472898198404e-05, + "loss": 0.9466, + "step": 4120 + }, + { + "epoch": 0.30322372959300986, + "grad_norm": 0.8671875, + "learning_rate": 3.9592770737189155e-05, + "loss": 0.8472, + "step": 4121 + }, + { + "epoch": 0.30329730972637386, + "grad_norm": 0.91015625, + "learning_rate": 3.9588067793013466e-05, + "loss": 0.9979, + "step": 4122 + }, + { + "epoch": 0.30337088985973787, + "grad_norm": 0.859375, + "learning_rate": 3.9583364065923727e-05, + "loss": 1.0135, + "step": 4123 + }, + { + "epoch": 0.3034444699931019, + "grad_norm": 0.9375, + "learning_rate": 3.9578659556172386e-05, + "loss": 1.0402, + "step": 4124 + }, + { + "epoch": 0.3035180501264659, + "grad_norm": 0.65234375, + "learning_rate": 3.957395426401192e-05, + "loss": 0.8235, + "step": 4125 + }, + { + "epoch": 0.3035916302598298, + "grad_norm": 1.0859375, + "learning_rate": 3.9569248189694863e-05, + "loss": 1.3151, + "step": 4126 + }, + { + "epoch": 0.30366521039319383, + "grad_norm": 1.0390625, + "learning_rate": 3.956454133347376e-05, + "loss": 1.1963, + "step": 4127 + }, + { + "epoch": 0.30373879052655783, + "grad_norm": 0.8828125, + "learning_rate": 3.955983369560124e-05, + "loss": 1.0767, + "step": 4128 + }, + { + "epoch": 0.30381237065992184, + "grad_norm": 0.7578125, + "learning_rate": 3.955512527632994e-05, + "loss": 0.7796, + "step": 4129 + }, + { + "epoch": 0.3038859507932858, + "grad_norm": 0.7421875, + "learning_rate": 3.955041607591256e-05, + "loss": 1.0109, + "step": 4130 + }, + { + "epoch": 0.3039595309266498, + "grad_norm": 0.9921875, + "learning_rate": 3.954570609460183e-05, + "loss": 1.0324, + "step": 4131 + }, + { + "epoch": 0.3040331110600138, + "grad_norm": 0.91015625, + "learning_rate": 3.954099533265053e-05, + "loss": 1.2995, + "step": 4132 + }, + { + "epoch": 0.3041066911933778, + "grad_norm": 0.77734375, + "learning_rate": 3.953628379031147e-05, + "loss": 0.7554, + "step": 4133 + }, + { + "epoch": 0.3041802713267418, + "grad_norm": 0.87109375, + "learning_rate": 3.953157146783751e-05, + "loss": 0.8913, + "step": 4134 + }, + { + "epoch": 0.30425385146010575, + "grad_norm": 0.80859375, + "learning_rate": 3.952685836548157e-05, + "loss": 0.813, + "step": 4135 + }, + { + "epoch": 0.30432743159346975, + "grad_norm": 0.90234375, + "learning_rate": 3.952214448349657e-05, + "loss": 1.051, + "step": 4136 + }, + { + "epoch": 0.30440101172683376, + "grad_norm": 5.65625, + "learning_rate": 3.951742982213551e-05, + "loss": 0.946, + "step": 4137 + }, + { + "epoch": 0.30447459186019776, + "grad_norm": 1.078125, + "learning_rate": 3.9512714381651406e-05, + "loss": 1.1763, + "step": 4138 + }, + { + "epoch": 0.3045481719935617, + "grad_norm": 0.65234375, + "learning_rate": 3.950799816229733e-05, + "loss": 0.7493, + "step": 4139 + }, + { + "epoch": 0.3046217521269257, + "grad_norm": 0.984375, + "learning_rate": 3.95032811643264e-05, + "loss": 1.0026, + "step": 4140 + }, + { + "epoch": 0.3046953322602897, + "grad_norm": 0.8203125, + "learning_rate": 3.949856338799175e-05, + "loss": 1.0317, + "step": 4141 + }, + { + "epoch": 0.3047689123936537, + "grad_norm": 4.25, + "learning_rate": 3.94938448335466e-05, + "loss": 0.8519, + "step": 4142 + }, + { + "epoch": 0.3048424925270177, + "grad_norm": 0.81640625, + "learning_rate": 3.948912550124417e-05, + "loss": 1.1186, + "step": 4143 + }, + { + "epoch": 0.3049160726603817, + "grad_norm": 0.6875, + "learning_rate": 3.9484405391337744e-05, + "loss": 0.6545, + "step": 4144 + }, + { + "epoch": 0.3049896527937457, + "grad_norm": 0.9609375, + "learning_rate": 3.947968450408063e-05, + "loss": 1.0666, + "step": 4145 + }, + { + "epoch": 0.3050632329271097, + "grad_norm": 1.1796875, + "learning_rate": 3.947496283972619e-05, + "loss": 0.9816, + "step": 4146 + }, + { + "epoch": 0.3051368130604737, + "grad_norm": 0.84765625, + "learning_rate": 3.9470240398527846e-05, + "loss": 1.1742, + "step": 4147 + }, + { + "epoch": 0.30521039319383764, + "grad_norm": 0.7265625, + "learning_rate": 3.946551718073903e-05, + "loss": 0.8515, + "step": 4148 + }, + { + "epoch": 0.30528397332720164, + "grad_norm": 0.9609375, + "learning_rate": 3.946079318661323e-05, + "loss": 1.1281, + "step": 4149 + }, + { + "epoch": 0.30535755346056564, + "grad_norm": 0.6875, + "learning_rate": 3.945606841640397e-05, + "loss": 0.5859, + "step": 4150 + }, + { + "epoch": 0.30543113359392965, + "grad_norm": 0.83984375, + "learning_rate": 3.9451342870364816e-05, + "loss": 0.6277, + "step": 4151 + }, + { + "epoch": 0.30550471372729365, + "grad_norm": 1.1953125, + "learning_rate": 3.944661654874939e-05, + "loss": 1.4611, + "step": 4152 + }, + { + "epoch": 0.3055782938606576, + "grad_norm": 1.0234375, + "learning_rate": 3.944188945181134e-05, + "loss": 1.3086, + "step": 4153 + }, + { + "epoch": 0.3056518739940216, + "grad_norm": 0.93359375, + "learning_rate": 3.9437161579804355e-05, + "loss": 0.9333, + "step": 4154 + }, + { + "epoch": 0.3057254541273856, + "grad_norm": 0.78515625, + "learning_rate": 3.943243293298218e-05, + "loss": 0.8708, + "step": 4155 + }, + { + "epoch": 0.3057990342607496, + "grad_norm": 0.796875, + "learning_rate": 3.942770351159859e-05, + "loss": 0.7147, + "step": 4156 + }, + { + "epoch": 0.30587261439411356, + "grad_norm": 0.88671875, + "learning_rate": 3.942297331590739e-05, + "loss": 0.7497, + "step": 4157 + }, + { + "epoch": 0.30594619452747757, + "grad_norm": 1.0859375, + "learning_rate": 3.9418242346162474e-05, + "loss": 1.3933, + "step": 4158 + }, + { + "epoch": 0.30601977466084157, + "grad_norm": 0.859375, + "learning_rate": 3.9413510602617706e-05, + "loss": 1.102, + "step": 4159 + }, + { + "epoch": 0.3060933547942056, + "grad_norm": 1.0546875, + "learning_rate": 3.940877808552706e-05, + "loss": 1.2623, + "step": 4160 + }, + { + "epoch": 0.3061669349275696, + "grad_norm": 0.9140625, + "learning_rate": 3.94040447951445e-05, + "loss": 1.2528, + "step": 4161 + }, + { + "epoch": 0.3062405150609335, + "grad_norm": 0.8203125, + "learning_rate": 3.939931073172406e-05, + "loss": 0.9589, + "step": 4162 + }, + { + "epoch": 0.30631409519429753, + "grad_norm": 1.015625, + "learning_rate": 3.939457589551982e-05, + "loss": 1.0826, + "step": 4163 + }, + { + "epoch": 0.30638767532766154, + "grad_norm": 0.95703125, + "learning_rate": 3.938984028678587e-05, + "loss": 0.7509, + "step": 4164 + }, + { + "epoch": 0.30646125546102554, + "grad_norm": 0.87890625, + "learning_rate": 3.9385103905776374e-05, + "loss": 0.8302, + "step": 4165 + }, + { + "epoch": 0.3065348355943895, + "grad_norm": 0.86328125, + "learning_rate": 3.938036675274552e-05, + "loss": 0.9071, + "step": 4166 + }, + { + "epoch": 0.3066084157277535, + "grad_norm": 0.6484375, + "learning_rate": 3.937562882794754e-05, + "loss": 0.6937, + "step": 4167 + }, + { + "epoch": 0.3066819958611175, + "grad_norm": 0.98828125, + "learning_rate": 3.937089013163672e-05, + "loss": 0.9938, + "step": 4168 + }, + { + "epoch": 0.3067555759944815, + "grad_norm": 0.671875, + "learning_rate": 3.936615066406737e-05, + "loss": 0.6215, + "step": 4169 + }, + { + "epoch": 0.3068291561278455, + "grad_norm": 3.578125, + "learning_rate": 3.936141042549384e-05, + "loss": 0.8972, + "step": 4170 + }, + { + "epoch": 0.30690273626120945, + "grad_norm": 1.0546875, + "learning_rate": 3.935666941617054e-05, + "loss": 0.8855, + "step": 4171 + }, + { + "epoch": 0.30697631639457346, + "grad_norm": 1.59375, + "learning_rate": 3.9351927636351905e-05, + "loss": 1.0294, + "step": 4172 + }, + { + "epoch": 0.30704989652793746, + "grad_norm": 0.94921875, + "learning_rate": 3.9347185086292424e-05, + "loss": 1.1058, + "step": 4173 + }, + { + "epoch": 0.30712347666130146, + "grad_norm": 1.0, + "learning_rate": 3.934244176624662e-05, + "loss": 0.7408, + "step": 4174 + }, + { + "epoch": 0.3071970567946654, + "grad_norm": 0.86328125, + "learning_rate": 3.9337697676469046e-05, + "loss": 1.0399, + "step": 4175 + }, + { + "epoch": 0.3072706369280294, + "grad_norm": 0.83984375, + "learning_rate": 3.933295281721433e-05, + "loss": 0.7997, + "step": 4176 + }, + { + "epoch": 0.3073442170613934, + "grad_norm": 0.74609375, + "learning_rate": 3.93282071887371e-05, + "loss": 0.8517, + "step": 4177 + }, + { + "epoch": 0.3074177971947574, + "grad_norm": 0.98046875, + "learning_rate": 3.9323460791292055e-05, + "loss": 1.04, + "step": 4178 + }, + { + "epoch": 0.30749137732812143, + "grad_norm": 0.91796875, + "learning_rate": 3.9318713625133926e-05, + "loss": 1.0115, + "step": 4179 + }, + { + "epoch": 0.3075649574614854, + "grad_norm": 0.85546875, + "learning_rate": 3.9313965690517475e-05, + "loss": 0.7472, + "step": 4180 + }, + { + "epoch": 0.3076385375948494, + "grad_norm": 0.8671875, + "learning_rate": 3.930921698769752e-05, + "loss": 0.9818, + "step": 4181 + }, + { + "epoch": 0.3077121177282134, + "grad_norm": 0.93359375, + "learning_rate": 3.930446751692892e-05, + "loss": 1.0383, + "step": 4182 + }, + { + "epoch": 0.3077856978615774, + "grad_norm": 1.046875, + "learning_rate": 3.9299717278466566e-05, + "loss": 1.16, + "step": 4183 + }, + { + "epoch": 0.30785927799494134, + "grad_norm": 0.8671875, + "learning_rate": 3.929496627256539e-05, + "loss": 0.9647, + "step": 4184 + }, + { + "epoch": 0.30793285812830534, + "grad_norm": 0.87109375, + "learning_rate": 3.929021449948037e-05, + "loss": 0.8962, + "step": 4185 + }, + { + "epoch": 0.30800643826166935, + "grad_norm": 0.76171875, + "learning_rate": 3.928546195946654e-05, + "loss": 1.3702, + "step": 4186 + }, + { + "epoch": 0.30808001839503335, + "grad_norm": 0.9140625, + "learning_rate": 3.928070865277894e-05, + "loss": 0.9198, + "step": 4187 + }, + { + "epoch": 0.30815359852839735, + "grad_norm": 0.82421875, + "learning_rate": 3.927595457967268e-05, + "loss": 0.8546, + "step": 4188 + }, + { + "epoch": 0.3082271786617613, + "grad_norm": 0.9921875, + "learning_rate": 3.92711997404029e-05, + "loss": 1.02, + "step": 4189 + }, + { + "epoch": 0.3083007587951253, + "grad_norm": 0.640625, + "learning_rate": 3.9266444135224786e-05, + "loss": 0.5583, + "step": 4190 + }, + { + "epoch": 0.3083743389284893, + "grad_norm": 0.7734375, + "learning_rate": 3.926168776439356e-05, + "loss": 0.9121, + "step": 4191 + }, + { + "epoch": 0.3084479190618533, + "grad_norm": 0.89453125, + "learning_rate": 3.925693062816449e-05, + "loss": 0.7983, + "step": 4192 + }, + { + "epoch": 0.30852149919521726, + "grad_norm": 0.91015625, + "learning_rate": 3.925217272679288e-05, + "loss": 1.0658, + "step": 4193 + }, + { + "epoch": 0.30859507932858127, + "grad_norm": 0.90625, + "learning_rate": 3.924741406053407e-05, + "loss": 1.1073, + "step": 4194 + }, + { + "epoch": 0.30866865946194527, + "grad_norm": 0.96484375, + "learning_rate": 3.924265462964347e-05, + "loss": 1.1864, + "step": 4195 + }, + { + "epoch": 0.3087422395953093, + "grad_norm": 0.96484375, + "learning_rate": 3.923789443437649e-05, + "loss": 0.9088, + "step": 4196 + }, + { + "epoch": 0.3088158197286733, + "grad_norm": 0.765625, + "learning_rate": 3.923313347498861e-05, + "loss": 0.7344, + "step": 4197 + }, + { + "epoch": 0.30888939986203723, + "grad_norm": 0.7109375, + "learning_rate": 3.9228371751735336e-05, + "loss": 0.8802, + "step": 4198 + }, + { + "epoch": 0.30896297999540123, + "grad_norm": 0.7890625, + "learning_rate": 3.922360926487223e-05, + "loss": 0.6581, + "step": 4199 + }, + { + "epoch": 0.30903656012876524, + "grad_norm": 0.95703125, + "learning_rate": 3.921884601465487e-05, + "loss": 1.1206, + "step": 4200 + }, + { + "epoch": 0.30911014026212924, + "grad_norm": 1.0234375, + "learning_rate": 3.921408200133891e-05, + "loss": 1.3348, + "step": 4201 + }, + { + "epoch": 0.30918372039549324, + "grad_norm": 0.890625, + "learning_rate": 3.9209317225180006e-05, + "loss": 0.9765, + "step": 4202 + }, + { + "epoch": 0.3092573005288572, + "grad_norm": 0.87890625, + "learning_rate": 3.920455168643389e-05, + "loss": 1.094, + "step": 4203 + }, + { + "epoch": 0.3093308806622212, + "grad_norm": 0.703125, + "learning_rate": 3.9199785385356314e-05, + "loss": 0.9887, + "step": 4204 + }, + { + "epoch": 0.3094044607955852, + "grad_norm": 0.81640625, + "learning_rate": 3.919501832220307e-05, + "loss": 1.0445, + "step": 4205 + }, + { + "epoch": 0.3094780409289492, + "grad_norm": 0.91015625, + "learning_rate": 3.919025049723001e-05, + "loss": 0.7977, + "step": 4206 + }, + { + "epoch": 0.30955162106231315, + "grad_norm": 1.046875, + "learning_rate": 3.9185481910693004e-05, + "loss": 1.2536, + "step": 4207 + }, + { + "epoch": 0.30962520119567716, + "grad_norm": 0.98046875, + "learning_rate": 3.9180712562847974e-05, + "loss": 1.1189, + "step": 4208 + }, + { + "epoch": 0.30969878132904116, + "grad_norm": 1.0703125, + "learning_rate": 3.917594245395089e-05, + "loss": 1.6528, + "step": 4209 + }, + { + "epoch": 0.30977236146240517, + "grad_norm": 1.0, + "learning_rate": 3.917117158425774e-05, + "loss": 1.353, + "step": 4210 + }, + { + "epoch": 0.30984594159576917, + "grad_norm": 1.03125, + "learning_rate": 3.916639995402459e-05, + "loss": 0.9726, + "step": 4211 + }, + { + "epoch": 0.3099195217291331, + "grad_norm": 0.85546875, + "learning_rate": 3.9161627563507494e-05, + "loss": 0.8628, + "step": 4212 + }, + { + "epoch": 0.3099931018624971, + "grad_norm": 0.83203125, + "learning_rate": 3.915685441296261e-05, + "loss": 0.8632, + "step": 4213 + }, + { + "epoch": 0.3100666819958611, + "grad_norm": 0.83203125, + "learning_rate": 3.915208050264608e-05, + "loss": 0.9593, + "step": 4214 + }, + { + "epoch": 0.31014026212922513, + "grad_norm": 0.69140625, + "learning_rate": 3.914730583281412e-05, + "loss": 0.7832, + "step": 4215 + }, + { + "epoch": 0.3102138422625891, + "grad_norm": 0.8671875, + "learning_rate": 3.9142530403722976e-05, + "loss": 1.0504, + "step": 4216 + }, + { + "epoch": 0.3102874223959531, + "grad_norm": 1.0625, + "learning_rate": 3.9137754215628944e-05, + "loss": 0.9269, + "step": 4217 + }, + { + "epoch": 0.3103610025293171, + "grad_norm": 0.859375, + "learning_rate": 3.913297726878834e-05, + "loss": 0.8606, + "step": 4218 + }, + { + "epoch": 0.3104345826626811, + "grad_norm": 0.71484375, + "learning_rate": 3.912819956345754e-05, + "loss": 0.852, + "step": 4219 + }, + { + "epoch": 0.3105081627960451, + "grad_norm": 0.859375, + "learning_rate": 3.912342109989296e-05, + "loss": 0.9616, + "step": 4220 + }, + { + "epoch": 0.31058174292940904, + "grad_norm": 1.0078125, + "learning_rate": 3.911864187835103e-05, + "loss": 1.1499, + "step": 4221 + }, + { + "epoch": 0.31065532306277305, + "grad_norm": 0.9375, + "learning_rate": 3.911386189908826e-05, + "loss": 0.7484, + "step": 4222 + }, + { + "epoch": 0.31072890319613705, + "grad_norm": 0.796875, + "learning_rate": 3.910908116236118e-05, + "loss": 0.7645, + "step": 4223 + }, + { + "epoch": 0.31080248332950106, + "grad_norm": 0.95703125, + "learning_rate": 3.9104299668426375e-05, + "loss": 0.8799, + "step": 4224 + }, + { + "epoch": 0.310876063462865, + "grad_norm": 1.234375, + "learning_rate": 3.909951741754043e-05, + "loss": 1.3231, + "step": 4225 + }, + { + "epoch": 0.310949643596229, + "grad_norm": 0.83203125, + "learning_rate": 3.9094734409960026e-05, + "loss": 0.8432, + "step": 4226 + }, + { + "epoch": 0.311023223729593, + "grad_norm": 1.3828125, + "learning_rate": 3.908995064594185e-05, + "loss": 1.1038, + "step": 4227 + }, + { + "epoch": 0.311096803862957, + "grad_norm": 0.9609375, + "learning_rate": 3.908516612574262e-05, + "loss": 0.8895, + "step": 4228 + }, + { + "epoch": 0.311170383996321, + "grad_norm": 0.87109375, + "learning_rate": 3.908038084961914e-05, + "loss": 0.944, + "step": 4229 + }, + { + "epoch": 0.31124396412968497, + "grad_norm": 0.74609375, + "learning_rate": 3.907559481782821e-05, + "loss": 0.7459, + "step": 4230 + }, + { + "epoch": 0.311317544263049, + "grad_norm": 0.80078125, + "learning_rate": 3.907080803062669e-05, + "loss": 0.8815, + "step": 4231 + }, + { + "epoch": 0.311391124396413, + "grad_norm": 0.9765625, + "learning_rate": 3.906602048827148e-05, + "loss": 1.0102, + "step": 4232 + }, + { + "epoch": 0.311464704529777, + "grad_norm": 0.703125, + "learning_rate": 3.906123219101952e-05, + "loss": 0.9443, + "step": 4233 + }, + { + "epoch": 0.31153828466314093, + "grad_norm": 0.84765625, + "learning_rate": 3.905644313912778e-05, + "loss": 0.8356, + "step": 4234 + }, + { + "epoch": 0.31161186479650493, + "grad_norm": 1.015625, + "learning_rate": 3.905165333285329e-05, + "loss": 1.1552, + "step": 4235 + }, + { + "epoch": 0.31168544492986894, + "grad_norm": 1.0234375, + "learning_rate": 3.904686277245311e-05, + "loss": 1.1431, + "step": 4236 + }, + { + "epoch": 0.31175902506323294, + "grad_norm": 0.81640625, + "learning_rate": 3.9042071458184323e-05, + "loss": 0.6391, + "step": 4237 + }, + { + "epoch": 0.31183260519659695, + "grad_norm": 0.82421875, + "learning_rate": 3.903727939030409e-05, + "loss": 0.7324, + "step": 4238 + }, + { + "epoch": 0.3119061853299609, + "grad_norm": 0.6484375, + "learning_rate": 3.903248656906958e-05, + "loss": 0.7205, + "step": 4239 + }, + { + "epoch": 0.3119797654633249, + "grad_norm": 0.94921875, + "learning_rate": 3.902769299473803e-05, + "loss": 0.7757, + "step": 4240 + }, + { + "epoch": 0.3120533455966889, + "grad_norm": 0.68359375, + "learning_rate": 3.9022898667566686e-05, + "loss": 0.7457, + "step": 4241 + }, + { + "epoch": 0.3121269257300529, + "grad_norm": 0.8671875, + "learning_rate": 3.901810358781286e-05, + "loss": 1.1793, + "step": 4242 + }, + { + "epoch": 0.31220050586341686, + "grad_norm": 0.9921875, + "learning_rate": 3.901330775573389e-05, + "loss": 0.9182, + "step": 4243 + }, + { + "epoch": 0.31227408599678086, + "grad_norm": 0.93359375, + "learning_rate": 3.9008511171587145e-05, + "loss": 1.2152, + "step": 4244 + }, + { + "epoch": 0.31234766613014486, + "grad_norm": 0.94140625, + "learning_rate": 3.900371383563008e-05, + "loss": 0.9771, + "step": 4245 + }, + { + "epoch": 0.31242124626350887, + "grad_norm": 0.8828125, + "learning_rate": 3.899891574812014e-05, + "loss": 0.7187, + "step": 4246 + }, + { + "epoch": 0.31249482639687287, + "grad_norm": 0.921875, + "learning_rate": 3.899411690931482e-05, + "loss": 1.0377, + "step": 4247 + }, + { + "epoch": 0.3125684065302368, + "grad_norm": 0.8125, + "learning_rate": 3.89893173194717e-05, + "loss": 1.0612, + "step": 4248 + }, + { + "epoch": 0.3126419866636008, + "grad_norm": 1.78125, + "learning_rate": 3.8984516978848326e-05, + "loss": 1.1835, + "step": 4249 + }, + { + "epoch": 0.31271556679696483, + "grad_norm": 0.8828125, + "learning_rate": 3.8979715887702336e-05, + "loss": 1.229, + "step": 4250 + }, + { + "epoch": 0.31278914693032883, + "grad_norm": 0.96875, + "learning_rate": 3.89749140462914e-05, + "loss": 0.6719, + "step": 4251 + }, + { + "epoch": 0.3128627270636928, + "grad_norm": 0.83984375, + "learning_rate": 3.8970111454873225e-05, + "loss": 0.9952, + "step": 4252 + }, + { + "epoch": 0.3129363071970568, + "grad_norm": 0.9296875, + "learning_rate": 3.8965308113705553e-05, + "loss": 0.7742, + "step": 4253 + }, + { + "epoch": 0.3130098873304208, + "grad_norm": 0.74609375, + "learning_rate": 3.896050402304618e-05, + "loss": 0.9227, + "step": 4254 + }, + { + "epoch": 0.3130834674637848, + "grad_norm": 0.80859375, + "learning_rate": 3.895569918315292e-05, + "loss": 1.1841, + "step": 4255 + }, + { + "epoch": 0.3131570475971488, + "grad_norm": 0.890625, + "learning_rate": 3.8950893594283636e-05, + "loss": 1.2208, + "step": 4256 + }, + { + "epoch": 0.31323062773051275, + "grad_norm": 0.7578125, + "learning_rate": 3.894608725669624e-05, + "loss": 0.7798, + "step": 4257 + }, + { + "epoch": 0.31330420786387675, + "grad_norm": 0.78125, + "learning_rate": 3.894128017064869e-05, + "loss": 0.8113, + "step": 4258 + }, + { + "epoch": 0.31337778799724075, + "grad_norm": 0.8515625, + "learning_rate": 3.8936472336398965e-05, + "loss": 1.0558, + "step": 4259 + }, + { + "epoch": 0.31345136813060476, + "grad_norm": 0.8359375, + "learning_rate": 3.8931663754205086e-05, + "loss": 0.8551, + "step": 4260 + }, + { + "epoch": 0.3135249482639687, + "grad_norm": 1.4375, + "learning_rate": 3.8926854424325135e-05, + "loss": 0.9985, + "step": 4261 + }, + { + "epoch": 0.3135985283973327, + "grad_norm": 0.72265625, + "learning_rate": 3.89220443470172e-05, + "loss": 0.6664, + "step": 4262 + }, + { + "epoch": 0.3136721085306967, + "grad_norm": 0.76171875, + "learning_rate": 3.891723352253944e-05, + "loss": 0.9043, + "step": 4263 + }, + { + "epoch": 0.3137456886640607, + "grad_norm": 0.87890625, + "learning_rate": 3.8912421951150055e-05, + "loss": 1.2252, + "step": 4264 + }, + { + "epoch": 0.3138192687974247, + "grad_norm": 0.73828125, + "learning_rate": 3.890760963310725e-05, + "loss": 0.7947, + "step": 4265 + }, + { + "epoch": 0.31389284893078867, + "grad_norm": 1.0, + "learning_rate": 3.890279656866931e-05, + "loss": 0.916, + "step": 4266 + }, + { + "epoch": 0.3139664290641527, + "grad_norm": 0.71484375, + "learning_rate": 3.889798275809453e-05, + "loss": 0.6326, + "step": 4267 + }, + { + "epoch": 0.3140400091975167, + "grad_norm": 0.7578125, + "learning_rate": 3.889316820164127e-05, + "loss": 0.8687, + "step": 4268 + }, + { + "epoch": 0.3141135893308807, + "grad_norm": 0.92578125, + "learning_rate": 3.888835289956792e-05, + "loss": 0.9091, + "step": 4269 + }, + { + "epoch": 0.31418716946424463, + "grad_norm": 0.890625, + "learning_rate": 3.888353685213289e-05, + "loss": 1.1087, + "step": 4270 + }, + { + "epoch": 0.31426074959760864, + "grad_norm": 0.8359375, + "learning_rate": 3.887872005959466e-05, + "loss": 1.0186, + "step": 4271 + }, + { + "epoch": 0.31433432973097264, + "grad_norm": 0.76171875, + "learning_rate": 3.887390252221174e-05, + "loss": 0.7486, + "step": 4272 + }, + { + "epoch": 0.31440790986433664, + "grad_norm": 0.8828125, + "learning_rate": 3.886908424024268e-05, + "loss": 1.0472, + "step": 4273 + }, + { + "epoch": 0.31448148999770065, + "grad_norm": 0.71875, + "learning_rate": 3.886426521394606e-05, + "loss": 0.5371, + "step": 4274 + }, + { + "epoch": 0.3145550701310646, + "grad_norm": 0.96875, + "learning_rate": 3.885944544358051e-05, + "loss": 0.7701, + "step": 4275 + }, + { + "epoch": 0.3146286502644286, + "grad_norm": 0.8984375, + "learning_rate": 3.8854624929404704e-05, + "loss": 1.0232, + "step": 4276 + }, + { + "epoch": 0.3147022303977926, + "grad_norm": 0.86328125, + "learning_rate": 3.8849803671677344e-05, + "loss": 1.0513, + "step": 4277 + }, + { + "epoch": 0.3147758105311566, + "grad_norm": 0.95703125, + "learning_rate": 3.8844981670657174e-05, + "loss": 0.8778, + "step": 4278 + }, + { + "epoch": 0.31484939066452056, + "grad_norm": 0.89453125, + "learning_rate": 3.8840158926603e-05, + "loss": 0.7227, + "step": 4279 + }, + { + "epoch": 0.31492297079788456, + "grad_norm": 1.3046875, + "learning_rate": 3.8835335439773624e-05, + "loss": 1.1575, + "step": 4280 + }, + { + "epoch": 0.31499655093124856, + "grad_norm": 0.92578125, + "learning_rate": 3.883051121042793e-05, + "loss": 1.0151, + "step": 4281 + }, + { + "epoch": 0.31507013106461257, + "grad_norm": 0.6875, + "learning_rate": 3.882568623882482e-05, + "loss": 0.4754, + "step": 4282 + }, + { + "epoch": 0.3151437111979766, + "grad_norm": 0.90625, + "learning_rate": 3.882086052522325e-05, + "loss": 1.3109, + "step": 4283 + }, + { + "epoch": 0.3152172913313405, + "grad_norm": 0.7265625, + "learning_rate": 3.881603406988219e-05, + "loss": 0.7013, + "step": 4284 + }, + { + "epoch": 0.3152908714647045, + "grad_norm": 0.890625, + "learning_rate": 3.881120687306068e-05, + "loss": 1.0863, + "step": 4285 + }, + { + "epoch": 0.31536445159806853, + "grad_norm": 0.99609375, + "learning_rate": 3.8806378935017785e-05, + "loss": 0.7815, + "step": 4286 + }, + { + "epoch": 0.31543803173143253, + "grad_norm": 0.94140625, + "learning_rate": 3.8801550256012606e-05, + "loss": 0.9918, + "step": 4287 + }, + { + "epoch": 0.3155116118647965, + "grad_norm": 0.734375, + "learning_rate": 3.87967208363043e-05, + "loss": 0.7851, + "step": 4288 + }, + { + "epoch": 0.3155851919981605, + "grad_norm": 1.8359375, + "learning_rate": 3.8791890676152036e-05, + "loss": 1.0747, + "step": 4289 + }, + { + "epoch": 0.3156587721315245, + "grad_norm": 0.78515625, + "learning_rate": 3.8787059775815055e-05, + "loss": 1.0735, + "step": 4290 + }, + { + "epoch": 0.3157323522648885, + "grad_norm": 0.734375, + "learning_rate": 3.878222813555261e-05, + "loss": 0.8061, + "step": 4291 + }, + { + "epoch": 0.3158059323982525, + "grad_norm": 1.109375, + "learning_rate": 3.877739575562401e-05, + "loss": 1.1215, + "step": 4292 + }, + { + "epoch": 0.31587951253161645, + "grad_norm": 0.79296875, + "learning_rate": 3.8772562636288614e-05, + "loss": 0.868, + "step": 4293 + }, + { + "epoch": 0.31595309266498045, + "grad_norm": 0.94921875, + "learning_rate": 3.876772877780578e-05, + "loss": 1.1162, + "step": 4294 + }, + { + "epoch": 0.31602667279834445, + "grad_norm": 0.703125, + "learning_rate": 3.8762894180434956e-05, + "loss": 0.6902, + "step": 4295 + }, + { + "epoch": 0.31610025293170846, + "grad_norm": 0.8203125, + "learning_rate": 3.875805884443559e-05, + "loss": 0.8879, + "step": 4296 + }, + { + "epoch": 0.3161738330650724, + "grad_norm": 0.76171875, + "learning_rate": 3.875322277006719e-05, + "loss": 0.8738, + "step": 4297 + }, + { + "epoch": 0.3162474131984364, + "grad_norm": 0.796875, + "learning_rate": 3.874838595758931e-05, + "loss": 0.9271, + "step": 4298 + }, + { + "epoch": 0.3163209933318004, + "grad_norm": 0.78125, + "learning_rate": 3.874354840726151e-05, + "loss": 0.8894, + "step": 4299 + }, + { + "epoch": 0.3163945734651644, + "grad_norm": 0.7734375, + "learning_rate": 3.8738710119343435e-05, + "loss": 0.9597, + "step": 4300 + }, + { + "epoch": 0.3164681535985284, + "grad_norm": 1.125, + "learning_rate": 3.873387109409473e-05, + "loss": 0.971, + "step": 4301 + }, + { + "epoch": 0.31654173373189237, + "grad_norm": 0.875, + "learning_rate": 3.872903133177511e-05, + "loss": 0.867, + "step": 4302 + }, + { + "epoch": 0.3166153138652564, + "grad_norm": 1.1015625, + "learning_rate": 3.8724190832644294e-05, + "loss": 1.3698, + "step": 4303 + }, + { + "epoch": 0.3166888939986204, + "grad_norm": 1.0390625, + "learning_rate": 3.87193495969621e-05, + "loss": 1.3375, + "step": 4304 + }, + { + "epoch": 0.3167624741319844, + "grad_norm": 1.046875, + "learning_rate": 3.8714507624988304e-05, + "loss": 1.3526, + "step": 4305 + }, + { + "epoch": 0.31683605426534833, + "grad_norm": 1.09375, + "learning_rate": 3.87096649169828e-05, + "loss": 1.6838, + "step": 4306 + }, + { + "epoch": 0.31690963439871234, + "grad_norm": 0.6328125, + "learning_rate": 3.8704821473205466e-05, + "loss": 0.5086, + "step": 4307 + }, + { + "epoch": 0.31698321453207634, + "grad_norm": 0.83984375, + "learning_rate": 3.869997729391625e-05, + "loss": 1.0475, + "step": 4308 + }, + { + "epoch": 0.31705679466544034, + "grad_norm": 0.87890625, + "learning_rate": 3.869513237937513e-05, + "loss": 0.8904, + "step": 4309 + }, + { + "epoch": 0.31713037479880435, + "grad_norm": 0.67578125, + "learning_rate": 3.869028672984212e-05, + "loss": 0.7615, + "step": 4310 + }, + { + "epoch": 0.3172039549321683, + "grad_norm": 0.96875, + "learning_rate": 3.868544034557728e-05, + "loss": 1.0894, + "step": 4311 + }, + { + "epoch": 0.3172775350655323, + "grad_norm": 0.91015625, + "learning_rate": 3.86805932268407e-05, + "loss": 0.9805, + "step": 4312 + }, + { + "epoch": 0.3173511151988963, + "grad_norm": 0.984375, + "learning_rate": 3.867574537389253e-05, + "loss": 0.9955, + "step": 4313 + }, + { + "epoch": 0.3174246953322603, + "grad_norm": 0.8828125, + "learning_rate": 3.8670896786992926e-05, + "loss": 1.1447, + "step": 4314 + }, + { + "epoch": 0.31749827546562426, + "grad_norm": 0.85546875, + "learning_rate": 3.8666047466402125e-05, + "loss": 1.008, + "step": 4315 + }, + { + "epoch": 0.31757185559898826, + "grad_norm": 0.87109375, + "learning_rate": 3.8661197412380356e-05, + "loss": 1.0325, + "step": 4316 + }, + { + "epoch": 0.31764543573235227, + "grad_norm": 0.92578125, + "learning_rate": 3.865634662518792e-05, + "loss": 0.8053, + "step": 4317 + }, + { + "epoch": 0.31771901586571627, + "grad_norm": 0.84765625, + "learning_rate": 3.8651495105085164e-05, + "loss": 1.0136, + "step": 4318 + }, + { + "epoch": 0.3177925959990803, + "grad_norm": 0.91015625, + "learning_rate": 3.864664285233245e-05, + "loss": 0.8909, + "step": 4319 + }, + { + "epoch": 0.3178661761324442, + "grad_norm": 0.79296875, + "learning_rate": 3.864178986719019e-05, + "loss": 0.923, + "step": 4320 + }, + { + "epoch": 0.3179397562658082, + "grad_norm": 0.9375, + "learning_rate": 3.863693614991883e-05, + "loss": 1.127, + "step": 4321 + }, + { + "epoch": 0.31801333639917223, + "grad_norm": 0.78125, + "learning_rate": 3.8632081700778865e-05, + "loss": 0.8364, + "step": 4322 + }, + { + "epoch": 0.31808691653253623, + "grad_norm": 1.109375, + "learning_rate": 3.8627226520030816e-05, + "loss": 1.1214, + "step": 4323 + }, + { + "epoch": 0.3181604966659002, + "grad_norm": 0.80859375, + "learning_rate": 3.862237060793526e-05, + "loss": 0.765, + "step": 4324 + }, + { + "epoch": 0.3182340767992642, + "grad_norm": 0.78125, + "learning_rate": 3.861751396475281e-05, + "loss": 0.951, + "step": 4325 + }, + { + "epoch": 0.3183076569326282, + "grad_norm": 1.0078125, + "learning_rate": 3.86126565907441e-05, + "loss": 0.9109, + "step": 4326 + }, + { + "epoch": 0.3183812370659922, + "grad_norm": 0.95703125, + "learning_rate": 3.860779848616982e-05, + "loss": 0.7895, + "step": 4327 + }, + { + "epoch": 0.3184548171993562, + "grad_norm": 0.8046875, + "learning_rate": 3.86029396512907e-05, + "loss": 0.8631, + "step": 4328 + }, + { + "epoch": 0.31852839733272015, + "grad_norm": 0.78515625, + "learning_rate": 3.85980800863675e-05, + "loss": 0.6359, + "step": 4329 + }, + { + "epoch": 0.31860197746608415, + "grad_norm": 1.03125, + "learning_rate": 3.859321979166102e-05, + "loss": 1.3715, + "step": 4330 + }, + { + "epoch": 0.31867555759944816, + "grad_norm": 0.90625, + "learning_rate": 3.858835876743211e-05, + "loss": 0.9113, + "step": 4331 + }, + { + "epoch": 0.31874913773281216, + "grad_norm": 0.73828125, + "learning_rate": 3.858349701394166e-05, + "loss": 1.0455, + "step": 4332 + }, + { + "epoch": 0.3188227178661761, + "grad_norm": 0.82421875, + "learning_rate": 3.857863453145057e-05, + "loss": 0.9149, + "step": 4333 + }, + { + "epoch": 0.3188962979995401, + "grad_norm": 0.84375, + "learning_rate": 3.857377132021982e-05, + "loss": 0.84, + "step": 4334 + }, + { + "epoch": 0.3189698781329041, + "grad_norm": 0.69140625, + "learning_rate": 3.856890738051039e-05, + "loss": 0.6142, + "step": 4335 + }, + { + "epoch": 0.3190434582662681, + "grad_norm": 0.93359375, + "learning_rate": 3.856404271258334e-05, + "loss": 0.9173, + "step": 4336 + }, + { + "epoch": 0.3191170383996321, + "grad_norm": 0.7578125, + "learning_rate": 3.855917731669972e-05, + "loss": 0.9181, + "step": 4337 + }, + { + "epoch": 0.3191906185329961, + "grad_norm": 0.86328125, + "learning_rate": 3.8554311193120674e-05, + "loss": 0.9264, + "step": 4338 + }, + { + "epoch": 0.3192641986663601, + "grad_norm": 1.0, + "learning_rate": 3.8549444342107345e-05, + "loss": 1.0953, + "step": 4339 + }, + { + "epoch": 0.3193377787997241, + "grad_norm": 0.9765625, + "learning_rate": 3.8544576763920934e-05, + "loss": 0.9389, + "step": 4340 + }, + { + "epoch": 0.3194113589330881, + "grad_norm": 0.8046875, + "learning_rate": 3.8539708458822665e-05, + "loss": 0.9894, + "step": 4341 + }, + { + "epoch": 0.31948493906645203, + "grad_norm": 0.96875, + "learning_rate": 3.853483942707382e-05, + "loss": 0.7656, + "step": 4342 + }, + { + "epoch": 0.31955851919981604, + "grad_norm": 0.76953125, + "learning_rate": 3.85299696689357e-05, + "loss": 0.6863, + "step": 4343 + }, + { + "epoch": 0.31963209933318004, + "grad_norm": 0.99609375, + "learning_rate": 3.852509918466967e-05, + "loss": 1.1532, + "step": 4344 + }, + { + "epoch": 0.31970567946654405, + "grad_norm": 0.84765625, + "learning_rate": 3.8520227974537106e-05, + "loss": 0.8935, + "step": 4345 + }, + { + "epoch": 0.31977925959990805, + "grad_norm": 0.81640625, + "learning_rate": 3.851535603879944e-05, + "loss": 0.7089, + "step": 4346 + }, + { + "epoch": 0.319852839733272, + "grad_norm": 0.86328125, + "learning_rate": 3.851048337771815e-05, + "loss": 0.975, + "step": 4347 + }, + { + "epoch": 0.319926419866636, + "grad_norm": 1.5546875, + "learning_rate": 3.850560999155473e-05, + "loss": 1.3401, + "step": 4348 + }, + { + "epoch": 0.32, + "grad_norm": 0.99609375, + "learning_rate": 3.8500735880570734e-05, + "loss": 1.1325, + "step": 4349 + }, + { + "epoch": 0.320073580133364, + "grad_norm": 0.7109375, + "learning_rate": 3.849586104502774e-05, + "loss": 0.852, + "step": 4350 + }, + { + "epoch": 0.32014716026672796, + "grad_norm": 0.87109375, + "learning_rate": 3.849098548518738e-05, + "loss": 0.7993, + "step": 4351 + }, + { + "epoch": 0.32022074040009196, + "grad_norm": 0.7578125, + "learning_rate": 3.84861092013113e-05, + "loss": 0.8838, + "step": 4352 + }, + { + "epoch": 0.32029432053345597, + "grad_norm": 1.0, + "learning_rate": 3.848123219366121e-05, + "loss": 0.8858, + "step": 4353 + }, + { + "epoch": 0.32036790066681997, + "grad_norm": 0.91015625, + "learning_rate": 3.847635446249886e-05, + "loss": 0.8729, + "step": 4354 + }, + { + "epoch": 0.320441480800184, + "grad_norm": 0.86328125, + "learning_rate": 3.8471476008086014e-05, + "loss": 0.689, + "step": 4355 + }, + { + "epoch": 0.3205150609335479, + "grad_norm": 1.0, + "learning_rate": 3.84665968306845e-05, + "loss": 1.2473, + "step": 4356 + }, + { + "epoch": 0.32058864106691193, + "grad_norm": 0.765625, + "learning_rate": 3.8461716930556164e-05, + "loss": 0.8909, + "step": 4357 + }, + { + "epoch": 0.32066222120027593, + "grad_norm": 1.0234375, + "learning_rate": 3.845683630796291e-05, + "loss": 1.0895, + "step": 4358 + }, + { + "epoch": 0.32073580133363994, + "grad_norm": 0.91796875, + "learning_rate": 3.845195496316666e-05, + "loss": 1.2606, + "step": 4359 + }, + { + "epoch": 0.3208093814670039, + "grad_norm": 0.7890625, + "learning_rate": 3.844707289642939e-05, + "loss": 0.7466, + "step": 4360 + }, + { + "epoch": 0.3208829616003679, + "grad_norm": 1.0390625, + "learning_rate": 3.8442190108013135e-05, + "loss": 1.5815, + "step": 4361 + }, + { + "epoch": 0.3209565417337319, + "grad_norm": 0.734375, + "learning_rate": 3.843730659817991e-05, + "loss": 0.8769, + "step": 4362 + }, + { + "epoch": 0.3210301218670959, + "grad_norm": 0.83203125, + "learning_rate": 3.843242236719182e-05, + "loss": 0.7549, + "step": 4363 + }, + { + "epoch": 0.3211037020004599, + "grad_norm": 0.74609375, + "learning_rate": 3.8427537415311e-05, + "loss": 0.7548, + "step": 4364 + }, + { + "epoch": 0.32117728213382385, + "grad_norm": 0.7421875, + "learning_rate": 3.84226517427996e-05, + "loss": 0.6908, + "step": 4365 + }, + { + "epoch": 0.32125086226718785, + "grad_norm": 0.90234375, + "learning_rate": 3.8417765349919835e-05, + "loss": 0.8573, + "step": 4366 + }, + { + "epoch": 0.32132444240055186, + "grad_norm": 0.828125, + "learning_rate": 3.8412878236933946e-05, + "loss": 0.8099, + "step": 4367 + }, + { + "epoch": 0.32139802253391586, + "grad_norm": 0.7890625, + "learning_rate": 3.840799040410422e-05, + "loss": 0.7745, + "step": 4368 + }, + { + "epoch": 0.3214716026672798, + "grad_norm": 0.90234375, + "learning_rate": 3.840310185169297e-05, + "loss": 0.8537, + "step": 4369 + }, + { + "epoch": 0.3215451828006438, + "grad_norm": 0.703125, + "learning_rate": 3.839821257996256e-05, + "loss": 0.6836, + "step": 4370 + }, + { + "epoch": 0.3216187629340078, + "grad_norm": 0.71875, + "learning_rate": 3.839332258917539e-05, + "loss": 0.7052, + "step": 4371 + }, + { + "epoch": 0.3216923430673718, + "grad_norm": 1.2578125, + "learning_rate": 3.838843187959388e-05, + "loss": 1.4513, + "step": 4372 + }, + { + "epoch": 0.3217659232007358, + "grad_norm": 0.88671875, + "learning_rate": 3.8383540451480527e-05, + "loss": 0.8022, + "step": 4373 + }, + { + "epoch": 0.3218395033340998, + "grad_norm": 0.89453125, + "learning_rate": 3.837864830509783e-05, + "loss": 1.1341, + "step": 4374 + }, + { + "epoch": 0.3219130834674638, + "grad_norm": 0.875, + "learning_rate": 3.837375544070836e-05, + "loss": 0.7469, + "step": 4375 + }, + { + "epoch": 0.3219866636008278, + "grad_norm": 0.7890625, + "learning_rate": 3.8368861858574684e-05, + "loss": 0.8379, + "step": 4376 + }, + { + "epoch": 0.3220602437341918, + "grad_norm": 0.828125, + "learning_rate": 3.8363967558959444e-05, + "loss": 1.1046, + "step": 4377 + }, + { + "epoch": 0.32213382386755574, + "grad_norm": 0.69140625, + "learning_rate": 3.8359072542125305e-05, + "loss": 0.7402, + "step": 4378 + }, + { + "epoch": 0.32220740400091974, + "grad_norm": 1.0390625, + "learning_rate": 3.835417680833499e-05, + "loss": 1.3135, + "step": 4379 + }, + { + "epoch": 0.32228098413428374, + "grad_norm": 1.140625, + "learning_rate": 3.83492803578512e-05, + "loss": 1.6153, + "step": 4380 + }, + { + "epoch": 0.32235456426764775, + "grad_norm": 2.71875, + "learning_rate": 3.834438319093676e-05, + "loss": 0.6938, + "step": 4381 + }, + { + "epoch": 0.32242814440101175, + "grad_norm": 0.80859375, + "learning_rate": 3.8339485307854484e-05, + "loss": 1.081, + "step": 4382 + }, + { + "epoch": 0.3225017245343757, + "grad_norm": 0.921875, + "learning_rate": 3.833458670886723e-05, + "loss": 0.8331, + "step": 4383 + }, + { + "epoch": 0.3225753046677397, + "grad_norm": 0.70703125, + "learning_rate": 3.8329687394237876e-05, + "loss": 0.7388, + "step": 4384 + }, + { + "epoch": 0.3226488848011037, + "grad_norm": 1.0546875, + "learning_rate": 3.8324787364229386e-05, + "loss": 1.7329, + "step": 4385 + }, + { + "epoch": 0.3227224649344677, + "grad_norm": 1.1484375, + "learning_rate": 3.831988661910473e-05, + "loss": 1.2951, + "step": 4386 + }, + { + "epoch": 0.32279604506783166, + "grad_norm": 0.8671875, + "learning_rate": 3.83149851591269e-05, + "loss": 0.8811, + "step": 4387 + }, + { + "epoch": 0.32286962520119566, + "grad_norm": 0.71875, + "learning_rate": 3.8310082984558973e-05, + "loss": 0.7405, + "step": 4388 + }, + { + "epoch": 0.32294320533455967, + "grad_norm": 0.8046875, + "learning_rate": 3.830518009566404e-05, + "loss": 0.7229, + "step": 4389 + }, + { + "epoch": 0.3230167854679237, + "grad_norm": 1.0234375, + "learning_rate": 3.830027649270521e-05, + "loss": 1.0959, + "step": 4390 + }, + { + "epoch": 0.3230903656012877, + "grad_norm": 1.0546875, + "learning_rate": 3.829537217594567e-05, + "loss": 1.6317, + "step": 4391 + }, + { + "epoch": 0.3231639457346516, + "grad_norm": 0.95703125, + "learning_rate": 3.8290467145648615e-05, + "loss": 0.97, + "step": 4392 + }, + { + "epoch": 0.32323752586801563, + "grad_norm": 0.859375, + "learning_rate": 3.828556140207729e-05, + "loss": 1.2009, + "step": 4393 + }, + { + "epoch": 0.32331110600137963, + "grad_norm": 0.88671875, + "learning_rate": 3.828065494549497e-05, + "loss": 1.0788, + "step": 4394 + }, + { + "epoch": 0.32338468613474364, + "grad_norm": 0.9296875, + "learning_rate": 3.827574777616499e-05, + "loss": 0.824, + "step": 4395 + }, + { + "epoch": 0.3234582662681076, + "grad_norm": 0.69140625, + "learning_rate": 3.8270839894350694e-05, + "loss": 0.7157, + "step": 4396 + }, + { + "epoch": 0.3235318464014716, + "grad_norm": 0.8359375, + "learning_rate": 3.826593130031549e-05, + "loss": 0.9199, + "step": 4397 + }, + { + "epoch": 0.3236054265348356, + "grad_norm": 0.95703125, + "learning_rate": 3.826102199432282e-05, + "loss": 1.3482, + "step": 4398 + }, + { + "epoch": 0.3236790066681996, + "grad_norm": 0.80078125, + "learning_rate": 3.8256111976636135e-05, + "loss": 0.672, + "step": 4399 + }, + { + "epoch": 0.3237525868015636, + "grad_norm": 0.93359375, + "learning_rate": 3.825120124751895e-05, + "loss": 1.0889, + "step": 4400 + }, + { + "epoch": 0.32382616693492755, + "grad_norm": 0.7734375, + "learning_rate": 3.8246289807234837e-05, + "loss": 0.9362, + "step": 4401 + }, + { + "epoch": 0.32389974706829155, + "grad_norm": 0.73828125, + "learning_rate": 3.824137765604736e-05, + "loss": 0.558, + "step": 4402 + }, + { + "epoch": 0.32397332720165556, + "grad_norm": 0.6875, + "learning_rate": 3.823646479422016e-05, + "loss": 0.6618, + "step": 4403 + }, + { + "epoch": 0.32404690733501956, + "grad_norm": 0.78125, + "learning_rate": 3.8231551222016884e-05, + "loss": 0.7082, + "step": 4404 + }, + { + "epoch": 0.3241204874683835, + "grad_norm": 0.7890625, + "learning_rate": 3.822663693970126e-05, + "loss": 0.8813, + "step": 4405 + }, + { + "epoch": 0.3241940676017475, + "grad_norm": 0.953125, + "learning_rate": 3.8221721947537e-05, + "loss": 0.7983, + "step": 4406 + }, + { + "epoch": 0.3242676477351115, + "grad_norm": 0.921875, + "learning_rate": 3.82168062457879e-05, + "loss": 0.8671, + "step": 4407 + }, + { + "epoch": 0.3243412278684755, + "grad_norm": 0.70703125, + "learning_rate": 3.8211889834717775e-05, + "loss": 0.64, + "step": 4408 + }, + { + "epoch": 0.3244148080018395, + "grad_norm": 0.890625, + "learning_rate": 3.8206972714590474e-05, + "loss": 0.9115, + "step": 4409 + }, + { + "epoch": 0.3244883881352035, + "grad_norm": 1.046875, + "learning_rate": 3.820205488566989e-05, + "loss": 1.4206, + "step": 4410 + }, + { + "epoch": 0.3245619682685675, + "grad_norm": 0.83984375, + "learning_rate": 3.819713634821995e-05, + "loss": 0.6944, + "step": 4411 + }, + { + "epoch": 0.3246355484019315, + "grad_norm": 0.88671875, + "learning_rate": 3.819221710250464e-05, + "loss": 0.614, + "step": 4412 + }, + { + "epoch": 0.3247091285352955, + "grad_norm": 1.3984375, + "learning_rate": 3.818729714878795e-05, + "loss": 0.764, + "step": 4413 + }, + { + "epoch": 0.32478270866865944, + "grad_norm": 1.2109375, + "learning_rate": 3.8182376487333925e-05, + "loss": 1.1189, + "step": 4414 + }, + { + "epoch": 0.32485628880202344, + "grad_norm": 0.74609375, + "learning_rate": 3.817745511840665e-05, + "loss": 0.6934, + "step": 4415 + }, + { + "epoch": 0.32492986893538744, + "grad_norm": 0.87890625, + "learning_rate": 3.8172533042270255e-05, + "loss": 1.1779, + "step": 4416 + }, + { + "epoch": 0.32500344906875145, + "grad_norm": 1.0703125, + "learning_rate": 3.816761025918889e-05, + "loss": 1.0069, + "step": 4417 + }, + { + "epoch": 0.32507702920211545, + "grad_norm": 1.140625, + "learning_rate": 3.816268676942676e-05, + "loss": 1.2744, + "step": 4418 + }, + { + "epoch": 0.3251506093354794, + "grad_norm": 0.79296875, + "learning_rate": 3.815776257324808e-05, + "loss": 0.8949, + "step": 4419 + }, + { + "epoch": 0.3252241894688434, + "grad_norm": 0.83984375, + "learning_rate": 3.815283767091713e-05, + "loss": 0.8184, + "step": 4420 + }, + { + "epoch": 0.3252977696022074, + "grad_norm": 0.83203125, + "learning_rate": 3.814791206269823e-05, + "loss": 0.9131, + "step": 4421 + }, + { + "epoch": 0.3253713497355714, + "grad_norm": 0.95703125, + "learning_rate": 3.8142985748855726e-05, + "loss": 1.1168, + "step": 4422 + }, + { + "epoch": 0.32544492986893536, + "grad_norm": 0.94921875, + "learning_rate": 3.8138058729654e-05, + "loss": 0.8302, + "step": 4423 + }, + { + "epoch": 0.32551851000229937, + "grad_norm": 0.76171875, + "learning_rate": 3.813313100535747e-05, + "loss": 0.8193, + "step": 4424 + }, + { + "epoch": 0.32559209013566337, + "grad_norm": 0.99609375, + "learning_rate": 3.8128202576230616e-05, + "loss": 1.2543, + "step": 4425 + }, + { + "epoch": 0.3256656702690274, + "grad_norm": 0.859375, + "learning_rate": 3.8123273442537914e-05, + "loss": 1.0079, + "step": 4426 + }, + { + "epoch": 0.3257392504023914, + "grad_norm": 0.8984375, + "learning_rate": 3.8118343604543916e-05, + "loss": 0.7694, + "step": 4427 + }, + { + "epoch": 0.3258128305357553, + "grad_norm": 1.1875, + "learning_rate": 3.8113413062513195e-05, + "loss": 0.9604, + "step": 4428 + }, + { + "epoch": 0.32588641066911933, + "grad_norm": 1.0546875, + "learning_rate": 3.8108481816710365e-05, + "loss": 1.205, + "step": 4429 + }, + { + "epoch": 0.32595999080248333, + "grad_norm": 0.92578125, + "learning_rate": 3.810354986740007e-05, + "loss": 1.4123, + "step": 4430 + }, + { + "epoch": 0.32603357093584734, + "grad_norm": 0.890625, + "learning_rate": 3.8098617214847e-05, + "loss": 1.187, + "step": 4431 + }, + { + "epoch": 0.3261071510692113, + "grad_norm": 0.75, + "learning_rate": 3.809368385931589e-05, + "loss": 0.8348, + "step": 4432 + }, + { + "epoch": 0.3261807312025753, + "grad_norm": 0.94140625, + "learning_rate": 3.80887498010715e-05, + "loss": 1.1723, + "step": 4433 + }, + { + "epoch": 0.3262543113359393, + "grad_norm": 0.69921875, + "learning_rate": 3.808381504037862e-05, + "loss": 0.6599, + "step": 4434 + }, + { + "epoch": 0.3263278914693033, + "grad_norm": 0.8125, + "learning_rate": 3.8078879577502105e-05, + "loss": 0.7088, + "step": 4435 + }, + { + "epoch": 0.3264014716026673, + "grad_norm": 0.80859375, + "learning_rate": 3.807394341270682e-05, + "loss": 0.7271, + "step": 4436 + }, + { + "epoch": 0.32647505173603125, + "grad_norm": 0.82421875, + "learning_rate": 3.806900654625769e-05, + "loss": 0.7788, + "step": 4437 + }, + { + "epoch": 0.32654863186939526, + "grad_norm": 0.8203125, + "learning_rate": 3.806406897841966e-05, + "loss": 1.0432, + "step": 4438 + }, + { + "epoch": 0.32662221200275926, + "grad_norm": 0.921875, + "learning_rate": 3.805913070945773e-05, + "loss": 1.1296, + "step": 4439 + }, + { + "epoch": 0.32669579213612326, + "grad_norm": 0.75, + "learning_rate": 3.8054191739636916e-05, + "loss": 0.849, + "step": 4440 + }, + { + "epoch": 0.3267693722694872, + "grad_norm": 0.7578125, + "learning_rate": 3.804925206922229e-05, + "loss": 0.8604, + "step": 4441 + }, + { + "epoch": 0.3268429524028512, + "grad_norm": 0.83984375, + "learning_rate": 3.804431169847894e-05, + "loss": 0.853, + "step": 4442 + }, + { + "epoch": 0.3269165325362152, + "grad_norm": 0.91015625, + "learning_rate": 3.8039370627672026e-05, + "loss": 0.747, + "step": 4443 + }, + { + "epoch": 0.3269901126695792, + "grad_norm": 1.71875, + "learning_rate": 3.8034428857066716e-05, + "loss": 0.8982, + "step": 4444 + }, + { + "epoch": 0.32706369280294323, + "grad_norm": 0.8046875, + "learning_rate": 3.802948638692823e-05, + "loss": 0.8946, + "step": 4445 + }, + { + "epoch": 0.3271372729363072, + "grad_norm": 1.1796875, + "learning_rate": 3.802454321752182e-05, + "loss": 1.32, + "step": 4446 + }, + { + "epoch": 0.3272108530696712, + "grad_norm": 0.81640625, + "learning_rate": 3.801959934911277e-05, + "loss": 0.8762, + "step": 4447 + }, + { + "epoch": 0.3272844332030352, + "grad_norm": 0.87109375, + "learning_rate": 3.801465478196642e-05, + "loss": 1.0046, + "step": 4448 + }, + { + "epoch": 0.3273580133363992, + "grad_norm": 0.73828125, + "learning_rate": 3.800970951634812e-05, + "loss": 0.6777, + "step": 4449 + }, + { + "epoch": 0.32743159346976314, + "grad_norm": 0.75, + "learning_rate": 3.8004763552523296e-05, + "loss": 0.9015, + "step": 4450 + }, + { + "epoch": 0.32750517360312714, + "grad_norm": 0.90234375, + "learning_rate": 3.799981689075737e-05, + "loss": 0.7905, + "step": 4451 + }, + { + "epoch": 0.32757875373649115, + "grad_norm": 0.81640625, + "learning_rate": 3.799486953131582e-05, + "loss": 1.1968, + "step": 4452 + }, + { + "epoch": 0.32765233386985515, + "grad_norm": 0.78515625, + "learning_rate": 3.798992147446417e-05, + "loss": 0.8914, + "step": 4453 + }, + { + "epoch": 0.32772591400321915, + "grad_norm": 0.8203125, + "learning_rate": 3.798497272046796e-05, + "loss": 0.8838, + "step": 4454 + }, + { + "epoch": 0.3277994941365831, + "grad_norm": 0.71875, + "learning_rate": 3.79800232695928e-05, + "loss": 0.7554, + "step": 4455 + }, + { + "epoch": 0.3278730742699471, + "grad_norm": 0.8203125, + "learning_rate": 3.79750731221043e-05, + "loss": 0.6986, + "step": 4456 + }, + { + "epoch": 0.3279466544033111, + "grad_norm": 0.87109375, + "learning_rate": 3.7970122278268136e-05, + "loss": 1.3797, + "step": 4457 + }, + { + "epoch": 0.3280202345366751, + "grad_norm": 1.0703125, + "learning_rate": 3.7965170738350006e-05, + "loss": 0.8397, + "step": 4458 + }, + { + "epoch": 0.32809381467003906, + "grad_norm": 0.66015625, + "learning_rate": 3.7960218502615655e-05, + "loss": 0.8418, + "step": 4459 + }, + { + "epoch": 0.32816739480340307, + "grad_norm": 0.8125, + "learning_rate": 3.795526557133085e-05, + "loss": 0.946, + "step": 4460 + }, + { + "epoch": 0.32824097493676707, + "grad_norm": 0.96484375, + "learning_rate": 3.7950311944761405e-05, + "loss": 1.3836, + "step": 4461 + }, + { + "epoch": 0.3283145550701311, + "grad_norm": 0.80078125, + "learning_rate": 3.7945357623173185e-05, + "loss": 0.8702, + "step": 4462 + }, + { + "epoch": 0.3283881352034951, + "grad_norm": 0.79296875, + "learning_rate": 3.794040260683207e-05, + "loss": 0.7486, + "step": 4463 + }, + { + "epoch": 0.32846171533685903, + "grad_norm": 0.81640625, + "learning_rate": 3.793544689600399e-05, + "loss": 0.8161, + "step": 4464 + }, + { + "epoch": 0.32853529547022303, + "grad_norm": 1.0625, + "learning_rate": 3.7930490490954904e-05, + "loss": 1.0262, + "step": 4465 + }, + { + "epoch": 0.32860887560358704, + "grad_norm": 0.73046875, + "learning_rate": 3.792553339195082e-05, + "loss": 0.7622, + "step": 4466 + }, + { + "epoch": 0.32868245573695104, + "grad_norm": 0.80078125, + "learning_rate": 3.7920575599257754e-05, + "loss": 0.7319, + "step": 4467 + }, + { + "epoch": 0.328756035870315, + "grad_norm": 0.84765625, + "learning_rate": 3.791561711314182e-05, + "loss": 0.994, + "step": 4468 + }, + { + "epoch": 0.328829616003679, + "grad_norm": 0.91796875, + "learning_rate": 3.79106579338691e-05, + "loss": 0.9103, + "step": 4469 + }, + { + "epoch": 0.328903196137043, + "grad_norm": 0.6796875, + "learning_rate": 3.790569806170576e-05, + "loss": 0.8773, + "step": 4470 + }, + { + "epoch": 0.328976776270407, + "grad_norm": 0.92578125, + "learning_rate": 3.790073749691798e-05, + "loss": 1.2341, + "step": 4471 + }, + { + "epoch": 0.329050356403771, + "grad_norm": 0.75, + "learning_rate": 3.789577623977198e-05, + "loss": 0.6619, + "step": 4472 + }, + { + "epoch": 0.32912393653713495, + "grad_norm": 0.7421875, + "learning_rate": 3.789081429053403e-05, + "loss": 0.7464, + "step": 4473 + }, + { + "epoch": 0.32919751667049896, + "grad_norm": 0.828125, + "learning_rate": 3.7885851649470415e-05, + "loss": 0.8021, + "step": 4474 + }, + { + "epoch": 0.32927109680386296, + "grad_norm": 0.88671875, + "learning_rate": 3.788088831684749e-05, + "loss": 0.8931, + "step": 4475 + }, + { + "epoch": 0.32934467693722697, + "grad_norm": 0.78515625, + "learning_rate": 3.787592429293161e-05, + "loss": 0.9185, + "step": 4476 + }, + { + "epoch": 0.3294182570705909, + "grad_norm": 0.8671875, + "learning_rate": 3.787095957798919e-05, + "loss": 0.9234, + "step": 4477 + }, + { + "epoch": 0.3294918372039549, + "grad_norm": 1.078125, + "learning_rate": 3.78659941722867e-05, + "loss": 0.9418, + "step": 4478 + }, + { + "epoch": 0.3295654173373189, + "grad_norm": 0.77734375, + "learning_rate": 3.786102807609059e-05, + "loss": 1.0555, + "step": 4479 + }, + { + "epoch": 0.3296389974706829, + "grad_norm": 0.875, + "learning_rate": 3.785606128966739e-05, + "loss": 0.8578, + "step": 4480 + }, + { + "epoch": 0.32971257760404693, + "grad_norm": 0.69921875, + "learning_rate": 3.785109381328365e-05, + "loss": 0.7296, + "step": 4481 + }, + { + "epoch": 0.3297861577374109, + "grad_norm": 0.828125, + "learning_rate": 3.7846125647206e-05, + "loss": 0.9887, + "step": 4482 + }, + { + "epoch": 0.3298597378707749, + "grad_norm": 1.0390625, + "learning_rate": 3.784115679170105e-05, + "loss": 1.4146, + "step": 4483 + }, + { + "epoch": 0.3299333180041389, + "grad_norm": 1.453125, + "learning_rate": 3.783618724703546e-05, + "loss": 0.9495, + "step": 4484 + }, + { + "epoch": 0.3300068981375029, + "grad_norm": 0.85546875, + "learning_rate": 3.783121701347594e-05, + "loss": 1.0103, + "step": 4485 + }, + { + "epoch": 0.33008047827086684, + "grad_norm": 1.234375, + "learning_rate": 3.782624609128924e-05, + "loss": 1.0129, + "step": 4486 + }, + { + "epoch": 0.33015405840423084, + "grad_norm": 0.7734375, + "learning_rate": 3.782127448074214e-05, + "loss": 0.658, + "step": 4487 + }, + { + "epoch": 0.33022763853759485, + "grad_norm": 0.98828125, + "learning_rate": 3.7816302182101456e-05, + "loss": 1.1676, + "step": 4488 + }, + { + "epoch": 0.33030121867095885, + "grad_norm": 0.71484375, + "learning_rate": 3.7811329195634044e-05, + "loss": 0.7587, + "step": 4489 + }, + { + "epoch": 0.33037479880432286, + "grad_norm": 1.09375, + "learning_rate": 3.7806355521606787e-05, + "loss": 1.4104, + "step": 4490 + }, + { + "epoch": 0.3304483789376868, + "grad_norm": 0.9609375, + "learning_rate": 3.780138116028662e-05, + "loss": 1.2279, + "step": 4491 + }, + { + "epoch": 0.3305219590710508, + "grad_norm": 0.8984375, + "learning_rate": 3.77964061119405e-05, + "loss": 1.0594, + "step": 4492 + }, + { + "epoch": 0.3305955392044148, + "grad_norm": 0.703125, + "learning_rate": 3.7791430376835425e-05, + "loss": 0.8484, + "step": 4493 + }, + { + "epoch": 0.3306691193377788, + "grad_norm": 1.0390625, + "learning_rate": 3.778645395523845e-05, + "loss": 1.0097, + "step": 4494 + }, + { + "epoch": 0.33074269947114276, + "grad_norm": 0.92578125, + "learning_rate": 3.7781476847416645e-05, + "loss": 0.8819, + "step": 4495 + }, + { + "epoch": 0.33081627960450677, + "grad_norm": 0.859375, + "learning_rate": 3.777649905363712e-05, + "loss": 1.1707, + "step": 4496 + }, + { + "epoch": 0.3308898597378708, + "grad_norm": 0.76171875, + "learning_rate": 3.7771520574167006e-05, + "loss": 0.679, + "step": 4497 + }, + { + "epoch": 0.3309634398712348, + "grad_norm": 0.88671875, + "learning_rate": 3.7766541409273524e-05, + "loss": 0.758, + "step": 4498 + }, + { + "epoch": 0.3310370200045988, + "grad_norm": 0.89453125, + "learning_rate": 3.7761561559223865e-05, + "loss": 0.8395, + "step": 4499 + }, + { + "epoch": 0.33111060013796273, + "grad_norm": 0.8671875, + "learning_rate": 3.77565810242853e-05, + "loss": 0.7264, + "step": 4500 + }, + { + "epoch": 0.33118418027132673, + "grad_norm": 0.87109375, + "learning_rate": 3.775159980472513e-05, + "loss": 1.4052, + "step": 4501 + }, + { + "epoch": 0.33125776040469074, + "grad_norm": 6.90625, + "learning_rate": 3.7746617900810685e-05, + "loss": 0.8163, + "step": 4502 + }, + { + "epoch": 0.33133134053805474, + "grad_norm": 0.8359375, + "learning_rate": 3.774163531280933e-05, + "loss": 0.9579, + "step": 4503 + }, + { + "epoch": 0.3314049206714187, + "grad_norm": 0.921875, + "learning_rate": 3.7736652040988476e-05, + "loss": 0.6861, + "step": 4504 + }, + { + "epoch": 0.3314785008047827, + "grad_norm": 0.703125, + "learning_rate": 3.773166808561556e-05, + "loss": 0.6254, + "step": 4505 + }, + { + "epoch": 0.3315520809381467, + "grad_norm": 0.75, + "learning_rate": 3.772668344695806e-05, + "loss": 0.9836, + "step": 4506 + }, + { + "epoch": 0.3316256610715107, + "grad_norm": 0.8046875, + "learning_rate": 3.77216981252835e-05, + "loss": 0.708, + "step": 4507 + }, + { + "epoch": 0.3316992412048747, + "grad_norm": 0.953125, + "learning_rate": 3.7716712120859435e-05, + "loss": 0.9957, + "step": 4508 + }, + { + "epoch": 0.33177282133823865, + "grad_norm": 0.74609375, + "learning_rate": 3.771172543395344e-05, + "loss": 0.7541, + "step": 4509 + }, + { + "epoch": 0.33184640147160266, + "grad_norm": 1.078125, + "learning_rate": 3.7706738064833155e-05, + "loss": 1.4388, + "step": 4510 + }, + { + "epoch": 0.33191998160496666, + "grad_norm": 0.66015625, + "learning_rate": 3.770175001376623e-05, + "loss": 0.6819, + "step": 4511 + }, + { + "epoch": 0.33199356173833067, + "grad_norm": 0.92578125, + "learning_rate": 3.7696761281020387e-05, + "loss": 1.098, + "step": 4512 + }, + { + "epoch": 0.3320671418716946, + "grad_norm": 0.953125, + "learning_rate": 3.769177186686334e-05, + "loss": 0.9928, + "step": 4513 + }, + { + "epoch": 0.3321407220050586, + "grad_norm": 0.76953125, + "learning_rate": 3.768678177156286e-05, + "loss": 0.759, + "step": 4514 + }, + { + "epoch": 0.3322143021384226, + "grad_norm": 0.75390625, + "learning_rate": 3.768179099538678e-05, + "loss": 0.8186, + "step": 4515 + }, + { + "epoch": 0.3322878822717866, + "grad_norm": 0.7578125, + "learning_rate": 3.767679953860292e-05, + "loss": 0.7773, + "step": 4516 + }, + { + "epoch": 0.33236146240515063, + "grad_norm": 0.8359375, + "learning_rate": 3.767180740147918e-05, + "loss": 0.9745, + "step": 4517 + }, + { + "epoch": 0.3324350425385146, + "grad_norm": 0.68359375, + "learning_rate": 3.766681458428346e-05, + "loss": 0.7881, + "step": 4518 + }, + { + "epoch": 0.3325086226718786, + "grad_norm": 0.76953125, + "learning_rate": 3.7661821087283726e-05, + "loss": 0.8318, + "step": 4519 + }, + { + "epoch": 0.3325822028052426, + "grad_norm": 1.0234375, + "learning_rate": 3.765682691074797e-05, + "loss": 0.9919, + "step": 4520 + }, + { + "epoch": 0.3326557829386066, + "grad_norm": 0.90234375, + "learning_rate": 3.765183205494422e-05, + "loss": 0.9731, + "step": 4521 + }, + { + "epoch": 0.33272936307197054, + "grad_norm": 1.125, + "learning_rate": 3.7646836520140536e-05, + "loss": 1.3843, + "step": 4522 + }, + { + "epoch": 0.33280294320533454, + "grad_norm": 0.82421875, + "learning_rate": 3.7641840306605034e-05, + "loss": 0.79, + "step": 4523 + }, + { + "epoch": 0.33287652333869855, + "grad_norm": 0.796875, + "learning_rate": 3.763684341460583e-05, + "loss": 0.6936, + "step": 4524 + }, + { + "epoch": 0.33295010347206255, + "grad_norm": 0.83984375, + "learning_rate": 3.763184584441111e-05, + "loss": 0.9758, + "step": 4525 + }, + { + "epoch": 0.33302368360542656, + "grad_norm": 1.0078125, + "learning_rate": 3.762684759628908e-05, + "loss": 0.7863, + "step": 4526 + }, + { + "epoch": 0.3330972637387905, + "grad_norm": 0.59375, + "learning_rate": 3.7621848670508e-05, + "loss": 0.6345, + "step": 4527 + }, + { + "epoch": 0.3331708438721545, + "grad_norm": 0.83984375, + "learning_rate": 3.761684906733613e-05, + "loss": 1.0841, + "step": 4528 + }, + { + "epoch": 0.3332444240055185, + "grad_norm": 0.87890625, + "learning_rate": 3.7611848787041794e-05, + "loss": 1.062, + "step": 4529 + }, + { + "epoch": 0.3333180041388825, + "grad_norm": 0.9375, + "learning_rate": 3.760684782989337e-05, + "loss": 1.0346, + "step": 4530 + }, + { + "epoch": 0.33339158427224647, + "grad_norm": 0.98828125, + "learning_rate": 3.760184619615922e-05, + "loss": 1.297, + "step": 4531 + }, + { + "epoch": 0.33346516440561047, + "grad_norm": 0.94140625, + "learning_rate": 3.759684388610779e-05, + "loss": 0.8569, + "step": 4532 + }, + { + "epoch": 0.3335387445389745, + "grad_norm": 1.1328125, + "learning_rate": 3.7591840900007544e-05, + "loss": 1.1531, + "step": 4533 + }, + { + "epoch": 0.3336123246723385, + "grad_norm": 0.98046875, + "learning_rate": 3.758683723812698e-05, + "loss": 0.9587, + "step": 4534 + }, + { + "epoch": 0.3336859048057025, + "grad_norm": 0.97265625, + "learning_rate": 3.758183290073463e-05, + "loss": 1.0634, + "step": 4535 + }, + { + "epoch": 0.33375948493906643, + "grad_norm": 1.03125, + "learning_rate": 3.757682788809907e-05, + "loss": 0.9081, + "step": 4536 + }, + { + "epoch": 0.33383306507243043, + "grad_norm": 0.8125, + "learning_rate": 3.757182220048893e-05, + "loss": 0.7661, + "step": 4537 + }, + { + "epoch": 0.33390664520579444, + "grad_norm": 0.92578125, + "learning_rate": 3.7566815838172815e-05, + "loss": 0.9866, + "step": 4538 + }, + { + "epoch": 0.33398022533915844, + "grad_norm": 0.6015625, + "learning_rate": 3.756180880141944e-05, + "loss": 0.526, + "step": 4539 + }, + { + "epoch": 0.3340538054725224, + "grad_norm": 1.0234375, + "learning_rate": 3.7556801090497504e-05, + "loss": 1.2827, + "step": 4540 + }, + { + "epoch": 0.3341273856058864, + "grad_norm": 0.87109375, + "learning_rate": 3.755179270567578e-05, + "loss": 0.9763, + "step": 4541 + }, + { + "epoch": 0.3342009657392504, + "grad_norm": 1.15625, + "learning_rate": 3.754678364722304e-05, + "loss": 1.3209, + "step": 4542 + }, + { + "epoch": 0.3342745458726144, + "grad_norm": 0.87890625, + "learning_rate": 3.754177391540812e-05, + "loss": 1.079, + "step": 4543 + }, + { + "epoch": 0.3343481260059784, + "grad_norm": 0.78515625, + "learning_rate": 3.7536763510499895e-05, + "loss": 1.1404, + "step": 4544 + }, + { + "epoch": 0.33442170613934236, + "grad_norm": 1.0, + "learning_rate": 3.753175243276724e-05, + "loss": 1.2987, + "step": 4545 + }, + { + "epoch": 0.33449528627270636, + "grad_norm": 0.88671875, + "learning_rate": 3.752674068247911e-05, + "loss": 1.0372, + "step": 4546 + }, + { + "epoch": 0.33456886640607036, + "grad_norm": 0.7734375, + "learning_rate": 3.752172825990446e-05, + "loss": 0.9713, + "step": 4547 + }, + { + "epoch": 0.33464244653943437, + "grad_norm": 0.63671875, + "learning_rate": 3.75167151653123e-05, + "loss": 0.6626, + "step": 4548 + }, + { + "epoch": 0.3347160266727983, + "grad_norm": 0.75390625, + "learning_rate": 3.751170139897168e-05, + "loss": 0.8907, + "step": 4549 + }, + { + "epoch": 0.3347896068061623, + "grad_norm": 1.15625, + "learning_rate": 3.750668696115167e-05, + "loss": 0.7901, + "step": 4550 + }, + { + "epoch": 0.3348631869395263, + "grad_norm": 0.92578125, + "learning_rate": 3.7501671852121414e-05, + "loss": 1.1131, + "step": 4551 + }, + { + "epoch": 0.33493676707289033, + "grad_norm": 1.3828125, + "learning_rate": 3.7496656072150025e-05, + "loss": 0.8705, + "step": 4552 + }, + { + "epoch": 0.33501034720625433, + "grad_norm": 0.953125, + "learning_rate": 3.749163962150671e-05, + "loss": 1.1594, + "step": 4553 + }, + { + "epoch": 0.3350839273396183, + "grad_norm": 0.85546875, + "learning_rate": 3.74866225004607e-05, + "loss": 0.7878, + "step": 4554 + }, + { + "epoch": 0.3351575074729823, + "grad_norm": 0.7890625, + "learning_rate": 3.7481604709281234e-05, + "loss": 0.806, + "step": 4555 + }, + { + "epoch": 0.3352310876063463, + "grad_norm": 0.92578125, + "learning_rate": 3.7476586248237616e-05, + "loss": 1.0637, + "step": 4556 + }, + { + "epoch": 0.3353046677397103, + "grad_norm": 1.015625, + "learning_rate": 3.747156711759918e-05, + "loss": 1.2394, + "step": 4557 + }, + { + "epoch": 0.33537824787307424, + "grad_norm": 0.65625, + "learning_rate": 3.74665473176353e-05, + "loss": 0.7652, + "step": 4558 + }, + { + "epoch": 0.33545182800643825, + "grad_norm": 1.0390625, + "learning_rate": 3.746152684861537e-05, + "loss": 1.0927, + "step": 4559 + }, + { + "epoch": 0.33552540813980225, + "grad_norm": 0.796875, + "learning_rate": 3.745650571080882e-05, + "loss": 1.2981, + "step": 4560 + }, + { + "epoch": 0.33559898827316625, + "grad_norm": 1.03125, + "learning_rate": 3.745148390448515e-05, + "loss": 0.8137, + "step": 4561 + }, + { + "epoch": 0.33567256840653026, + "grad_norm": 1.1953125, + "learning_rate": 3.744646142991385e-05, + "loss": 1.33, + "step": 4562 + }, + { + "epoch": 0.3357461485398942, + "grad_norm": 1.0, + "learning_rate": 3.7441438287364466e-05, + "loss": 1.2443, + "step": 4563 + }, + { + "epoch": 0.3358197286732582, + "grad_norm": 0.828125, + "learning_rate": 3.74364144771066e-05, + "loss": 0.9518, + "step": 4564 + }, + { + "epoch": 0.3358933088066222, + "grad_norm": 0.73828125, + "learning_rate": 3.743138999940985e-05, + "loss": 0.7083, + "step": 4565 + }, + { + "epoch": 0.3359668889399862, + "grad_norm": 0.97265625, + "learning_rate": 3.742636485454388e-05, + "loss": 0.8364, + "step": 4566 + }, + { + "epoch": 0.3360404690733502, + "grad_norm": 1.015625, + "learning_rate": 3.742133904277838e-05, + "loss": 1.3439, + "step": 4567 + }, + { + "epoch": 0.33611404920671417, + "grad_norm": 0.76171875, + "learning_rate": 3.741631256438307e-05, + "loss": 0.9982, + "step": 4568 + }, + { + "epoch": 0.3361876293400782, + "grad_norm": 0.671875, + "learning_rate": 3.7411285419627716e-05, + "loss": 0.6144, + "step": 4569 + }, + { + "epoch": 0.3362612094734422, + "grad_norm": 1.0703125, + "learning_rate": 3.740625760878212e-05, + "loss": 0.8794, + "step": 4570 + }, + { + "epoch": 0.3363347896068062, + "grad_norm": 0.8828125, + "learning_rate": 3.740122913211611e-05, + "loss": 1.2987, + "step": 4571 + }, + { + "epoch": 0.33640836974017013, + "grad_norm": 1.1171875, + "learning_rate": 3.739619998989955e-05, + "loss": 1.0957, + "step": 4572 + }, + { + "epoch": 0.33648194987353414, + "grad_norm": 0.765625, + "learning_rate": 3.739117018240235e-05, + "loss": 0.7344, + "step": 4573 + }, + { + "epoch": 0.33655553000689814, + "grad_norm": 0.9609375, + "learning_rate": 3.7386139709894455e-05, + "loss": 1.4638, + "step": 4574 + }, + { + "epoch": 0.33662911014026214, + "grad_norm": 0.8515625, + "learning_rate": 3.7381108572645835e-05, + "loss": 0.7897, + "step": 4575 + }, + { + "epoch": 0.33670269027362615, + "grad_norm": 0.87890625, + "learning_rate": 3.73760767709265e-05, + "loss": 1.1372, + "step": 4576 + }, + { + "epoch": 0.3367762704069901, + "grad_norm": 0.921875, + "learning_rate": 3.73710443050065e-05, + "loss": 0.927, + "step": 4577 + }, + { + "epoch": 0.3368498505403541, + "grad_norm": 0.97265625, + "learning_rate": 3.736601117515592e-05, + "loss": 1.03, + "step": 4578 + }, + { + "epoch": 0.3369234306737181, + "grad_norm": 0.76171875, + "learning_rate": 3.736097738164487e-05, + "loss": 0.7169, + "step": 4579 + }, + { + "epoch": 0.3369970108070821, + "grad_norm": 0.75390625, + "learning_rate": 3.7355942924743525e-05, + "loss": 0.9262, + "step": 4580 + }, + { + "epoch": 0.33707059094044606, + "grad_norm": 1.1328125, + "learning_rate": 3.7350907804722047e-05, + "loss": 0.9447, + "step": 4581 + }, + { + "epoch": 0.33714417107381006, + "grad_norm": 0.80078125, + "learning_rate": 3.734587202185067e-05, + "loss": 1.1006, + "step": 4582 + }, + { + "epoch": 0.33721775120717407, + "grad_norm": 1.0625, + "learning_rate": 3.7340835576399675e-05, + "loss": 1.0276, + "step": 4583 + }, + { + "epoch": 0.33729133134053807, + "grad_norm": 0.91796875, + "learning_rate": 3.733579846863933e-05, + "loss": 0.8182, + "step": 4584 + }, + { + "epoch": 0.3373649114739021, + "grad_norm": 0.73828125, + "learning_rate": 3.7330760698839995e-05, + "loss": 0.806, + "step": 4585 + }, + { + "epoch": 0.337438491607266, + "grad_norm": 0.97265625, + "learning_rate": 3.732572226727201e-05, + "loss": 0.7503, + "step": 4586 + }, + { + "epoch": 0.33751207174063, + "grad_norm": 0.78125, + "learning_rate": 3.7320683174205794e-05, + "loss": 1.0204, + "step": 4587 + }, + { + "epoch": 0.33758565187399403, + "grad_norm": 1.0, + "learning_rate": 3.731564341991178e-05, + "loss": 0.897, + "step": 4588 + }, + { + "epoch": 0.33765923200735803, + "grad_norm": 0.8359375, + "learning_rate": 3.7310603004660446e-05, + "loss": 0.8043, + "step": 4589 + }, + { + "epoch": 0.337732812140722, + "grad_norm": 0.7890625, + "learning_rate": 3.7305561928722306e-05, + "loss": 0.848, + "step": 4590 + }, + { + "epoch": 0.337806392274086, + "grad_norm": 0.8671875, + "learning_rate": 3.730052019236789e-05, + "loss": 0.8005, + "step": 4591 + }, + { + "epoch": 0.33787997240745, + "grad_norm": 0.9140625, + "learning_rate": 3.72954777958678e-05, + "loss": 0.8389, + "step": 4592 + }, + { + "epoch": 0.337953552540814, + "grad_norm": 0.91796875, + "learning_rate": 3.729043473949263e-05, + "loss": 1.0695, + "step": 4593 + }, + { + "epoch": 0.338027132674178, + "grad_norm": 1.0390625, + "learning_rate": 3.728539102351305e-05, + "loss": 1.1062, + "step": 4594 + }, + { + "epoch": 0.33810071280754195, + "grad_norm": 0.765625, + "learning_rate": 3.7280346648199724e-05, + "loss": 0.7073, + "step": 4595 + }, + { + "epoch": 0.33817429294090595, + "grad_norm": 0.9296875, + "learning_rate": 3.72753016138234e-05, + "loss": 1.2053, + "step": 4596 + }, + { + "epoch": 0.33824787307426996, + "grad_norm": 0.81640625, + "learning_rate": 3.727025592065481e-05, + "loss": 0.8578, + "step": 4597 + }, + { + "epoch": 0.33832145320763396, + "grad_norm": 0.75, + "learning_rate": 3.726520956896477e-05, + "loss": 0.8075, + "step": 4598 + }, + { + "epoch": 0.3383950333409979, + "grad_norm": 1.1015625, + "learning_rate": 3.7260162559024095e-05, + "loss": 0.9552, + "step": 4599 + }, + { + "epoch": 0.3384686134743619, + "grad_norm": 0.78515625, + "learning_rate": 3.725511489110365e-05, + "loss": 0.6871, + "step": 4600 + }, + { + "epoch": 0.3385421936077259, + "grad_norm": 1.109375, + "learning_rate": 3.725006656547435e-05, + "loss": 1.2754, + "step": 4601 + }, + { + "epoch": 0.3386157737410899, + "grad_norm": 0.73046875, + "learning_rate": 3.7245017582407095e-05, + "loss": 0.7249, + "step": 4602 + }, + { + "epoch": 0.3386893538744539, + "grad_norm": 0.82421875, + "learning_rate": 3.723996794217288e-05, + "loss": 0.9319, + "step": 4603 + }, + { + "epoch": 0.3387629340078179, + "grad_norm": 1.0, + "learning_rate": 3.7234917645042706e-05, + "loss": 0.8907, + "step": 4604 + }, + { + "epoch": 0.3388365141411819, + "grad_norm": 0.7578125, + "learning_rate": 3.722986669128761e-05, + "loss": 0.9709, + "step": 4605 + }, + { + "epoch": 0.3389100942745459, + "grad_norm": 1.046875, + "learning_rate": 3.7224815081178666e-05, + "loss": 1.0093, + "step": 4606 + }, + { + "epoch": 0.3389836744079099, + "grad_norm": 0.94140625, + "learning_rate": 3.721976281498699e-05, + "loss": 1.0445, + "step": 4607 + }, + { + "epoch": 0.33905725454127383, + "grad_norm": 0.91015625, + "learning_rate": 3.7214709892983725e-05, + "loss": 0.9872, + "step": 4608 + }, + { + "epoch": 0.33913083467463784, + "grad_norm": 1.1171875, + "learning_rate": 3.720965631544004e-05, + "loss": 1.0319, + "step": 4609 + }, + { + "epoch": 0.33920441480800184, + "grad_norm": 0.80859375, + "learning_rate": 3.720460208262717e-05, + "loss": 0.9197, + "step": 4610 + }, + { + "epoch": 0.33927799494136585, + "grad_norm": 1.171875, + "learning_rate": 3.719954719481635e-05, + "loss": 0.7862, + "step": 4611 + }, + { + "epoch": 0.33935157507472985, + "grad_norm": 0.890625, + "learning_rate": 3.7194491652278876e-05, + "loss": 0.9574, + "step": 4612 + }, + { + "epoch": 0.3394251552080938, + "grad_norm": 0.796875, + "learning_rate": 3.718943545528607e-05, + "loss": 0.8347, + "step": 4613 + }, + { + "epoch": 0.3394987353414578, + "grad_norm": 1.046875, + "learning_rate": 3.718437860410929e-05, + "loss": 0.9657, + "step": 4614 + }, + { + "epoch": 0.3395723154748218, + "grad_norm": 0.859375, + "learning_rate": 3.7179321099019916e-05, + "loss": 0.8613, + "step": 4615 + }, + { + "epoch": 0.3396458956081858, + "grad_norm": 0.72265625, + "learning_rate": 3.717426294028938e-05, + "loss": 0.8471, + "step": 4616 + }, + { + "epoch": 0.33971947574154976, + "grad_norm": 0.87109375, + "learning_rate": 3.7169204128189154e-05, + "loss": 0.7595, + "step": 4617 + }, + { + "epoch": 0.33979305587491376, + "grad_norm": 0.88671875, + "learning_rate": 3.716414466299072e-05, + "loss": 0.6496, + "step": 4618 + }, + { + "epoch": 0.33986663600827777, + "grad_norm": 1.03125, + "learning_rate": 3.715908454496563e-05, + "loss": 1.1779, + "step": 4619 + }, + { + "epoch": 0.33994021614164177, + "grad_norm": 0.859375, + "learning_rate": 3.715402377438542e-05, + "loss": 1.117, + "step": 4620 + }, + { + "epoch": 0.3400137962750058, + "grad_norm": 1.0234375, + "learning_rate": 3.714896235152172e-05, + "loss": 1.1262, + "step": 4621 + }, + { + "epoch": 0.3400873764083697, + "grad_norm": 0.89453125, + "learning_rate": 3.714390027664615e-05, + "loss": 0.8273, + "step": 4622 + }, + { + "epoch": 0.3401609565417337, + "grad_norm": 0.78125, + "learning_rate": 3.713883755003039e-05, + "loss": 0.925, + "step": 4623 + }, + { + "epoch": 0.34023453667509773, + "grad_norm": 0.78125, + "learning_rate": 3.713377417194616e-05, + "loss": 0.9008, + "step": 4624 + }, + { + "epoch": 0.34030811680846174, + "grad_norm": 0.78125, + "learning_rate": 3.7128710142665166e-05, + "loss": 0.7594, + "step": 4625 + }, + { + "epoch": 0.3403816969418257, + "grad_norm": 0.8046875, + "learning_rate": 3.712364546245922e-05, + "loss": 0.8971, + "step": 4626 + }, + { + "epoch": 0.3404552770751897, + "grad_norm": 0.69140625, + "learning_rate": 3.711858013160012e-05, + "loss": 1.0136, + "step": 4627 + }, + { + "epoch": 0.3405288572085537, + "grad_norm": 0.921875, + "learning_rate": 3.711351415035971e-05, + "loss": 1.2684, + "step": 4628 + }, + { + "epoch": 0.3406024373419177, + "grad_norm": 0.83984375, + "learning_rate": 3.710844751900988e-05, + "loss": 0.8905, + "step": 4629 + }, + { + "epoch": 0.3406760174752817, + "grad_norm": 1.28125, + "learning_rate": 3.7103380237822525e-05, + "loss": 0.9414, + "step": 4630 + }, + { + "epoch": 0.34074959760864565, + "grad_norm": 1.140625, + "learning_rate": 3.7098312307069626e-05, + "loss": 1.092, + "step": 4631 + }, + { + "epoch": 0.34082317774200965, + "grad_norm": 0.89453125, + "learning_rate": 3.7093243727023154e-05, + "loss": 0.9318, + "step": 4632 + }, + { + "epoch": 0.34089675787537366, + "grad_norm": 0.63671875, + "learning_rate": 3.7088174497955136e-05, + "loss": 0.6058, + "step": 4633 + }, + { + "epoch": 0.34097033800873766, + "grad_norm": 0.87890625, + "learning_rate": 3.7083104620137624e-05, + "loss": 0.9116, + "step": 4634 + }, + { + "epoch": 0.3410439181421016, + "grad_norm": 1.0, + "learning_rate": 3.70780340938427e-05, + "loss": 1.1056, + "step": 4635 + }, + { + "epoch": 0.3411174982754656, + "grad_norm": 0.83203125, + "learning_rate": 3.70729629193425e-05, + "loss": 1.045, + "step": 4636 + }, + { + "epoch": 0.3411910784088296, + "grad_norm": 1.0859375, + "learning_rate": 3.706789109690919e-05, + "loss": 1.3281, + "step": 4637 + }, + { + "epoch": 0.3412646585421936, + "grad_norm": 0.61328125, + "learning_rate": 3.706281862681495e-05, + "loss": 0.6027, + "step": 4638 + }, + { + "epoch": 0.3413382386755576, + "grad_norm": 0.91796875, + "learning_rate": 3.705774550933202e-05, + "loss": 1.3672, + "step": 4639 + }, + { + "epoch": 0.3414118188089216, + "grad_norm": 1.0390625, + "learning_rate": 3.705267174473267e-05, + "loss": 1.2316, + "step": 4640 + }, + { + "epoch": 0.3414853989422856, + "grad_norm": 0.77734375, + "learning_rate": 3.704759733328918e-05, + "loss": 1.1305, + "step": 4641 + }, + { + "epoch": 0.3415589790756496, + "grad_norm": 0.87109375, + "learning_rate": 3.704252227527391e-05, + "loss": 1.2627, + "step": 4642 + }, + { + "epoch": 0.3416325592090136, + "grad_norm": 0.87890625, + "learning_rate": 3.703744657095919e-05, + "loss": 0.7576, + "step": 4643 + }, + { + "epoch": 0.34170613934237754, + "grad_norm": 0.9140625, + "learning_rate": 3.7032370220617476e-05, + "loss": 1.2629, + "step": 4644 + }, + { + "epoch": 0.34177971947574154, + "grad_norm": 0.81640625, + "learning_rate": 3.702729322452116e-05, + "loss": 0.898, + "step": 4645 + }, + { + "epoch": 0.34185329960910554, + "grad_norm": 1.046875, + "learning_rate": 3.702221558294274e-05, + "loss": 1.3024, + "step": 4646 + }, + { + "epoch": 0.34192687974246955, + "grad_norm": 0.78125, + "learning_rate": 3.701713729615471e-05, + "loss": 1.4048, + "step": 4647 + }, + { + "epoch": 0.34200045987583355, + "grad_norm": 0.87890625, + "learning_rate": 3.701205836442963e-05, + "loss": 0.8842, + "step": 4648 + }, + { + "epoch": 0.3420740400091975, + "grad_norm": 0.85546875, + "learning_rate": 3.700697878804006e-05, + "loss": 0.9754, + "step": 4649 + }, + { + "epoch": 0.3421476201425615, + "grad_norm": 0.9140625, + "learning_rate": 3.7001898567258605e-05, + "loss": 1.0291, + "step": 4650 + }, + { + "epoch": 0.3422212002759255, + "grad_norm": 0.8046875, + "learning_rate": 3.699681770235794e-05, + "loss": 0.9822, + "step": 4651 + }, + { + "epoch": 0.3422947804092895, + "grad_norm": 0.984375, + "learning_rate": 3.6991736193610724e-05, + "loss": 1.2103, + "step": 4652 + }, + { + "epoch": 0.34236836054265346, + "grad_norm": 0.97265625, + "learning_rate": 3.698665404128967e-05, + "loss": 1.2218, + "step": 4653 + }, + { + "epoch": 0.34244194067601746, + "grad_norm": 1.1015625, + "learning_rate": 3.698157124566753e-05, + "loss": 1.1661, + "step": 4654 + }, + { + "epoch": 0.34251552080938147, + "grad_norm": 1.0546875, + "learning_rate": 3.6976487807017104e-05, + "loss": 0.911, + "step": 4655 + }, + { + "epoch": 0.3425891009427455, + "grad_norm": 0.7421875, + "learning_rate": 3.6971403725611186e-05, + "loss": 0.7984, + "step": 4656 + }, + { + "epoch": 0.3426626810761095, + "grad_norm": 0.90234375, + "learning_rate": 3.6966319001722646e-05, + "loss": 0.6459, + "step": 4657 + }, + { + "epoch": 0.3427362612094734, + "grad_norm": 1.015625, + "learning_rate": 3.6961233635624364e-05, + "loss": 1.0434, + "step": 4658 + }, + { + "epoch": 0.34280984134283743, + "grad_norm": 1.0234375, + "learning_rate": 3.695614762758927e-05, + "loss": 1.3347, + "step": 4659 + }, + { + "epoch": 0.34288342147620143, + "grad_norm": 0.8515625, + "learning_rate": 3.6951060977890305e-05, + "loss": 0.828, + "step": 4660 + }, + { + "epoch": 0.34295700160956544, + "grad_norm": 1.0546875, + "learning_rate": 3.694597368680048e-05, + "loss": 0.766, + "step": 4661 + }, + { + "epoch": 0.3430305817429294, + "grad_norm": 1.0546875, + "learning_rate": 3.69408857545928e-05, + "loss": 1.3484, + "step": 4662 + }, + { + "epoch": 0.3431041618762934, + "grad_norm": 0.890625, + "learning_rate": 3.693579718154034e-05, + "loss": 0.6742, + "step": 4663 + }, + { + "epoch": 0.3431777420096574, + "grad_norm": 0.8984375, + "learning_rate": 3.693070796791619e-05, + "loss": 0.9636, + "step": 4664 + }, + { + "epoch": 0.3432513221430214, + "grad_norm": 0.82421875, + "learning_rate": 3.692561811399348e-05, + "loss": 0.7635, + "step": 4665 + }, + { + "epoch": 0.3433249022763854, + "grad_norm": 0.99609375, + "learning_rate": 3.692052762004536e-05, + "loss": 1.2835, + "step": 4666 + }, + { + "epoch": 0.34339848240974935, + "grad_norm": 0.90625, + "learning_rate": 3.691543648634505e-05, + "loss": 0.7873, + "step": 4667 + }, + { + "epoch": 0.34347206254311335, + "grad_norm": 1.0625, + "learning_rate": 3.691034471316576e-05, + "loss": 1.097, + "step": 4668 + }, + { + "epoch": 0.34354564267647736, + "grad_norm": 1.0625, + "learning_rate": 3.6905252300780765e-05, + "loss": 0.7454, + "step": 4669 + }, + { + "epoch": 0.34361922280984136, + "grad_norm": 0.91796875, + "learning_rate": 3.6900159249463364e-05, + "loss": 0.9617, + "step": 4670 + }, + { + "epoch": 0.3436928029432053, + "grad_norm": 0.7578125, + "learning_rate": 3.6895065559486894e-05, + "loss": 0.8249, + "step": 4671 + }, + { + "epoch": 0.3437663830765693, + "grad_norm": 1.0078125, + "learning_rate": 3.6889971231124725e-05, + "loss": 1.4166, + "step": 4672 + }, + { + "epoch": 0.3438399632099333, + "grad_norm": 0.703125, + "learning_rate": 3.688487626465025e-05, + "loss": 0.6122, + "step": 4673 + }, + { + "epoch": 0.3439135433432973, + "grad_norm": 0.8046875, + "learning_rate": 3.687978066033693e-05, + "loss": 0.8879, + "step": 4674 + }, + { + "epoch": 0.3439871234766613, + "grad_norm": 0.78125, + "learning_rate": 3.68746844184582e-05, + "loss": 0.8693, + "step": 4675 + }, + { + "epoch": 0.3440607036100253, + "grad_norm": 0.93359375, + "learning_rate": 3.686958753928759e-05, + "loss": 1.2325, + "step": 4676 + }, + { + "epoch": 0.3441342837433893, + "grad_norm": 0.8828125, + "learning_rate": 3.686449002309864e-05, + "loss": 0.9057, + "step": 4677 + }, + { + "epoch": 0.3442078638767533, + "grad_norm": 0.90234375, + "learning_rate": 3.685939187016492e-05, + "loss": 1.0158, + "step": 4678 + }, + { + "epoch": 0.3442814440101173, + "grad_norm": 0.8046875, + "learning_rate": 3.6854293080760036e-05, + "loss": 0.5932, + "step": 4679 + }, + { + "epoch": 0.34435502414348124, + "grad_norm": 0.8125, + "learning_rate": 3.684919365515762e-05, + "loss": 0.7456, + "step": 4680 + }, + { + "epoch": 0.34442860427684524, + "grad_norm": 0.609375, + "learning_rate": 3.684409359363138e-05, + "loss": 0.5749, + "step": 4681 + }, + { + "epoch": 0.34450218441020924, + "grad_norm": 0.66796875, + "learning_rate": 3.6838992896455e-05, + "loss": 0.7502, + "step": 4682 + }, + { + "epoch": 0.34457576454357325, + "grad_norm": 0.95703125, + "learning_rate": 3.6833891563902225e-05, + "loss": 0.8746, + "step": 4683 + }, + { + "epoch": 0.34464934467693725, + "grad_norm": 0.78125, + "learning_rate": 3.6828789596246846e-05, + "loss": 0.9033, + "step": 4684 + }, + { + "epoch": 0.3447229248103012, + "grad_norm": 0.98828125, + "learning_rate": 3.682368699376268e-05, + "loss": 1.0363, + "step": 4685 + }, + { + "epoch": 0.3447965049436652, + "grad_norm": 1.015625, + "learning_rate": 3.681858375672355e-05, + "loss": 1.1239, + "step": 4686 + }, + { + "epoch": 0.3448700850770292, + "grad_norm": 0.79296875, + "learning_rate": 3.6813479885403355e-05, + "loss": 0.8014, + "step": 4687 + }, + { + "epoch": 0.3449436652103932, + "grad_norm": 0.83984375, + "learning_rate": 3.680837538007601e-05, + "loss": 1.2095, + "step": 4688 + }, + { + "epoch": 0.34501724534375716, + "grad_norm": 0.7578125, + "learning_rate": 3.6803270241015465e-05, + "loss": 0.6501, + "step": 4689 + }, + { + "epoch": 0.34509082547712117, + "grad_norm": 0.859375, + "learning_rate": 3.6798164468495696e-05, + "loss": 1.0622, + "step": 4690 + }, + { + "epoch": 0.34516440561048517, + "grad_norm": 0.87890625, + "learning_rate": 3.679305806279072e-05, + "loss": 0.8261, + "step": 4691 + }, + { + "epoch": 0.3452379857438492, + "grad_norm": 0.87890625, + "learning_rate": 3.67879510241746e-05, + "loss": 0.7577, + "step": 4692 + }, + { + "epoch": 0.3453115658772132, + "grad_norm": 1.03125, + "learning_rate": 3.67828433529214e-05, + "loss": 1.22, + "step": 4693 + }, + { + "epoch": 0.3453851460105771, + "grad_norm": 0.92578125, + "learning_rate": 3.677773504930526e-05, + "loss": 0.9988, + "step": 4694 + }, + { + "epoch": 0.34545872614394113, + "grad_norm": 1.0625, + "learning_rate": 3.677262611360033e-05, + "loss": 1.5305, + "step": 4695 + }, + { + "epoch": 0.34553230627730513, + "grad_norm": 0.7109375, + "learning_rate": 3.6767516546080786e-05, + "loss": 0.6164, + "step": 4696 + }, + { + "epoch": 0.34560588641066914, + "grad_norm": 1.0078125, + "learning_rate": 3.6762406347020856e-05, + "loss": 1.3206, + "step": 4697 + }, + { + "epoch": 0.3456794665440331, + "grad_norm": 0.83984375, + "learning_rate": 3.675729551669479e-05, + "loss": 0.7833, + "step": 4698 + }, + { + "epoch": 0.3457530466773971, + "grad_norm": 0.69921875, + "learning_rate": 3.6752184055376886e-05, + "loss": 0.6527, + "step": 4699 + }, + { + "epoch": 0.3458266268107611, + "grad_norm": 0.91015625, + "learning_rate": 3.6747071963341454e-05, + "loss": 1.1474, + "step": 4700 + }, + { + "epoch": 0.3459002069441251, + "grad_norm": 0.71875, + "learning_rate": 3.674195924086287e-05, + "loss": 0.599, + "step": 4701 + }, + { + "epoch": 0.3459737870774891, + "grad_norm": 0.96484375, + "learning_rate": 3.673684588821549e-05, + "loss": 0.8061, + "step": 4702 + }, + { + "epoch": 0.34604736721085305, + "grad_norm": 0.87890625, + "learning_rate": 3.6731731905673774e-05, + "loss": 1.018, + "step": 4703 + }, + { + "epoch": 0.34612094734421706, + "grad_norm": 0.7578125, + "learning_rate": 3.672661729351216e-05, + "loss": 0.7041, + "step": 4704 + }, + { + "epoch": 0.34619452747758106, + "grad_norm": 0.953125, + "learning_rate": 3.672150205200514e-05, + "loss": 0.9247, + "step": 4705 + }, + { + "epoch": 0.34626810761094506, + "grad_norm": 0.98828125, + "learning_rate": 3.671638618142725e-05, + "loss": 0.936, + "step": 4706 + }, + { + "epoch": 0.346341687744309, + "grad_norm": 0.6953125, + "learning_rate": 3.671126968205304e-05, + "loss": 0.629, + "step": 4707 + }, + { + "epoch": 0.346415267877673, + "grad_norm": 1.1640625, + "learning_rate": 3.670615255415711e-05, + "loss": 1.2083, + "step": 4708 + }, + { + "epoch": 0.346488848011037, + "grad_norm": 0.69140625, + "learning_rate": 3.670103479801408e-05, + "loss": 0.7944, + "step": 4709 + }, + { + "epoch": 0.346562428144401, + "grad_norm": 0.8515625, + "learning_rate": 3.6695916413898603e-05, + "loss": 0.7856, + "step": 4710 + }, + { + "epoch": 0.34663600827776503, + "grad_norm": 0.796875, + "learning_rate": 3.669079740208539e-05, + "loss": 0.9476, + "step": 4711 + }, + { + "epoch": 0.346709588411129, + "grad_norm": 0.62109375, + "learning_rate": 3.6685677762849165e-05, + "loss": 0.6409, + "step": 4712 + }, + { + "epoch": 0.346783168544493, + "grad_norm": 0.97265625, + "learning_rate": 3.6680557496464684e-05, + "loss": 1.1041, + "step": 4713 + }, + { + "epoch": 0.346856748677857, + "grad_norm": 0.7734375, + "learning_rate": 3.667543660320674e-05, + "loss": 0.7868, + "step": 4714 + }, + { + "epoch": 0.346930328811221, + "grad_norm": 0.87109375, + "learning_rate": 3.667031508335017e-05, + "loss": 1.2149, + "step": 4715 + }, + { + "epoch": 0.34700390894458494, + "grad_norm": 0.84765625, + "learning_rate": 3.666519293716983e-05, + "loss": 0.9748, + "step": 4716 + }, + { + "epoch": 0.34707748907794894, + "grad_norm": 0.75390625, + "learning_rate": 3.6660070164940615e-05, + "loss": 0.925, + "step": 4717 + }, + { + "epoch": 0.34715106921131295, + "grad_norm": 0.75, + "learning_rate": 3.665494676693745e-05, + "loss": 0.7965, + "step": 4718 + }, + { + "epoch": 0.34722464934467695, + "grad_norm": 0.91015625, + "learning_rate": 3.664982274343531e-05, + "loss": 0.7948, + "step": 4719 + }, + { + "epoch": 0.34729822947804095, + "grad_norm": 0.85546875, + "learning_rate": 3.66446980947092e-05, + "loss": 0.8549, + "step": 4720 + }, + { + "epoch": 0.3473718096114049, + "grad_norm": 0.73046875, + "learning_rate": 3.663957282103412e-05, + "loss": 0.9215, + "step": 4721 + }, + { + "epoch": 0.3474453897447689, + "grad_norm": 0.65234375, + "learning_rate": 3.663444692268517e-05, + "loss": 0.7448, + "step": 4722 + }, + { + "epoch": 0.3475189698781329, + "grad_norm": 0.7734375, + "learning_rate": 3.6629320399937414e-05, + "loss": 0.9834, + "step": 4723 + }, + { + "epoch": 0.3475925500114969, + "grad_norm": 0.734375, + "learning_rate": 3.6624193253065996e-05, + "loss": 1.0663, + "step": 4724 + }, + { + "epoch": 0.34766613014486086, + "grad_norm": 0.79296875, + "learning_rate": 3.6619065482346084e-05, + "loss": 0.5999, + "step": 4725 + }, + { + "epoch": 0.34773971027822487, + "grad_norm": 0.96875, + "learning_rate": 3.6613937088052876e-05, + "loss": 0.9717, + "step": 4726 + }, + { + "epoch": 0.34781329041158887, + "grad_norm": 0.8359375, + "learning_rate": 3.660880807046159e-05, + "loss": 0.9176, + "step": 4727 + }, + { + "epoch": 0.3478868705449529, + "grad_norm": 0.7734375, + "learning_rate": 3.660367842984751e-05, + "loss": 1.0512, + "step": 4728 + }, + { + "epoch": 0.3479604506783169, + "grad_norm": 3.578125, + "learning_rate": 3.659854816648592e-05, + "loss": 1.1632, + "step": 4729 + }, + { + "epoch": 0.34803403081168083, + "grad_norm": 0.9140625, + "learning_rate": 3.6593417280652164e-05, + "loss": 0.7006, + "step": 4730 + }, + { + "epoch": 0.34810761094504483, + "grad_norm": 0.90234375, + "learning_rate": 3.6588285772621586e-05, + "loss": 0.9917, + "step": 4731 + }, + { + "epoch": 0.34818119107840884, + "grad_norm": 0.98828125, + "learning_rate": 3.658315364266961e-05, + "loss": 1.096, + "step": 4732 + }, + { + "epoch": 0.34825477121177284, + "grad_norm": 0.85546875, + "learning_rate": 3.657802089107165e-05, + "loss": 0.9812, + "step": 4733 + }, + { + "epoch": 0.3483283513451368, + "grad_norm": 0.62109375, + "learning_rate": 3.657288751810318e-05, + "loss": 0.6361, + "step": 4734 + }, + { + "epoch": 0.3484019314785008, + "grad_norm": 0.73046875, + "learning_rate": 3.656775352403969e-05, + "loss": 0.6014, + "step": 4735 + }, + { + "epoch": 0.3484755116118648, + "grad_norm": 0.86328125, + "learning_rate": 3.6562618909156726e-05, + "loss": 0.8307, + "step": 4736 + }, + { + "epoch": 0.3485490917452288, + "grad_norm": 0.89453125, + "learning_rate": 3.6557483673729834e-05, + "loss": 1.0536, + "step": 4737 + }, + { + "epoch": 0.3486226718785928, + "grad_norm": 0.87890625, + "learning_rate": 3.6552347818034626e-05, + "loss": 1.0115, + "step": 4738 + }, + { + "epoch": 0.34869625201195675, + "grad_norm": 1.0, + "learning_rate": 3.654721134234673e-05, + "loss": 1.311, + "step": 4739 + }, + { + "epoch": 0.34876983214532076, + "grad_norm": 0.78515625, + "learning_rate": 3.6542074246941816e-05, + "loss": 0.9436, + "step": 4740 + }, + { + "epoch": 0.34884341227868476, + "grad_norm": 0.8515625, + "learning_rate": 3.6536936532095565e-05, + "loss": 0.9542, + "step": 4741 + }, + { + "epoch": 0.34891699241204877, + "grad_norm": 0.91015625, + "learning_rate": 3.653179819808373e-05, + "loss": 1.1838, + "step": 4742 + }, + { + "epoch": 0.3489905725454127, + "grad_norm": 0.953125, + "learning_rate": 3.652665924518206e-05, + "loss": 0.938, + "step": 4743 + }, + { + "epoch": 0.3490641526787767, + "grad_norm": 0.8515625, + "learning_rate": 3.652151967366637e-05, + "loss": 0.9013, + "step": 4744 + }, + { + "epoch": 0.3491377328121407, + "grad_norm": 5.28125, + "learning_rate": 3.651637948381247e-05, + "loss": 1.3578, + "step": 4745 + }, + { + "epoch": 0.3492113129455047, + "grad_norm": 0.80078125, + "learning_rate": 3.651123867589623e-05, + "loss": 0.8332, + "step": 4746 + }, + { + "epoch": 0.34928489307886873, + "grad_norm": 0.984375, + "learning_rate": 3.650609725019356e-05, + "loss": 0.8459, + "step": 4747 + }, + { + "epoch": 0.3493584732122327, + "grad_norm": 0.79296875, + "learning_rate": 3.650095520698038e-05, + "loss": 0.8967, + "step": 4748 + }, + { + "epoch": 0.3494320533455967, + "grad_norm": 0.765625, + "learning_rate": 3.6495812546532664e-05, + "loss": 0.8182, + "step": 4749 + }, + { + "epoch": 0.3495056334789607, + "grad_norm": 0.7890625, + "learning_rate": 3.649066926912639e-05, + "loss": 0.9408, + "step": 4750 + }, + { + "epoch": 0.3495792136123247, + "grad_norm": 0.80859375, + "learning_rate": 3.64855253750376e-05, + "loss": 0.6891, + "step": 4751 + }, + { + "epoch": 0.34965279374568864, + "grad_norm": 0.6953125, + "learning_rate": 3.648038086454236e-05, + "loss": 0.6999, + "step": 4752 + }, + { + "epoch": 0.34972637387905264, + "grad_norm": 0.84375, + "learning_rate": 3.647523573791677e-05, + "loss": 0.8032, + "step": 4753 + }, + { + "epoch": 0.34979995401241665, + "grad_norm": 0.87109375, + "learning_rate": 3.647008999543694e-05, + "loss": 1.1803, + "step": 4754 + }, + { + "epoch": 0.34987353414578065, + "grad_norm": 1.109375, + "learning_rate": 3.6464943637379053e-05, + "loss": 1.5012, + "step": 4755 + }, + { + "epoch": 0.34994711427914466, + "grad_norm": 0.890625, + "learning_rate": 3.645979666401929e-05, + "loss": 1.2367, + "step": 4756 + }, + { + "epoch": 0.3500206944125086, + "grad_norm": 1.09375, + "learning_rate": 3.6454649075633885e-05, + "loss": 1.372, + "step": 4757 + }, + { + "epoch": 0.3500942745458726, + "grad_norm": 0.7890625, + "learning_rate": 3.64495008724991e-05, + "loss": 0.8756, + "step": 4758 + }, + { + "epoch": 0.3501678546792366, + "grad_norm": 0.98828125, + "learning_rate": 3.644435205489122e-05, + "loss": 1.0225, + "step": 4759 + }, + { + "epoch": 0.3502414348126006, + "grad_norm": 0.7890625, + "learning_rate": 3.643920262308659e-05, + "loss": 0.677, + "step": 4760 + }, + { + "epoch": 0.35031501494596456, + "grad_norm": 1.203125, + "learning_rate": 3.6434052577361556e-05, + "loss": 1.448, + "step": 4761 + }, + { + "epoch": 0.35038859507932857, + "grad_norm": 0.74609375, + "learning_rate": 3.642890191799252e-05, + "loss": 0.6937, + "step": 4762 + }, + { + "epoch": 0.3504621752126926, + "grad_norm": 0.609375, + "learning_rate": 3.642375064525591e-05, + "loss": 0.6267, + "step": 4763 + }, + { + "epoch": 0.3505357553460566, + "grad_norm": 0.87109375, + "learning_rate": 3.641859875942816e-05, + "loss": 1.0058, + "step": 4764 + }, + { + "epoch": 0.3506093354794206, + "grad_norm": 1.3046875, + "learning_rate": 3.6413446260785797e-05, + "loss": 1.1573, + "step": 4765 + }, + { + "epoch": 0.35068291561278453, + "grad_norm": 0.73046875, + "learning_rate": 3.640829314960532e-05, + "loss": 0.7338, + "step": 4766 + }, + { + "epoch": 0.35075649574614853, + "grad_norm": 0.8671875, + "learning_rate": 3.64031394261633e-05, + "loss": 1.1617, + "step": 4767 + }, + { + "epoch": 0.35083007587951254, + "grad_norm": 0.9140625, + "learning_rate": 3.6397985090736336e-05, + "loss": 0.8588, + "step": 4768 + }, + { + "epoch": 0.35090365601287654, + "grad_norm": 0.76953125, + "learning_rate": 3.639283014360103e-05, + "loss": 0.822, + "step": 4769 + }, + { + "epoch": 0.3509772361462405, + "grad_norm": 0.8671875, + "learning_rate": 3.638767458503405e-05, + "loss": 0.8289, + "step": 4770 + }, + { + "epoch": 0.3510508162796045, + "grad_norm": 4.53125, + "learning_rate": 3.638251841531208e-05, + "loss": 0.7602, + "step": 4771 + }, + { + "epoch": 0.3511243964129685, + "grad_norm": 0.984375, + "learning_rate": 3.637736163471185e-05, + "loss": 1.2078, + "step": 4772 + }, + { + "epoch": 0.3511979765463325, + "grad_norm": 0.94140625, + "learning_rate": 3.63722042435101e-05, + "loss": 0.9631, + "step": 4773 + }, + { + "epoch": 0.3512715566796965, + "grad_norm": 0.6328125, + "learning_rate": 3.6367046241983644e-05, + "loss": 0.6661, + "step": 4774 + }, + { + "epoch": 0.35134513681306045, + "grad_norm": 0.74609375, + "learning_rate": 3.636188763040928e-05, + "loss": 0.9251, + "step": 4775 + }, + { + "epoch": 0.35141871694642446, + "grad_norm": 0.93359375, + "learning_rate": 3.6356728409063856e-05, + "loss": 1.3103, + "step": 4776 + }, + { + "epoch": 0.35149229707978846, + "grad_norm": 0.88671875, + "learning_rate": 3.6351568578224275e-05, + "loss": 1.4042, + "step": 4777 + }, + { + "epoch": 0.35156587721315247, + "grad_norm": 0.85546875, + "learning_rate": 3.6346408138167455e-05, + "loss": 0.6177, + "step": 4778 + }, + { + "epoch": 0.3516394573465164, + "grad_norm": 0.7421875, + "learning_rate": 3.634124708917033e-05, + "loss": 0.868, + "step": 4779 + }, + { + "epoch": 0.3517130374798804, + "grad_norm": 0.90234375, + "learning_rate": 3.6336085431509895e-05, + "loss": 0.7677, + "step": 4780 + }, + { + "epoch": 0.3517866176132444, + "grad_norm": 0.7578125, + "learning_rate": 3.6330923165463174e-05, + "loss": 0.8498, + "step": 4781 + }, + { + "epoch": 0.3518601977466084, + "grad_norm": 0.7890625, + "learning_rate": 3.63257602913072e-05, + "loss": 0.8257, + "step": 4782 + }, + { + "epoch": 0.35193377787997243, + "grad_norm": 0.87890625, + "learning_rate": 3.6320596809319064e-05, + "loss": 0.7374, + "step": 4783 + }, + { + "epoch": 0.3520073580133364, + "grad_norm": 1.1640625, + "learning_rate": 3.6315432719775885e-05, + "loss": 1.3143, + "step": 4784 + }, + { + "epoch": 0.3520809381467004, + "grad_norm": 0.9296875, + "learning_rate": 3.63102680229548e-05, + "loss": 1.0398, + "step": 4785 + }, + { + "epoch": 0.3521545182800644, + "grad_norm": 0.95703125, + "learning_rate": 3.630510271913298e-05, + "loss": 0.9623, + "step": 4786 + }, + { + "epoch": 0.3522280984134284, + "grad_norm": 0.87109375, + "learning_rate": 3.6299936808587666e-05, + "loss": 0.877, + "step": 4787 + }, + { + "epoch": 0.35230167854679234, + "grad_norm": 0.74609375, + "learning_rate": 3.629477029159608e-05, + "loss": 0.7758, + "step": 4788 + }, + { + "epoch": 0.35237525868015634, + "grad_norm": 1.0234375, + "learning_rate": 3.6289603168435504e-05, + "loss": 1.0819, + "step": 4789 + }, + { + "epoch": 0.35244883881352035, + "grad_norm": 0.7109375, + "learning_rate": 3.6284435439383254e-05, + "loss": 0.7601, + "step": 4790 + }, + { + "epoch": 0.35252241894688435, + "grad_norm": 0.9765625, + "learning_rate": 3.6279267104716664e-05, + "loss": 1.6886, + "step": 4791 + }, + { + "epoch": 0.35259599908024836, + "grad_norm": 0.796875, + "learning_rate": 3.627409816471311e-05, + "loss": 0.8755, + "step": 4792 + }, + { + "epoch": 0.3526695792136123, + "grad_norm": 1.015625, + "learning_rate": 3.626892861965e-05, + "loss": 1.0923, + "step": 4793 + }, + { + "epoch": 0.3527431593469763, + "grad_norm": 0.890625, + "learning_rate": 3.626375846980477e-05, + "loss": 0.8383, + "step": 4794 + }, + { + "epoch": 0.3528167394803403, + "grad_norm": 0.7421875, + "learning_rate": 3.625858771545492e-05, + "loss": 0.8668, + "step": 4795 + }, + { + "epoch": 0.3528903196137043, + "grad_norm": 0.84375, + "learning_rate": 3.625341635687791e-05, + "loss": 1.0451, + "step": 4796 + }, + { + "epoch": 0.35296389974706827, + "grad_norm": 0.77734375, + "learning_rate": 3.624824439435132e-05, + "loss": 0.8534, + "step": 4797 + }, + { + "epoch": 0.35303747988043227, + "grad_norm": 0.7265625, + "learning_rate": 3.624307182815268e-05, + "loss": 0.8749, + "step": 4798 + }, + { + "epoch": 0.3531110600137963, + "grad_norm": 1.171875, + "learning_rate": 3.623789865855961e-05, + "loss": 0.9576, + "step": 4799 + }, + { + "epoch": 0.3531846401471603, + "grad_norm": 0.921875, + "learning_rate": 3.6232724885849745e-05, + "loss": 0.9915, + "step": 4800 + }, + { + "epoch": 0.3532582202805243, + "grad_norm": 1.1796875, + "learning_rate": 3.6227550510300754e-05, + "loss": 1.1721, + "step": 4801 + }, + { + "epoch": 0.35333180041388823, + "grad_norm": 0.69140625, + "learning_rate": 3.622237553219034e-05, + "loss": 0.8471, + "step": 4802 + }, + { + "epoch": 0.35340538054725223, + "grad_norm": 0.9140625, + "learning_rate": 3.621719995179622e-05, + "loss": 1.0099, + "step": 4803 + }, + { + "epoch": 0.35347896068061624, + "grad_norm": 0.78125, + "learning_rate": 3.6212023769396165e-05, + "loss": 0.7313, + "step": 4804 + }, + { + "epoch": 0.35355254081398024, + "grad_norm": 0.83203125, + "learning_rate": 3.620684698526797e-05, + "loss": 0.7327, + "step": 4805 + }, + { + "epoch": 0.3536261209473442, + "grad_norm": 0.9140625, + "learning_rate": 3.6201669599689465e-05, + "loss": 0.7027, + "step": 4806 + }, + { + "epoch": 0.3536997010807082, + "grad_norm": 0.92578125, + "learning_rate": 3.619649161293851e-05, + "loss": 1.1363, + "step": 4807 + }, + { + "epoch": 0.3537732812140722, + "grad_norm": 0.69140625, + "learning_rate": 3.6191313025292996e-05, + "loss": 0.7394, + "step": 4808 + }, + { + "epoch": 0.3538468613474362, + "grad_norm": 0.80859375, + "learning_rate": 3.6186133837030856e-05, + "loss": 0.8749, + "step": 4809 + }, + { + "epoch": 0.3539204414808002, + "grad_norm": 0.82421875, + "learning_rate": 3.618095404843003e-05, + "loss": 0.9417, + "step": 4810 + }, + { + "epoch": 0.35399402161416416, + "grad_norm": 0.8984375, + "learning_rate": 3.617577365976853e-05, + "loss": 0.8377, + "step": 4811 + }, + { + "epoch": 0.35406760174752816, + "grad_norm": 0.79296875, + "learning_rate": 3.617059267132435e-05, + "loss": 0.9462, + "step": 4812 + }, + { + "epoch": 0.35414118188089216, + "grad_norm": 0.91015625, + "learning_rate": 3.6165411083375575e-05, + "loss": 0.8838, + "step": 4813 + }, + { + "epoch": 0.35421476201425617, + "grad_norm": 0.87890625, + "learning_rate": 3.6160228896200265e-05, + "loss": 1.2216, + "step": 4814 + }, + { + "epoch": 0.3542883421476201, + "grad_norm": 0.9453125, + "learning_rate": 3.6155046110076554e-05, + "loss": 0.9618, + "step": 4815 + }, + { + "epoch": 0.3543619222809841, + "grad_norm": 0.80859375, + "learning_rate": 3.6149862725282587e-05, + "loss": 0.8811, + "step": 4816 + }, + { + "epoch": 0.3544355024143481, + "grad_norm": 0.7421875, + "learning_rate": 3.614467874209654e-05, + "loss": 1.0577, + "step": 4817 + }, + { + "epoch": 0.35450908254771213, + "grad_norm": 0.734375, + "learning_rate": 3.613949416079665e-05, + "loss": 0.5258, + "step": 4818 + }, + { + "epoch": 0.35458266268107613, + "grad_norm": 0.91796875, + "learning_rate": 3.613430898166113e-05, + "loss": 1.047, + "step": 4819 + }, + { + "epoch": 0.3546562428144401, + "grad_norm": 0.87109375, + "learning_rate": 3.612912320496829e-05, + "loss": 0.9518, + "step": 4820 + }, + { + "epoch": 0.3547298229478041, + "grad_norm": 0.9921875, + "learning_rate": 3.612393683099642e-05, + "loss": 1.0906, + "step": 4821 + }, + { + "epoch": 0.3548034030811681, + "grad_norm": 0.828125, + "learning_rate": 3.611874986002387e-05, + "loss": 0.935, + "step": 4822 + }, + { + "epoch": 0.3548769832145321, + "grad_norm": 0.66796875, + "learning_rate": 3.6113562292329016e-05, + "loss": 0.6102, + "step": 4823 + }, + { + "epoch": 0.35495056334789604, + "grad_norm": 1.0078125, + "learning_rate": 3.610837412819027e-05, + "loss": 1.1, + "step": 4824 + }, + { + "epoch": 0.35502414348126005, + "grad_norm": 0.96484375, + "learning_rate": 3.610318536788606e-05, + "loss": 1.2297, + "step": 4825 + }, + { + "epoch": 0.35509772361462405, + "grad_norm": 0.734375, + "learning_rate": 3.609799601169486e-05, + "loss": 0.7288, + "step": 4826 + }, + { + "epoch": 0.35517130374798805, + "grad_norm": 0.8515625, + "learning_rate": 3.609280605989518e-05, + "loss": 0.8733, + "step": 4827 + }, + { + "epoch": 0.35524488388135206, + "grad_norm": 0.98046875, + "learning_rate": 3.608761551276555e-05, + "loss": 1.189, + "step": 4828 + }, + { + "epoch": 0.355318464014716, + "grad_norm": 0.6875, + "learning_rate": 3.608242437058454e-05, + "loss": 0.6602, + "step": 4829 + }, + { + "epoch": 0.35539204414808, + "grad_norm": 0.859375, + "learning_rate": 3.607723263363074e-05, + "loss": 0.7282, + "step": 4830 + }, + { + "epoch": 0.355465624281444, + "grad_norm": 0.75, + "learning_rate": 3.6072040302182784e-05, + "loss": 0.9348, + "step": 4831 + }, + { + "epoch": 0.355539204414808, + "grad_norm": 0.83984375, + "learning_rate": 3.6066847376519345e-05, + "loss": 1.2901, + "step": 4832 + }, + { + "epoch": 0.35561278454817197, + "grad_norm": 0.98046875, + "learning_rate": 3.60616538569191e-05, + "loss": 1.0461, + "step": 4833 + }, + { + "epoch": 0.35568636468153597, + "grad_norm": 0.75390625, + "learning_rate": 3.60564597436608e-05, + "loss": 0.6251, + "step": 4834 + }, + { + "epoch": 0.3557599448149, + "grad_norm": 0.87109375, + "learning_rate": 3.6051265037023176e-05, + "loss": 0.9186, + "step": 4835 + }, + { + "epoch": 0.355833524948264, + "grad_norm": 0.83203125, + "learning_rate": 3.604606973728504e-05, + "loss": 0.984, + "step": 4836 + }, + { + "epoch": 0.355907105081628, + "grad_norm": 1.046875, + "learning_rate": 3.604087384472519e-05, + "loss": 1.1697, + "step": 4837 + }, + { + "epoch": 0.35598068521499193, + "grad_norm": 0.7890625, + "learning_rate": 3.603567735962251e-05, + "loss": 0.9941, + "step": 4838 + }, + { + "epoch": 0.35605426534835594, + "grad_norm": 0.80078125, + "learning_rate": 3.603048028225585e-05, + "loss": 1.1957, + "step": 4839 + }, + { + "epoch": 0.35612784548171994, + "grad_norm": 0.92578125, + "learning_rate": 3.6025282612904157e-05, + "loss": 1.4154, + "step": 4840 + }, + { + "epoch": 0.35620142561508394, + "grad_norm": 2.328125, + "learning_rate": 3.602008435184638e-05, + "loss": 1.1748, + "step": 4841 + }, + { + "epoch": 0.3562750057484479, + "grad_norm": 0.76953125, + "learning_rate": 3.6014885499361476e-05, + "loss": 1.0554, + "step": 4842 + }, + { + "epoch": 0.3563485858818119, + "grad_norm": 0.796875, + "learning_rate": 3.600968605572848e-05, + "loss": 0.9385, + "step": 4843 + }, + { + "epoch": 0.3564221660151759, + "grad_norm": 0.7578125, + "learning_rate": 3.600448602122643e-05, + "loss": 0.729, + "step": 4844 + }, + { + "epoch": 0.3564957461485399, + "grad_norm": 0.796875, + "learning_rate": 3.599928539613439e-05, + "loss": 0.7915, + "step": 4845 + }, + { + "epoch": 0.3565693262819039, + "grad_norm": 1.015625, + "learning_rate": 3.599408418073147e-05, + "loss": 0.8664, + "step": 4846 + }, + { + "epoch": 0.35664290641526786, + "grad_norm": 0.796875, + "learning_rate": 3.598888237529684e-05, + "loss": 0.7553, + "step": 4847 + }, + { + "epoch": 0.35671648654863186, + "grad_norm": 0.859375, + "learning_rate": 3.598367998010963e-05, + "loss": 0.9498, + "step": 4848 + }, + { + "epoch": 0.35679006668199587, + "grad_norm": 0.9296875, + "learning_rate": 3.5978476995449066e-05, + "loss": 0.9862, + "step": 4849 + }, + { + "epoch": 0.35686364681535987, + "grad_norm": 0.83203125, + "learning_rate": 3.597327342159438e-05, + "loss": 0.981, + "step": 4850 + }, + { + "epoch": 0.3569372269487238, + "grad_norm": 0.828125, + "learning_rate": 3.596806925882483e-05, + "loss": 0.9306, + "step": 4851 + }, + { + "epoch": 0.3570108070820878, + "grad_norm": 0.86328125, + "learning_rate": 3.596286450741973e-05, + "loss": 0.6954, + "step": 4852 + }, + { + "epoch": 0.3570843872154518, + "grad_norm": 0.73828125, + "learning_rate": 3.595765916765838e-05, + "loss": 0.77, + "step": 4853 + }, + { + "epoch": 0.35715796734881583, + "grad_norm": 1.0625, + "learning_rate": 3.595245323982017e-05, + "loss": 0.8674, + "step": 4854 + }, + { + "epoch": 0.35723154748217983, + "grad_norm": 0.828125, + "learning_rate": 3.594724672418448e-05, + "loss": 1.17, + "step": 4855 + }, + { + "epoch": 0.3573051276155438, + "grad_norm": 1.234375, + "learning_rate": 3.594203962103073e-05, + "loss": 0.8265, + "step": 4856 + }, + { + "epoch": 0.3573787077489078, + "grad_norm": 0.8984375, + "learning_rate": 3.5936831930638395e-05, + "loss": 0.9776, + "step": 4857 + }, + { + "epoch": 0.3574522878822718, + "grad_norm": 0.98828125, + "learning_rate": 3.593162365328693e-05, + "loss": 1.1234, + "step": 4858 + }, + { + "epoch": 0.3575258680156358, + "grad_norm": 0.828125, + "learning_rate": 3.5926414789255875e-05, + "loss": 0.9733, + "step": 4859 + }, + { + "epoch": 0.35759944814899974, + "grad_norm": 0.9453125, + "learning_rate": 3.592120533882477e-05, + "loss": 1.1785, + "step": 4860 + }, + { + "epoch": 0.35767302828236375, + "grad_norm": 0.83203125, + "learning_rate": 3.5915995302273214e-05, + "loss": 1.1252, + "step": 4861 + }, + { + "epoch": 0.35774660841572775, + "grad_norm": 1.046875, + "learning_rate": 3.5910784679880805e-05, + "loss": 1.2581, + "step": 4862 + }, + { + "epoch": 0.35782018854909176, + "grad_norm": 0.9765625, + "learning_rate": 3.590557347192719e-05, + "loss": 1.158, + "step": 4863 + }, + { + "epoch": 0.35789376868245576, + "grad_norm": 0.78125, + "learning_rate": 3.590036167869203e-05, + "loss": 0.921, + "step": 4864 + }, + { + "epoch": 0.3579673488158197, + "grad_norm": 0.83203125, + "learning_rate": 3.5895149300455063e-05, + "loss": 0.9467, + "step": 4865 + }, + { + "epoch": 0.3580409289491837, + "grad_norm": 1.0546875, + "learning_rate": 3.5889936337496e-05, + "loss": 1.5529, + "step": 4866 + }, + { + "epoch": 0.3581145090825477, + "grad_norm": 0.84375, + "learning_rate": 3.5884722790094626e-05, + "loss": 1.187, + "step": 4867 + }, + { + "epoch": 0.3581880892159117, + "grad_norm": 0.87890625, + "learning_rate": 3.587950865853075e-05, + "loss": 1.0382, + "step": 4868 + }, + { + "epoch": 0.35826166934927567, + "grad_norm": 1.09375, + "learning_rate": 3.587429394308418e-05, + "loss": 1.2916, + "step": 4869 + }, + { + "epoch": 0.3583352494826397, + "grad_norm": 0.90625, + "learning_rate": 3.586907864403479e-05, + "loss": 0.7197, + "step": 4870 + }, + { + "epoch": 0.3584088296160037, + "grad_norm": 0.88671875, + "learning_rate": 3.5863862761662485e-05, + "loss": 0.9155, + "step": 4871 + }, + { + "epoch": 0.3584824097493677, + "grad_norm": 0.8515625, + "learning_rate": 3.5858646296247186e-05, + "loss": 1.0277, + "step": 4872 + }, + { + "epoch": 0.3585559898827317, + "grad_norm": 0.8359375, + "learning_rate": 3.585342924806884e-05, + "loss": 1.0115, + "step": 4873 + }, + { + "epoch": 0.35862957001609563, + "grad_norm": 0.78125, + "learning_rate": 3.584821161740745e-05, + "loss": 0.9582, + "step": 4874 + }, + { + "epoch": 0.35870315014945964, + "grad_norm": 0.875, + "learning_rate": 3.5842993404543034e-05, + "loss": 1.0438, + "step": 4875 + }, + { + "epoch": 0.35877673028282364, + "grad_norm": 0.75390625, + "learning_rate": 3.583777460975564e-05, + "loss": 0.9466, + "step": 4876 + }, + { + "epoch": 0.35885031041618765, + "grad_norm": 1.0234375, + "learning_rate": 3.583255523332536e-05, + "loss": 1.8376, + "step": 4877 + }, + { + "epoch": 0.3589238905495516, + "grad_norm": 0.96484375, + "learning_rate": 3.5827335275532293e-05, + "loss": 1.1748, + "step": 4878 + }, + { + "epoch": 0.3589974706829156, + "grad_norm": 0.859375, + "learning_rate": 3.582211473665659e-05, + "loss": 0.6493, + "step": 4879 + }, + { + "epoch": 0.3590710508162796, + "grad_norm": 0.76171875, + "learning_rate": 3.581689361697843e-05, + "loss": 1.1134, + "step": 4880 + }, + { + "epoch": 0.3591446309496436, + "grad_norm": 0.91015625, + "learning_rate": 3.581167191677802e-05, + "loss": 0.9733, + "step": 4881 + }, + { + "epoch": 0.3592182110830076, + "grad_norm": 0.80859375, + "learning_rate": 3.5806449636335606e-05, + "loss": 0.8222, + "step": 4882 + }, + { + "epoch": 0.35929179121637156, + "grad_norm": 0.94140625, + "learning_rate": 3.580122677593144e-05, + "loss": 1.1154, + "step": 4883 + }, + { + "epoch": 0.35936537134973556, + "grad_norm": 0.84375, + "learning_rate": 3.579600333584584e-05, + "loss": 0.85, + "step": 4884 + }, + { + "epoch": 0.35943895148309957, + "grad_norm": 0.96484375, + "learning_rate": 3.579077931635913e-05, + "loss": 0.74, + "step": 4885 + }, + { + "epoch": 0.35951253161646357, + "grad_norm": 1.0234375, + "learning_rate": 3.578555471775167e-05, + "loss": 0.8637, + "step": 4886 + }, + { + "epoch": 0.3595861117498275, + "grad_norm": 0.89453125, + "learning_rate": 3.5780329540303865e-05, + "loss": 0.859, + "step": 4887 + }, + { + "epoch": 0.3596596918831915, + "grad_norm": 0.8125, + "learning_rate": 3.5775103784296135e-05, + "loss": 0.8033, + "step": 4888 + }, + { + "epoch": 0.3597332720165555, + "grad_norm": 0.81640625, + "learning_rate": 3.5769877450008935e-05, + "loss": 0.6638, + "step": 4889 + }, + { + "epoch": 0.35980685214991953, + "grad_norm": 1.015625, + "learning_rate": 3.576465053772275e-05, + "loss": 1.4388, + "step": 4890 + }, + { + "epoch": 0.35988043228328354, + "grad_norm": 0.90234375, + "learning_rate": 3.575942304771811e-05, + "loss": 1.4897, + "step": 4891 + }, + { + "epoch": 0.3599540124166475, + "grad_norm": 0.90234375, + "learning_rate": 3.5754194980275554e-05, + "loss": 1.3071, + "step": 4892 + }, + { + "epoch": 0.3600275925500115, + "grad_norm": 0.8359375, + "learning_rate": 3.574896633567566e-05, + "loss": 0.7576, + "step": 4893 + }, + { + "epoch": 0.3601011726833755, + "grad_norm": 1.0390625, + "learning_rate": 3.5743737114199045e-05, + "loss": 1.513, + "step": 4894 + }, + { + "epoch": 0.3601747528167395, + "grad_norm": 0.73046875, + "learning_rate": 3.573850731612636e-05, + "loss": 0.8965, + "step": 4895 + }, + { + "epoch": 0.36024833295010344, + "grad_norm": 0.7890625, + "learning_rate": 3.573327694173826e-05, + "loss": 0.7815, + "step": 4896 + }, + { + "epoch": 0.36032191308346745, + "grad_norm": 0.8984375, + "learning_rate": 3.572804599131546e-05, + "loss": 0.9349, + "step": 4897 + }, + { + "epoch": 0.36039549321683145, + "grad_norm": 0.90234375, + "learning_rate": 3.57228144651387e-05, + "loss": 0.8704, + "step": 4898 + }, + { + "epoch": 0.36046907335019546, + "grad_norm": 0.9140625, + "learning_rate": 3.5717582363488735e-05, + "loss": 1.1795, + "step": 4899 + }, + { + "epoch": 0.36054265348355946, + "grad_norm": 0.83984375, + "learning_rate": 3.5712349686646365e-05, + "loss": 0.9342, + "step": 4900 + }, + { + "epoch": 0.3606162336169234, + "grad_norm": 1.0, + "learning_rate": 3.5707116434892424e-05, + "loss": 1.1258, + "step": 4901 + }, + { + "epoch": 0.3606898137502874, + "grad_norm": 0.8515625, + "learning_rate": 3.570188260850777e-05, + "loss": 0.8858, + "step": 4902 + }, + { + "epoch": 0.3607633938836514, + "grad_norm": 0.8046875, + "learning_rate": 3.569664820777329e-05, + "loss": 0.8702, + "step": 4903 + }, + { + "epoch": 0.3608369740170154, + "grad_norm": 1.015625, + "learning_rate": 3.56914132329699e-05, + "loss": 1.3725, + "step": 4904 + }, + { + "epoch": 0.36091055415037937, + "grad_norm": 0.734375, + "learning_rate": 3.568617768437855e-05, + "loss": 0.6883, + "step": 4905 + }, + { + "epoch": 0.3609841342837434, + "grad_norm": 0.79296875, + "learning_rate": 3.5680941562280235e-05, + "loss": 0.8077, + "step": 4906 + }, + { + "epoch": 0.3610577144171074, + "grad_norm": 1.0078125, + "learning_rate": 3.5675704866955954e-05, + "loss": 1.11, + "step": 4907 + }, + { + "epoch": 0.3611312945504714, + "grad_norm": 0.98828125, + "learning_rate": 3.5670467598686756e-05, + "loss": 1.0383, + "step": 4908 + }, + { + "epoch": 0.3612048746838354, + "grad_norm": 0.7578125, + "learning_rate": 3.566522975775373e-05, + "loss": 0.7719, + "step": 4909 + }, + { + "epoch": 0.36127845481719933, + "grad_norm": 0.90625, + "learning_rate": 3.565999134443795e-05, + "loss": 1.0769, + "step": 4910 + }, + { + "epoch": 0.36135203495056334, + "grad_norm": 1.0390625, + "learning_rate": 3.565475235902058e-05, + "loss": 1.0033, + "step": 4911 + }, + { + "epoch": 0.36142561508392734, + "grad_norm": 0.94921875, + "learning_rate": 3.5649512801782764e-05, + "loss": 1.0606, + "step": 4912 + }, + { + "epoch": 0.36149919521729135, + "grad_norm": 0.91796875, + "learning_rate": 3.5644272673005715e-05, + "loss": 0.9322, + "step": 4913 + }, + { + "epoch": 0.3615727753506553, + "grad_norm": 0.87890625, + "learning_rate": 3.5639031972970646e-05, + "loss": 0.9531, + "step": 4914 + }, + { + "epoch": 0.3616463554840193, + "grad_norm": 0.59765625, + "learning_rate": 3.563379070195883e-05, + "loss": 0.8194, + "step": 4915 + }, + { + "epoch": 0.3617199356173833, + "grad_norm": 0.8828125, + "learning_rate": 3.5628548860251565e-05, + "loss": 0.8207, + "step": 4916 + }, + { + "epoch": 0.3617935157507473, + "grad_norm": 0.828125, + "learning_rate": 3.562330644813015e-05, + "loss": 1.0512, + "step": 4917 + }, + { + "epoch": 0.3618670958841113, + "grad_norm": 0.86328125, + "learning_rate": 3.561806346587594e-05, + "loss": 0.8682, + "step": 4918 + }, + { + "epoch": 0.36194067601747526, + "grad_norm": 0.91796875, + "learning_rate": 3.5612819913770325e-05, + "loss": 0.864, + "step": 4919 + }, + { + "epoch": 0.36201425615083926, + "grad_norm": 0.81640625, + "learning_rate": 3.56075757920947e-05, + "loss": 0.788, + "step": 4920 + }, + { + "epoch": 0.36208783628420327, + "grad_norm": 1.4765625, + "learning_rate": 3.560233110113052e-05, + "loss": 1.4926, + "step": 4921 + }, + { + "epoch": 0.3621614164175673, + "grad_norm": 0.78125, + "learning_rate": 3.559708584115925e-05, + "loss": 0.6587, + "step": 4922 + }, + { + "epoch": 0.3622349965509312, + "grad_norm": 1.0625, + "learning_rate": 3.559184001246241e-05, + "loss": 0.9058, + "step": 4923 + }, + { + "epoch": 0.3623085766842952, + "grad_norm": 0.8046875, + "learning_rate": 3.558659361532152e-05, + "loss": 0.9397, + "step": 4924 + }, + { + "epoch": 0.36238215681765923, + "grad_norm": 0.86328125, + "learning_rate": 3.558134665001814e-05, + "loss": 1.0313, + "step": 4925 + }, + { + "epoch": 0.36245573695102323, + "grad_norm": 0.88671875, + "learning_rate": 3.5576099116833874e-05, + "loss": 0.9287, + "step": 4926 + }, + { + "epoch": 0.36252931708438724, + "grad_norm": 0.91015625, + "learning_rate": 3.557085101605034e-05, + "loss": 0.8493, + "step": 4927 + }, + { + "epoch": 0.3626028972177512, + "grad_norm": 0.72265625, + "learning_rate": 3.55656023479492e-05, + "loss": 0.6525, + "step": 4928 + }, + { + "epoch": 0.3626764773511152, + "grad_norm": 0.91015625, + "learning_rate": 3.556035311281213e-05, + "loss": 1.2842, + "step": 4929 + }, + { + "epoch": 0.3627500574844792, + "grad_norm": 0.8125, + "learning_rate": 3.555510331092087e-05, + "loss": 1.1034, + "step": 4930 + }, + { + "epoch": 0.3628236376178432, + "grad_norm": 1.6171875, + "learning_rate": 3.554985294255714e-05, + "loss": 1.0617, + "step": 4931 + }, + { + "epoch": 0.3628972177512072, + "grad_norm": 0.984375, + "learning_rate": 3.554460200800273e-05, + "loss": 0.876, + "step": 4932 + }, + { + "epoch": 0.36297079788457115, + "grad_norm": 0.66796875, + "learning_rate": 3.553935050753945e-05, + "loss": 0.7704, + "step": 4933 + }, + { + "epoch": 0.36304437801793515, + "grad_norm": 0.76953125, + "learning_rate": 3.553409844144913e-05, + "loss": 0.6025, + "step": 4934 + }, + { + "epoch": 0.36311795815129916, + "grad_norm": 1.171875, + "learning_rate": 3.552884581001364e-05, + "loss": 0.7969, + "step": 4935 + }, + { + "epoch": 0.36319153828466316, + "grad_norm": 0.7421875, + "learning_rate": 3.5523592613514886e-05, + "loss": 0.7456, + "step": 4936 + }, + { + "epoch": 0.3632651184180271, + "grad_norm": 0.98046875, + "learning_rate": 3.551833885223479e-05, + "loss": 1.182, + "step": 4937 + }, + { + "epoch": 0.3633386985513911, + "grad_norm": 0.94140625, + "learning_rate": 3.551308452645532e-05, + "loss": 1.0166, + "step": 4938 + }, + { + "epoch": 0.3634122786847551, + "grad_norm": 1.0, + "learning_rate": 3.5507829636458454e-05, + "loss": 1.0311, + "step": 4939 + }, + { + "epoch": 0.3634858588181191, + "grad_norm": 0.84375, + "learning_rate": 3.550257418252622e-05, + "loss": 0.6928, + "step": 4940 + }, + { + "epoch": 0.3635594389514831, + "grad_norm": 0.83984375, + "learning_rate": 3.5497318164940665e-05, + "loss": 1.1217, + "step": 4941 + }, + { + "epoch": 0.3636330190848471, + "grad_norm": 0.93359375, + "learning_rate": 3.549206158398387e-05, + "loss": 1.3708, + "step": 4942 + }, + { + "epoch": 0.3637065992182111, + "grad_norm": 0.8515625, + "learning_rate": 3.5486804439937954e-05, + "loss": 0.8204, + "step": 4943 + }, + { + "epoch": 0.3637801793515751, + "grad_norm": 0.74609375, + "learning_rate": 3.548154673308504e-05, + "loss": 0.6268, + "step": 4944 + }, + { + "epoch": 0.3638537594849391, + "grad_norm": 1.21875, + "learning_rate": 3.547628846370731e-05, + "loss": 1.4384, + "step": 4945 + }, + { + "epoch": 0.36392733961830304, + "grad_norm": 0.73046875, + "learning_rate": 3.547102963208698e-05, + "loss": 0.9204, + "step": 4946 + }, + { + "epoch": 0.36400091975166704, + "grad_norm": 0.7265625, + "learning_rate": 3.546577023850625e-05, + "loss": 0.8621, + "step": 4947 + }, + { + "epoch": 0.36407449988503104, + "grad_norm": 0.78515625, + "learning_rate": 3.546051028324741e-05, + "loss": 0.9103, + "step": 4948 + }, + { + "epoch": 0.36414808001839505, + "grad_norm": 0.71484375, + "learning_rate": 3.545524976659273e-05, + "loss": 0.9211, + "step": 4949 + }, + { + "epoch": 0.36422166015175905, + "grad_norm": 1.1953125, + "learning_rate": 3.544998868882455e-05, + "loss": 1.1029, + "step": 4950 + }, + { + "epoch": 0.364295240285123, + "grad_norm": 0.83203125, + "learning_rate": 3.544472705022521e-05, + "loss": 0.831, + "step": 4951 + }, + { + "epoch": 0.364368820418487, + "grad_norm": 0.734375, + "learning_rate": 3.5439464851077096e-05, + "loss": 0.7651, + "step": 4952 + }, + { + "epoch": 0.364442400551851, + "grad_norm": 0.79296875, + "learning_rate": 3.5434202091662625e-05, + "loss": 1.0167, + "step": 4953 + }, + { + "epoch": 0.364515980685215, + "grad_norm": 0.9140625, + "learning_rate": 3.542893877226423e-05, + "loss": 1.185, + "step": 4954 + }, + { + "epoch": 0.36458956081857896, + "grad_norm": 0.82421875, + "learning_rate": 3.542367489316439e-05, + "loss": 0.7352, + "step": 4955 + }, + { + "epoch": 0.36466314095194297, + "grad_norm": 0.8515625, + "learning_rate": 3.5418410454645606e-05, + "loss": 0.7912, + "step": 4956 + }, + { + "epoch": 0.36473672108530697, + "grad_norm": 0.98828125, + "learning_rate": 3.5413145456990414e-05, + "loss": 0.9649, + "step": 4957 + }, + { + "epoch": 0.364810301218671, + "grad_norm": 0.7421875, + "learning_rate": 3.540787990048137e-05, + "loss": 0.7054, + "step": 4958 + }, + { + "epoch": 0.364883881352035, + "grad_norm": 0.89453125, + "learning_rate": 3.5402613785401076e-05, + "loss": 0.8387, + "step": 4959 + }, + { + "epoch": 0.3649574614853989, + "grad_norm": 0.8203125, + "learning_rate": 3.539734711203214e-05, + "loss": 0.9519, + "step": 4960 + }, + { + "epoch": 0.36503104161876293, + "grad_norm": 0.8515625, + "learning_rate": 3.539207988065722e-05, + "loss": 0.8418, + "step": 4961 + }, + { + "epoch": 0.36510462175212693, + "grad_norm": 0.7734375, + "learning_rate": 3.5386812091559e-05, + "loss": 0.8378, + "step": 4962 + }, + { + "epoch": 0.36517820188549094, + "grad_norm": 0.828125, + "learning_rate": 3.53815437450202e-05, + "loss": 1.0, + "step": 4963 + }, + { + "epoch": 0.3652517820188549, + "grad_norm": 1.125, + "learning_rate": 3.5376274841323557e-05, + "loss": 1.3167, + "step": 4964 + }, + { + "epoch": 0.3653253621522189, + "grad_norm": 0.87890625, + "learning_rate": 3.537100538075183e-05, + "loss": 1.0696, + "step": 4965 + }, + { + "epoch": 0.3653989422855829, + "grad_norm": 1.109375, + "learning_rate": 3.536573536358785e-05, + "loss": 1.2404, + "step": 4966 + }, + { + "epoch": 0.3654725224189469, + "grad_norm": 0.98046875, + "learning_rate": 3.5360464790114416e-05, + "loss": 0.9548, + "step": 4967 + }, + { + "epoch": 0.3655461025523109, + "grad_norm": 0.8359375, + "learning_rate": 3.535519366061441e-05, + "loss": 1.042, + "step": 4968 + }, + { + "epoch": 0.36561968268567485, + "grad_norm": 1.0, + "learning_rate": 3.534992197537071e-05, + "loss": 1.0249, + "step": 4969 + }, + { + "epoch": 0.36569326281903886, + "grad_norm": 0.78515625, + "learning_rate": 3.5344649734666256e-05, + "loss": 0.7701, + "step": 4970 + }, + { + "epoch": 0.36576684295240286, + "grad_norm": 1.09375, + "learning_rate": 3.533937693878399e-05, + "loss": 0.9024, + "step": 4971 + }, + { + "epoch": 0.36584042308576686, + "grad_norm": 0.95703125, + "learning_rate": 3.533410358800689e-05, + "loss": 1.2592, + "step": 4972 + }, + { + "epoch": 0.3659140032191308, + "grad_norm": 0.70703125, + "learning_rate": 3.532882968261797e-05, + "loss": 0.8673, + "step": 4973 + }, + { + "epoch": 0.3659875833524948, + "grad_norm": 1.2578125, + "learning_rate": 3.532355522290026e-05, + "loss": 1.1793, + "step": 4974 + }, + { + "epoch": 0.3660611634858588, + "grad_norm": 0.88671875, + "learning_rate": 3.531828020913685e-05, + "loss": 0.9704, + "step": 4975 + }, + { + "epoch": 0.3661347436192228, + "grad_norm": 0.9296875, + "learning_rate": 3.5313004641610825e-05, + "loss": 0.7921, + "step": 4976 + }, + { + "epoch": 0.36620832375258683, + "grad_norm": 0.66015625, + "learning_rate": 3.530772852060532e-05, + "loss": 0.616, + "step": 4977 + }, + { + "epoch": 0.3662819038859508, + "grad_norm": 0.86328125, + "learning_rate": 3.5302451846403496e-05, + "loss": 1.0702, + "step": 4978 + }, + { + "epoch": 0.3663554840193148, + "grad_norm": 0.8203125, + "learning_rate": 3.5297174619288536e-05, + "loss": 0.9164, + "step": 4979 + }, + { + "epoch": 0.3664290641526788, + "grad_norm": 0.78515625, + "learning_rate": 3.529189683954367e-05, + "loss": 0.931, + "step": 4980 + }, + { + "epoch": 0.3665026442860428, + "grad_norm": 1.1171875, + "learning_rate": 3.528661850745213e-05, + "loss": 0.9794, + "step": 4981 + }, + { + "epoch": 0.36657622441940674, + "grad_norm": 0.890625, + "learning_rate": 3.528133962329721e-05, + "loss": 0.8532, + "step": 4982 + }, + { + "epoch": 0.36664980455277074, + "grad_norm": 0.921875, + "learning_rate": 3.52760601873622e-05, + "loss": 1.0545, + "step": 4983 + }, + { + "epoch": 0.36672338468613475, + "grad_norm": 0.92578125, + "learning_rate": 3.5270780199930465e-05, + "loss": 1.0776, + "step": 4984 + }, + { + "epoch": 0.36679696481949875, + "grad_norm": 0.89453125, + "learning_rate": 3.526549966128535e-05, + "loss": 1.0756, + "step": 4985 + }, + { + "epoch": 0.36687054495286275, + "grad_norm": 0.85546875, + "learning_rate": 3.526021857171025e-05, + "loss": 1.2265, + "step": 4986 + }, + { + "epoch": 0.3669441250862267, + "grad_norm": 0.8984375, + "learning_rate": 3.525493693148861e-05, + "loss": 1.0376, + "step": 4987 + }, + { + "epoch": 0.3670177052195907, + "grad_norm": 0.82421875, + "learning_rate": 3.524965474090386e-05, + "loss": 0.8881, + "step": 4988 + }, + { + "epoch": 0.3670912853529547, + "grad_norm": 0.7890625, + "learning_rate": 3.524437200023951e-05, + "loss": 0.7254, + "step": 4989 + }, + { + "epoch": 0.3671648654863187, + "grad_norm": 0.86328125, + "learning_rate": 3.523908870977906e-05, + "loss": 0.8913, + "step": 4990 + }, + { + "epoch": 0.36723844561968266, + "grad_norm": 0.78515625, + "learning_rate": 3.523380486980605e-05, + "loss": 0.9651, + "step": 4991 + }, + { + "epoch": 0.36731202575304667, + "grad_norm": 0.828125, + "learning_rate": 3.522852048060407e-05, + "loss": 0.7655, + "step": 4992 + }, + { + "epoch": 0.36738560588641067, + "grad_norm": 0.89453125, + "learning_rate": 3.522323554245671e-05, + "loss": 1.1908, + "step": 4993 + }, + { + "epoch": 0.3674591860197747, + "grad_norm": 0.99609375, + "learning_rate": 3.5217950055647616e-05, + "loss": 1.0507, + "step": 4994 + }, + { + "epoch": 0.3675327661531387, + "grad_norm": 0.859375, + "learning_rate": 3.521266402046044e-05, + "loss": 0.756, + "step": 4995 + }, + { + "epoch": 0.3676063462865026, + "grad_norm": 0.73046875, + "learning_rate": 3.520737743717886e-05, + "loss": 0.7442, + "step": 4996 + }, + { + "epoch": 0.36767992641986663, + "grad_norm": 0.9609375, + "learning_rate": 3.520209030608662e-05, + "loss": 0.8751, + "step": 4997 + }, + { + "epoch": 0.36775350655323064, + "grad_norm": 0.96484375, + "learning_rate": 3.519680262746747e-05, + "loss": 1.0225, + "step": 4998 + }, + { + "epoch": 0.36782708668659464, + "grad_norm": 0.9140625, + "learning_rate": 3.5191514401605176e-05, + "loss": 0.9934, + "step": 4999 + }, + { + "epoch": 0.3679006668199586, + "grad_norm": 0.84765625, + "learning_rate": 3.518622562878356e-05, + "loss": 1.2016, + "step": 5000 + }, + { + "epoch": 0.3679742469533226, + "grad_norm": 0.86328125, + "learning_rate": 3.518093630928644e-05, + "loss": 0.695, + "step": 5001 + }, + { + "epoch": 0.3680478270866866, + "grad_norm": 0.76953125, + "learning_rate": 3.51756464433977e-05, + "loss": 1.1051, + "step": 5002 + }, + { + "epoch": 0.3681214072200506, + "grad_norm": 0.76171875, + "learning_rate": 3.517035603140125e-05, + "loss": 0.8496, + "step": 5003 + }, + { + "epoch": 0.3681949873534146, + "grad_norm": 0.7578125, + "learning_rate": 3.516506507358099e-05, + "loss": 0.8501, + "step": 5004 + }, + { + "epoch": 0.36826856748677855, + "grad_norm": 0.73828125, + "learning_rate": 3.515977357022089e-05, + "loss": 0.8302, + "step": 5005 + }, + { + "epoch": 0.36834214762014256, + "grad_norm": 0.828125, + "learning_rate": 3.515448152160492e-05, + "loss": 0.8431, + "step": 5006 + }, + { + "epoch": 0.36841572775350656, + "grad_norm": 0.9375, + "learning_rate": 3.514918892801712e-05, + "loss": 1.1764, + "step": 5007 + }, + { + "epoch": 0.36848930788687057, + "grad_norm": 0.83203125, + "learning_rate": 3.5143895789741516e-05, + "loss": 1.0617, + "step": 5008 + }, + { + "epoch": 0.3685628880202345, + "grad_norm": 0.9296875, + "learning_rate": 3.5138602107062174e-05, + "loss": 0.7635, + "step": 5009 + }, + { + "epoch": 0.3686364681535985, + "grad_norm": 0.9140625, + "learning_rate": 3.513330788026322e-05, + "loss": 0.8319, + "step": 5010 + }, + { + "epoch": 0.3687100482869625, + "grad_norm": 1.1484375, + "learning_rate": 3.512801310962876e-05, + "loss": 0.9136, + "step": 5011 + }, + { + "epoch": 0.3687836284203265, + "grad_norm": 0.859375, + "learning_rate": 3.512271779544297e-05, + "loss": 0.8254, + "step": 5012 + }, + { + "epoch": 0.36885720855369053, + "grad_norm": 0.99609375, + "learning_rate": 3.5117421937990036e-05, + "loss": 1.3523, + "step": 5013 + }, + { + "epoch": 0.3689307886870545, + "grad_norm": 0.9765625, + "learning_rate": 3.511212553755418e-05, + "loss": 1.2158, + "step": 5014 + }, + { + "epoch": 0.3690043688204185, + "grad_norm": 0.76171875, + "learning_rate": 3.510682859441964e-05, + "loss": 1.1216, + "step": 5015 + }, + { + "epoch": 0.3690779489537825, + "grad_norm": 0.90625, + "learning_rate": 3.51015311088707e-05, + "loss": 1.0272, + "step": 5016 + }, + { + "epoch": 0.3691515290871465, + "grad_norm": 1.078125, + "learning_rate": 3.509623308119166e-05, + "loss": 1.136, + "step": 5017 + }, + { + "epoch": 0.36922510922051044, + "grad_norm": 0.94140625, + "learning_rate": 3.509093451166686e-05, + "loss": 1.0315, + "step": 5018 + }, + { + "epoch": 0.36929868935387444, + "grad_norm": 1.1640625, + "learning_rate": 3.5085635400580675e-05, + "loss": 1.2436, + "step": 5019 + }, + { + "epoch": 0.36937226948723845, + "grad_norm": 0.91015625, + "learning_rate": 3.508033574821747e-05, + "loss": 1.3433, + "step": 5020 + }, + { + "epoch": 0.36944584962060245, + "grad_norm": 0.64453125, + "learning_rate": 3.50750355548617e-05, + "loss": 0.6279, + "step": 5021 + }, + { + "epoch": 0.36951942975396646, + "grad_norm": 0.76171875, + "learning_rate": 3.506973482079779e-05, + "loss": 0.6971, + "step": 5022 + }, + { + "epoch": 0.3695930098873304, + "grad_norm": 0.78515625, + "learning_rate": 3.5064433546310236e-05, + "loss": 0.6801, + "step": 5023 + }, + { + "epoch": 0.3696665900206944, + "grad_norm": 0.87890625, + "learning_rate": 3.505913173168354e-05, + "loss": 0.8268, + "step": 5024 + }, + { + "epoch": 0.3697401701540584, + "grad_norm": 0.88671875, + "learning_rate": 3.505382937720225e-05, + "loss": 0.9737, + "step": 5025 + }, + { + "epoch": 0.3698137502874224, + "grad_norm": 0.7734375, + "learning_rate": 3.504852648315092e-05, + "loss": 0.8916, + "step": 5026 + }, + { + "epoch": 0.36988733042078636, + "grad_norm": 0.74609375, + "learning_rate": 3.5043223049814156e-05, + "loss": 0.9102, + "step": 5027 + }, + { + "epoch": 0.36996091055415037, + "grad_norm": 0.83984375, + "learning_rate": 3.5037919077476576e-05, + "loss": 1.4119, + "step": 5028 + }, + { + "epoch": 0.3700344906875144, + "grad_norm": 0.73046875, + "learning_rate": 3.503261456642284e-05, + "loss": 0.8671, + "step": 5029 + }, + { + "epoch": 0.3701080708208784, + "grad_norm": 0.83203125, + "learning_rate": 3.502730951693763e-05, + "loss": 1.2578, + "step": 5030 + }, + { + "epoch": 0.3701816509542424, + "grad_norm": 0.875, + "learning_rate": 3.502200392930565e-05, + "loss": 0.7428, + "step": 5031 + }, + { + "epoch": 0.37025523108760633, + "grad_norm": 0.7890625, + "learning_rate": 3.501669780381166e-05, + "loss": 0.8569, + "step": 5032 + }, + { + "epoch": 0.37032881122097033, + "grad_norm": 0.96875, + "learning_rate": 3.501139114074041e-05, + "loss": 0.7413, + "step": 5033 + }, + { + "epoch": 0.37040239135433434, + "grad_norm": 0.77734375, + "learning_rate": 3.5006083940376714e-05, + "loss": 0.9027, + "step": 5034 + }, + { + "epoch": 0.37047597148769834, + "grad_norm": 1.1796875, + "learning_rate": 3.5000776203005384e-05, + "loss": 1.1972, + "step": 5035 + }, + { + "epoch": 0.3705495516210623, + "grad_norm": 0.66015625, + "learning_rate": 3.499546792891128e-05, + "loss": 0.7057, + "step": 5036 + }, + { + "epoch": 0.3706231317544263, + "grad_norm": 0.796875, + "learning_rate": 3.49901591183793e-05, + "loss": 0.7118, + "step": 5037 + }, + { + "epoch": 0.3706967118877903, + "grad_norm": 0.8203125, + "learning_rate": 3.4984849771694345e-05, + "loss": 0.8855, + "step": 5038 + }, + { + "epoch": 0.3707702920211543, + "grad_norm": 0.9765625, + "learning_rate": 3.497953988914137e-05, + "loss": 0.9269, + "step": 5039 + }, + { + "epoch": 0.3708438721545183, + "grad_norm": 0.9296875, + "learning_rate": 3.497422947100533e-05, + "loss": 1.2386, + "step": 5040 + }, + { + "epoch": 0.37091745228788225, + "grad_norm": 0.66796875, + "learning_rate": 3.496891851757123e-05, + "loss": 0.5296, + "step": 5041 + }, + { + "epoch": 0.37099103242124626, + "grad_norm": 0.76953125, + "learning_rate": 3.4963607029124094e-05, + "loss": 0.9789, + "step": 5042 + }, + { + "epoch": 0.37106461255461026, + "grad_norm": 0.69140625, + "learning_rate": 3.4958295005949e-05, + "loss": 0.7716, + "step": 5043 + }, + { + "epoch": 0.37113819268797427, + "grad_norm": 0.8984375, + "learning_rate": 3.495298244833102e-05, + "loss": 0.9389, + "step": 5044 + }, + { + "epoch": 0.3712117728213382, + "grad_norm": 0.64453125, + "learning_rate": 3.494766935655527e-05, + "loss": 0.5788, + "step": 5045 + }, + { + "epoch": 0.3712853529547022, + "grad_norm": 1.1328125, + "learning_rate": 3.49423557309069e-05, + "loss": 0.6275, + "step": 5046 + }, + { + "epoch": 0.3713589330880662, + "grad_norm": 0.859375, + "learning_rate": 3.493704157167107e-05, + "loss": 0.8667, + "step": 5047 + }, + { + "epoch": 0.3714325132214302, + "grad_norm": 0.9296875, + "learning_rate": 3.493172687913299e-05, + "loss": 0.875, + "step": 5048 + }, + { + "epoch": 0.37150609335479423, + "grad_norm": 0.89453125, + "learning_rate": 3.492641165357788e-05, + "loss": 0.8389, + "step": 5049 + }, + { + "epoch": 0.3715796734881582, + "grad_norm": 0.91015625, + "learning_rate": 3.492109589529101e-05, + "loss": 1.293, + "step": 5050 + }, + { + "epoch": 0.3716532536215222, + "grad_norm": 0.93359375, + "learning_rate": 3.491577960455766e-05, + "loss": 0.9775, + "step": 5051 + }, + { + "epoch": 0.3717268337548862, + "grad_norm": 0.6640625, + "learning_rate": 3.4910462781663146e-05, + "loss": 0.8613, + "step": 5052 + }, + { + "epoch": 0.3718004138882502, + "grad_norm": 0.98828125, + "learning_rate": 3.4905145426892824e-05, + "loss": 1.4802, + "step": 5053 + }, + { + "epoch": 0.37187399402161414, + "grad_norm": 0.7734375, + "learning_rate": 3.489982754053204e-05, + "loss": 0.7911, + "step": 5054 + }, + { + "epoch": 0.37194757415497814, + "grad_norm": 0.65234375, + "learning_rate": 3.4894509122866216e-05, + "loss": 0.7443, + "step": 5055 + }, + { + "epoch": 0.37202115428834215, + "grad_norm": 0.7421875, + "learning_rate": 3.4889190174180776e-05, + "loss": 0.6426, + "step": 5056 + }, + { + "epoch": 0.37209473442170615, + "grad_norm": 1.0, + "learning_rate": 3.4883870694761175e-05, + "loss": 1.1715, + "step": 5057 + }, + { + "epoch": 0.37216831455507016, + "grad_norm": 0.7734375, + "learning_rate": 3.48785506848929e-05, + "loss": 0.7959, + "step": 5058 + }, + { + "epoch": 0.3722418946884341, + "grad_norm": 0.9140625, + "learning_rate": 3.487323014486147e-05, + "loss": 0.7008, + "step": 5059 + }, + { + "epoch": 0.3723154748217981, + "grad_norm": 0.71875, + "learning_rate": 3.486790907495243e-05, + "loss": 0.5899, + "step": 5060 + }, + { + "epoch": 0.3723890549551621, + "grad_norm": 0.73828125, + "learning_rate": 3.486258747545135e-05, + "loss": 0.7655, + "step": 5061 + }, + { + "epoch": 0.3724626350885261, + "grad_norm": 0.83984375, + "learning_rate": 3.485726534664382e-05, + "loss": 0.7658, + "step": 5062 + }, + { + "epoch": 0.37253621522189007, + "grad_norm": 0.81640625, + "learning_rate": 3.485194268881547e-05, + "loss": 1.0227, + "step": 5063 + }, + { + "epoch": 0.37260979535525407, + "grad_norm": 0.94921875, + "learning_rate": 3.484661950225198e-05, + "loss": 1.016, + "step": 5064 + }, + { + "epoch": 0.3726833754886181, + "grad_norm": 0.984375, + "learning_rate": 3.4841295787239015e-05, + "loss": 1.0784, + "step": 5065 + }, + { + "epoch": 0.3727569556219821, + "grad_norm": 0.78515625, + "learning_rate": 3.483597154406228e-05, + "loss": 0.9107, + "step": 5066 + }, + { + "epoch": 0.3728305357553461, + "grad_norm": 0.9453125, + "learning_rate": 3.4830646773007544e-05, + "loss": 1.178, + "step": 5067 + }, + { + "epoch": 0.37290411588871003, + "grad_norm": 0.80078125, + "learning_rate": 3.482532147436056e-05, + "loss": 1.0666, + "step": 5068 + }, + { + "epoch": 0.37297769602207403, + "grad_norm": 0.890625, + "learning_rate": 3.4819995648407125e-05, + "loss": 1.0449, + "step": 5069 + }, + { + "epoch": 0.37305127615543804, + "grad_norm": 0.8359375, + "learning_rate": 3.481466929543308e-05, + "loss": 0.8709, + "step": 5070 + }, + { + "epoch": 0.37312485628880204, + "grad_norm": 0.875, + "learning_rate": 3.4809342415724266e-05, + "loss": 0.8263, + "step": 5071 + }, + { + "epoch": 0.373198436422166, + "grad_norm": 0.67578125, + "learning_rate": 3.480401500956657e-05, + "loss": 0.7905, + "step": 5072 + }, + { + "epoch": 0.37327201655553, + "grad_norm": 0.89453125, + "learning_rate": 3.479868707724591e-05, + "loss": 1.0048, + "step": 5073 + }, + { + "epoch": 0.373345596688894, + "grad_norm": 0.9609375, + "learning_rate": 3.479335861904822e-05, + "loss": 0.8132, + "step": 5074 + }, + { + "epoch": 0.373419176822258, + "grad_norm": 0.9140625, + "learning_rate": 3.478802963525947e-05, + "loss": 0.8332, + "step": 5075 + }, + { + "epoch": 0.373492756955622, + "grad_norm": 0.65234375, + "learning_rate": 3.478270012616565e-05, + "loss": 0.8352, + "step": 5076 + }, + { + "epoch": 0.37356633708898596, + "grad_norm": 0.90625, + "learning_rate": 3.477737009205279e-05, + "loss": 0.9812, + "step": 5077 + }, + { + "epoch": 0.37363991722234996, + "grad_norm": 1.046875, + "learning_rate": 3.4772039533206954e-05, + "loss": 0.9344, + "step": 5078 + }, + { + "epoch": 0.37371349735571396, + "grad_norm": 0.82421875, + "learning_rate": 3.476670844991421e-05, + "loss": 0.9901, + "step": 5079 + }, + { + "epoch": 0.37378707748907797, + "grad_norm": 0.890625, + "learning_rate": 3.476137684246067e-05, + "loss": 1.0108, + "step": 5080 + }, + { + "epoch": 0.3738606576224419, + "grad_norm": 1.0078125, + "learning_rate": 3.475604471113247e-05, + "loss": 1.5467, + "step": 5081 + }, + { + "epoch": 0.3739342377558059, + "grad_norm": 0.87109375, + "learning_rate": 3.475071205621578e-05, + "loss": 1.1166, + "step": 5082 + }, + { + "epoch": 0.3740078178891699, + "grad_norm": 0.94140625, + "learning_rate": 3.474537887799678e-05, + "loss": 0.8274, + "step": 5083 + }, + { + "epoch": 0.37408139802253393, + "grad_norm": 0.78515625, + "learning_rate": 3.474004517676171e-05, + "loss": 0.6075, + "step": 5084 + }, + { + "epoch": 0.37415497815589793, + "grad_norm": 0.7734375, + "learning_rate": 3.473471095279682e-05, + "loss": 0.9428, + "step": 5085 + }, + { + "epoch": 0.3742285582892619, + "grad_norm": 0.89453125, + "learning_rate": 3.472937620638837e-05, + "loss": 0.9936, + "step": 5086 + }, + { + "epoch": 0.3743021384226259, + "grad_norm": 0.890625, + "learning_rate": 3.472404093782268e-05, + "loss": 0.9604, + "step": 5087 + }, + { + "epoch": 0.3743757185559899, + "grad_norm": 0.765625, + "learning_rate": 3.4718705147386075e-05, + "loss": 0.8169, + "step": 5088 + }, + { + "epoch": 0.3744492986893539, + "grad_norm": 0.78125, + "learning_rate": 3.471336883536491e-05, + "loss": 0.7736, + "step": 5089 + }, + { + "epoch": 0.37452287882271784, + "grad_norm": 1.09375, + "learning_rate": 3.47080320020456e-05, + "loss": 0.9711, + "step": 5090 + }, + { + "epoch": 0.37459645895608185, + "grad_norm": 0.83984375, + "learning_rate": 3.470269464771454e-05, + "loss": 0.9966, + "step": 5091 + }, + { + "epoch": 0.37467003908944585, + "grad_norm": 0.7421875, + "learning_rate": 3.469735677265819e-05, + "loss": 0.7787, + "step": 5092 + }, + { + "epoch": 0.37474361922280985, + "grad_norm": 0.9140625, + "learning_rate": 3.4692018377163024e-05, + "loss": 1.1409, + "step": 5093 + }, + { + "epoch": 0.37481719935617386, + "grad_norm": 1.0703125, + "learning_rate": 3.4686679461515534e-05, + "loss": 1.6977, + "step": 5094 + }, + { + "epoch": 0.3748907794895378, + "grad_norm": 4.625, + "learning_rate": 3.4681340026002243e-05, + "loss": 0.7545, + "step": 5095 + }, + { + "epoch": 0.3749643596229018, + "grad_norm": 0.87890625, + "learning_rate": 3.467600007090972e-05, + "loss": 0.9121, + "step": 5096 + }, + { + "epoch": 0.3750379397562658, + "grad_norm": 0.79296875, + "learning_rate": 3.467065959652456e-05, + "loss": 1.0055, + "step": 5097 + }, + { + "epoch": 0.3751115198896298, + "grad_norm": 0.83984375, + "learning_rate": 3.466531860313335e-05, + "loss": 0.9226, + "step": 5098 + }, + { + "epoch": 0.37518510002299377, + "grad_norm": 1.09375, + "learning_rate": 3.465997709102276e-05, + "loss": 1.2116, + "step": 5099 + }, + { + "epoch": 0.37525868015635777, + "grad_norm": 0.8984375, + "learning_rate": 3.4654635060479434e-05, + "loss": 0.8605, + "step": 5100 + }, + { + "epoch": 0.3753322602897218, + "grad_norm": 0.7109375, + "learning_rate": 3.4649292511790085e-05, + "loss": 0.8016, + "step": 5101 + }, + { + "epoch": 0.3754058404230858, + "grad_norm": 0.92578125, + "learning_rate": 3.464394944524143e-05, + "loss": 0.6412, + "step": 5102 + }, + { + "epoch": 0.3754794205564498, + "grad_norm": 0.91796875, + "learning_rate": 3.463860586112022e-05, + "loss": 1.0554, + "step": 5103 + }, + { + "epoch": 0.37555300068981373, + "grad_norm": 0.83203125, + "learning_rate": 3.463326175971324e-05, + "loss": 0.8955, + "step": 5104 + }, + { + "epoch": 0.37562658082317774, + "grad_norm": 0.85546875, + "learning_rate": 3.4627917141307295e-05, + "loss": 0.8602, + "step": 5105 + }, + { + "epoch": 0.37570016095654174, + "grad_norm": 0.89453125, + "learning_rate": 3.462257200618923e-05, + "loss": 0.9276, + "step": 5106 + }, + { + "epoch": 0.37577374108990574, + "grad_norm": 0.6640625, + "learning_rate": 3.4617226354645894e-05, + "loss": 0.6035, + "step": 5107 + }, + { + "epoch": 0.3758473212232697, + "grad_norm": 0.94140625, + "learning_rate": 3.4611880186964185e-05, + "loss": 1.0858, + "step": 5108 + }, + { + "epoch": 0.3759209013566337, + "grad_norm": 0.8203125, + "learning_rate": 3.4606533503431015e-05, + "loss": 0.9035, + "step": 5109 + }, + { + "epoch": 0.3759944814899977, + "grad_norm": 0.80078125, + "learning_rate": 3.460118630433334e-05, + "loss": 0.7965, + "step": 5110 + }, + { + "epoch": 0.3760680616233617, + "grad_norm": 0.80078125, + "learning_rate": 3.459583858995813e-05, + "loss": 0.9757, + "step": 5111 + }, + { + "epoch": 0.3761416417567257, + "grad_norm": 0.82421875, + "learning_rate": 3.459049036059239e-05, + "loss": 0.8417, + "step": 5112 + }, + { + "epoch": 0.37621522189008966, + "grad_norm": 0.8515625, + "learning_rate": 3.458514161652314e-05, + "loss": 1.2755, + "step": 5113 + }, + { + "epoch": 0.37628880202345366, + "grad_norm": 1.015625, + "learning_rate": 3.457979235803744e-05, + "loss": 1.1431, + "step": 5114 + }, + { + "epoch": 0.37636238215681767, + "grad_norm": 0.9921875, + "learning_rate": 3.45744425854224e-05, + "loss": 1.1099, + "step": 5115 + }, + { + "epoch": 0.37643596229018167, + "grad_norm": 0.7578125, + "learning_rate": 3.456909229896509e-05, + "loss": 0.772, + "step": 5116 + }, + { + "epoch": 0.3765095424235456, + "grad_norm": 0.91015625, + "learning_rate": 3.4563741498952674e-05, + "loss": 0.8679, + "step": 5117 + }, + { + "epoch": 0.3765831225569096, + "grad_norm": 0.84765625, + "learning_rate": 3.455839018567231e-05, + "loss": 0.796, + "step": 5118 + }, + { + "epoch": 0.3766567026902736, + "grad_norm": 0.95703125, + "learning_rate": 3.455303835941121e-05, + "loss": 0.7404, + "step": 5119 + }, + { + "epoch": 0.37673028282363763, + "grad_norm": 0.609375, + "learning_rate": 3.4547686020456574e-05, + "loss": 0.604, + "step": 5120 + }, + { + "epoch": 0.37680386295700163, + "grad_norm": 0.625, + "learning_rate": 3.454233316909567e-05, + "loss": 0.6039, + "step": 5121 + }, + { + "epoch": 0.3768774430903656, + "grad_norm": 0.859375, + "learning_rate": 3.453697980561576e-05, + "loss": 0.8094, + "step": 5122 + }, + { + "epoch": 0.3769510232237296, + "grad_norm": 0.875, + "learning_rate": 3.4531625930304155e-05, + "loss": 0.9612, + "step": 5123 + }, + { + "epoch": 0.3770246033570936, + "grad_norm": 0.7265625, + "learning_rate": 3.4526271543448196e-05, + "loss": 0.601, + "step": 5124 + }, + { + "epoch": 0.3770981834904576, + "grad_norm": 0.8671875, + "learning_rate": 3.452091664533523e-05, + "loss": 0.9024, + "step": 5125 + }, + { + "epoch": 0.37717176362382154, + "grad_norm": 0.86328125, + "learning_rate": 3.451556123625266e-05, + "loss": 0.7551, + "step": 5126 + }, + { + "epoch": 0.37724534375718555, + "grad_norm": 0.953125, + "learning_rate": 3.4510205316487885e-05, + "loss": 0.9634, + "step": 5127 + }, + { + "epoch": 0.37731892389054955, + "grad_norm": 0.984375, + "learning_rate": 3.450484888632836e-05, + "loss": 1.0097, + "step": 5128 + }, + { + "epoch": 0.37739250402391356, + "grad_norm": 0.93359375, + "learning_rate": 3.4499491946061534e-05, + "loss": 0.6972, + "step": 5129 + }, + { + "epoch": 0.37746608415727756, + "grad_norm": 0.875, + "learning_rate": 3.449413449597492e-05, + "loss": 0.7327, + "step": 5130 + }, + { + "epoch": 0.3775396642906415, + "grad_norm": 0.83984375, + "learning_rate": 3.448877653635604e-05, + "loss": 1.0995, + "step": 5131 + }, + { + "epoch": 0.3776132444240055, + "grad_norm": 1.03125, + "learning_rate": 3.448341806749245e-05, + "loss": 1.0709, + "step": 5132 + }, + { + "epoch": 0.3776868245573695, + "grad_norm": 0.80859375, + "learning_rate": 3.4478059089671725e-05, + "loss": 0.7534, + "step": 5133 + }, + { + "epoch": 0.3777604046907335, + "grad_norm": 2.859375, + "learning_rate": 3.447269960318147e-05, + "loss": 1.3757, + "step": 5134 + }, + { + "epoch": 0.37783398482409747, + "grad_norm": 0.70703125, + "learning_rate": 3.4467339608309316e-05, + "loss": 0.6253, + "step": 5135 + }, + { + "epoch": 0.3779075649574615, + "grad_norm": 0.87109375, + "learning_rate": 3.4461979105342925e-05, + "loss": 0.7474, + "step": 5136 + }, + { + "epoch": 0.3779811450908255, + "grad_norm": 1.140625, + "learning_rate": 3.445661809456999e-05, + "loss": 1.2046, + "step": 5137 + }, + { + "epoch": 0.3780547252241895, + "grad_norm": 0.97265625, + "learning_rate": 3.4451256576278215e-05, + "loss": 0.9753, + "step": 5138 + }, + { + "epoch": 0.3781283053575535, + "grad_norm": 0.94140625, + "learning_rate": 3.4445894550755356e-05, + "loss": 0.7771, + "step": 5139 + }, + { + "epoch": 0.37820188549091743, + "grad_norm": 1.078125, + "learning_rate": 3.444053201828918e-05, + "loss": 1.2612, + "step": 5140 + }, + { + "epoch": 0.37827546562428144, + "grad_norm": 1.109375, + "learning_rate": 3.4435168979167476e-05, + "loss": 1.3281, + "step": 5141 + }, + { + "epoch": 0.37834904575764544, + "grad_norm": 0.98828125, + "learning_rate": 3.442980543367808e-05, + "loss": 1.2024, + "step": 5142 + }, + { + "epoch": 0.37842262589100945, + "grad_norm": 1.0625, + "learning_rate": 3.442444138210883e-05, + "loss": 1.2625, + "step": 5143 + }, + { + "epoch": 0.3784962060243734, + "grad_norm": 0.91796875, + "learning_rate": 3.441907682474762e-05, + "loss": 0.888, + "step": 5144 + }, + { + "epoch": 0.3785697861577374, + "grad_norm": 0.75, + "learning_rate": 3.441371176188233e-05, + "loss": 0.782, + "step": 5145 + }, + { + "epoch": 0.3786433662911014, + "grad_norm": 1.0, + "learning_rate": 3.440834619380092e-05, + "loss": 0.8574, + "step": 5146 + }, + { + "epoch": 0.3787169464244654, + "grad_norm": 1.0234375, + "learning_rate": 3.4402980120791345e-05, + "loss": 1.3946, + "step": 5147 + }, + { + "epoch": 0.3787905265578294, + "grad_norm": 0.84765625, + "learning_rate": 3.439761354314158e-05, + "loss": 1.2674, + "step": 5148 + }, + { + "epoch": 0.37886410669119336, + "grad_norm": 0.79296875, + "learning_rate": 3.4392246461139656e-05, + "loss": 0.7175, + "step": 5149 + }, + { + "epoch": 0.37893768682455736, + "grad_norm": 0.82421875, + "learning_rate": 3.4386878875073594e-05, + "loss": 0.9561, + "step": 5150 + }, + { + "epoch": 0.37901126695792137, + "grad_norm": 0.85546875, + "learning_rate": 3.438151078523147e-05, + "loss": 0.6987, + "step": 5151 + }, + { + "epoch": 0.37908484709128537, + "grad_norm": 0.8828125, + "learning_rate": 3.4376142191901385e-05, + "loss": 0.8381, + "step": 5152 + }, + { + "epoch": 0.3791584272246493, + "grad_norm": 0.9375, + "learning_rate": 3.437077309537146e-05, + "loss": 1.326, + "step": 5153 + }, + { + "epoch": 0.3792320073580133, + "grad_norm": 0.7734375, + "learning_rate": 3.436540349592984e-05, + "loss": 0.779, + "step": 5154 + }, + { + "epoch": 0.3793055874913773, + "grad_norm": 0.94921875, + "learning_rate": 3.4360033393864696e-05, + "loss": 0.8868, + "step": 5155 + }, + { + "epoch": 0.37937916762474133, + "grad_norm": 0.79296875, + "learning_rate": 3.4354662789464246e-05, + "loss": 0.8471, + "step": 5156 + }, + { + "epoch": 0.37945274775810534, + "grad_norm": 0.78515625, + "learning_rate": 3.4349291683016715e-05, + "loss": 0.8737, + "step": 5157 + }, + { + "epoch": 0.3795263278914693, + "grad_norm": 0.7578125, + "learning_rate": 3.434392007481035e-05, + "loss": 0.7175, + "step": 5158 + }, + { + "epoch": 0.3795999080248333, + "grad_norm": 1.0, + "learning_rate": 3.433854796513344e-05, + "loss": 0.786, + "step": 5159 + }, + { + "epoch": 0.3796734881581973, + "grad_norm": 1.0, + "learning_rate": 3.433317535427432e-05, + "loss": 0.9675, + "step": 5160 + }, + { + "epoch": 0.3797470682915613, + "grad_norm": 0.78125, + "learning_rate": 3.432780224252129e-05, + "loss": 0.8816, + "step": 5161 + }, + { + "epoch": 0.37982064842492524, + "grad_norm": 0.87890625, + "learning_rate": 3.432242863016273e-05, + "loss": 1.2071, + "step": 5162 + }, + { + "epoch": 0.37989422855828925, + "grad_norm": 0.86328125, + "learning_rate": 3.431705451748703e-05, + "loss": 1.0243, + "step": 5163 + }, + { + "epoch": 0.37996780869165325, + "grad_norm": 0.84375, + "learning_rate": 3.4311679904782625e-05, + "loss": 1.092, + "step": 5164 + }, + { + "epoch": 0.38004138882501726, + "grad_norm": 0.7734375, + "learning_rate": 3.4306304792337934e-05, + "loss": 0.7073, + "step": 5165 + }, + { + "epoch": 0.38011496895838126, + "grad_norm": 0.91796875, + "learning_rate": 3.430092918044145e-05, + "loss": 0.805, + "step": 5166 + }, + { + "epoch": 0.3801885490917452, + "grad_norm": 0.671875, + "learning_rate": 3.4295553069381664e-05, + "loss": 0.7357, + "step": 5167 + }, + { + "epoch": 0.3802621292251092, + "grad_norm": 0.796875, + "learning_rate": 3.4290176459447096e-05, + "loss": 0.7566, + "step": 5168 + }, + { + "epoch": 0.3803357093584732, + "grad_norm": 0.71875, + "learning_rate": 3.428479935092631e-05, + "loss": 0.8424, + "step": 5169 + }, + { + "epoch": 0.3804092894918372, + "grad_norm": 0.78515625, + "learning_rate": 3.427942174410787e-05, + "loss": 0.8054, + "step": 5170 + }, + { + "epoch": 0.38048286962520117, + "grad_norm": 0.921875, + "learning_rate": 3.42740436392804e-05, + "loss": 1.303, + "step": 5171 + }, + { + "epoch": 0.3805564497585652, + "grad_norm": 0.83203125, + "learning_rate": 3.426866503673252e-05, + "loss": 0.9888, + "step": 5172 + }, + { + "epoch": 0.3806300298919292, + "grad_norm": 0.84375, + "learning_rate": 3.42632859367529e-05, + "loss": 1.2661, + "step": 5173 + }, + { + "epoch": 0.3807036100252932, + "grad_norm": 0.91796875, + "learning_rate": 3.425790633963021e-05, + "loss": 0.8517, + "step": 5174 + }, + { + "epoch": 0.3807771901586572, + "grad_norm": 0.8359375, + "learning_rate": 3.4252526245653184e-05, + "loss": 0.8681, + "step": 5175 + }, + { + "epoch": 0.38085077029202113, + "grad_norm": 0.9375, + "learning_rate": 3.4247145655110544e-05, + "loss": 1.0429, + "step": 5176 + }, + { + "epoch": 0.38092435042538514, + "grad_norm": 0.98828125, + "learning_rate": 3.4241764568291057e-05, + "loss": 1.257, + "step": 5177 + }, + { + "epoch": 0.38099793055874914, + "grad_norm": 0.7265625, + "learning_rate": 3.423638298548352e-05, + "loss": 0.7095, + "step": 5178 + }, + { + "epoch": 0.38107151069211315, + "grad_norm": 1.0625, + "learning_rate": 3.423100090697676e-05, + "loss": 1.2795, + "step": 5179 + }, + { + "epoch": 0.3811450908254771, + "grad_norm": 0.80078125, + "learning_rate": 3.422561833305962e-05, + "loss": 1.0184, + "step": 5180 + }, + { + "epoch": 0.3812186709588411, + "grad_norm": 0.9296875, + "learning_rate": 3.422023526402096e-05, + "loss": 1.1854, + "step": 5181 + }, + { + "epoch": 0.3812922510922051, + "grad_norm": 0.91015625, + "learning_rate": 3.421485170014969e-05, + "loss": 1.0371, + "step": 5182 + }, + { + "epoch": 0.3813658312255691, + "grad_norm": 0.8515625, + "learning_rate": 3.420946764173474e-05, + "loss": 0.8685, + "step": 5183 + }, + { + "epoch": 0.3814394113589331, + "grad_norm": 0.98046875, + "learning_rate": 3.4204083089065054e-05, + "loss": 1.1983, + "step": 5184 + }, + { + "epoch": 0.38151299149229706, + "grad_norm": 0.99609375, + "learning_rate": 3.4198698042429604e-05, + "loss": 1.0159, + "step": 5185 + }, + { + "epoch": 0.38158657162566106, + "grad_norm": 1.03125, + "learning_rate": 3.419331250211741e-05, + "loss": 0.9349, + "step": 5186 + }, + { + "epoch": 0.38166015175902507, + "grad_norm": 0.92578125, + "learning_rate": 3.418792646841749e-05, + "loss": 0.7611, + "step": 5187 + }, + { + "epoch": 0.38173373189238907, + "grad_norm": 0.92578125, + "learning_rate": 3.418253994161892e-05, + "loss": 1.2321, + "step": 5188 + }, + { + "epoch": 0.381807312025753, + "grad_norm": 0.86328125, + "learning_rate": 3.4177152922010776e-05, + "loss": 0.8143, + "step": 5189 + }, + { + "epoch": 0.381880892159117, + "grad_norm": 0.8359375, + "learning_rate": 3.4171765409882165e-05, + "loss": 0.8956, + "step": 5190 + }, + { + "epoch": 0.38195447229248103, + "grad_norm": 0.7109375, + "learning_rate": 3.416637740552222e-05, + "loss": 0.9225, + "step": 5191 + }, + { + "epoch": 0.38202805242584503, + "grad_norm": 0.95703125, + "learning_rate": 3.416098890922012e-05, + "loss": 1.4415, + "step": 5192 + }, + { + "epoch": 0.38210163255920904, + "grad_norm": 0.8203125, + "learning_rate": 3.4155599921265044e-05, + "loss": 0.7804, + "step": 5193 + }, + { + "epoch": 0.382175212692573, + "grad_norm": 0.73828125, + "learning_rate": 3.415021044194622e-05, + "loss": 0.8919, + "step": 5194 + }, + { + "epoch": 0.382248792825937, + "grad_norm": 0.73046875, + "learning_rate": 3.414482047155288e-05, + "loss": 0.8574, + "step": 5195 + }, + { + "epoch": 0.382322372959301, + "grad_norm": 0.953125, + "learning_rate": 3.413943001037429e-05, + "loss": 0.9972, + "step": 5196 + }, + { + "epoch": 0.382395953092665, + "grad_norm": 1.0078125, + "learning_rate": 3.4134039058699765e-05, + "loss": 1.4238, + "step": 5197 + }, + { + "epoch": 0.38246953322602895, + "grad_norm": 0.92578125, + "learning_rate": 3.412864761681861e-05, + "loss": 1.0294, + "step": 5198 + }, + { + "epoch": 0.38254311335939295, + "grad_norm": 0.82421875, + "learning_rate": 3.412325568502018e-05, + "loss": 0.786, + "step": 5199 + }, + { + "epoch": 0.38261669349275695, + "grad_norm": 1.1328125, + "learning_rate": 3.411786326359384e-05, + "loss": 1.2168, + "step": 5200 + }, + { + "epoch": 0.38269027362612096, + "grad_norm": 0.75, + "learning_rate": 3.411247035282902e-05, + "loss": 0.588, + "step": 5201 + }, + { + "epoch": 0.38276385375948496, + "grad_norm": 0.64453125, + "learning_rate": 3.410707695301511e-05, + "loss": 0.7805, + "step": 5202 + }, + { + "epoch": 0.3828374338928489, + "grad_norm": 0.7578125, + "learning_rate": 3.410168306444158e-05, + "loss": 0.7293, + "step": 5203 + }, + { + "epoch": 0.3829110140262129, + "grad_norm": 0.75390625, + "learning_rate": 3.409628868739793e-05, + "loss": 0.9363, + "step": 5204 + }, + { + "epoch": 0.3829845941595769, + "grad_norm": 0.85546875, + "learning_rate": 3.409089382217362e-05, + "loss": 0.8522, + "step": 5205 + }, + { + "epoch": 0.3830581742929409, + "grad_norm": 1.171875, + "learning_rate": 3.408549846905822e-05, + "loss": 0.8536, + "step": 5206 + }, + { + "epoch": 0.38313175442630487, + "grad_norm": 1.0390625, + "learning_rate": 3.408010262834128e-05, + "loss": 0.9996, + "step": 5207 + }, + { + "epoch": 0.3832053345596689, + "grad_norm": 0.71875, + "learning_rate": 3.4074706300312385e-05, + "loss": 0.6353, + "step": 5208 + }, + { + "epoch": 0.3832789146930329, + "grad_norm": 0.859375, + "learning_rate": 3.406930948526114e-05, + "loss": 0.91, + "step": 5209 + }, + { + "epoch": 0.3833524948263969, + "grad_norm": 0.93359375, + "learning_rate": 3.4063912183477186e-05, + "loss": 1.0982, + "step": 5210 + }, + { + "epoch": 0.3834260749597609, + "grad_norm": 0.9375, + "learning_rate": 3.405851439525018e-05, + "loss": 1.0828, + "step": 5211 + }, + { + "epoch": 0.38349965509312484, + "grad_norm": 0.95703125, + "learning_rate": 3.405311612086981e-05, + "loss": 1.0176, + "step": 5212 + }, + { + "epoch": 0.38357323522648884, + "grad_norm": 0.859375, + "learning_rate": 3.4047717360625804e-05, + "loss": 0.9045, + "step": 5213 + }, + { + "epoch": 0.38364681535985284, + "grad_norm": 0.71875, + "learning_rate": 3.404231811480789e-05, + "loss": 0.507, + "step": 5214 + }, + { + "epoch": 0.38372039549321685, + "grad_norm": 0.92578125, + "learning_rate": 3.403691838370585e-05, + "loss": 0.8231, + "step": 5215 + }, + { + "epoch": 0.3837939756265808, + "grad_norm": 1.0078125, + "learning_rate": 3.403151816760947e-05, + "loss": 1.22, + "step": 5216 + }, + { + "epoch": 0.3838675557599448, + "grad_norm": 0.71484375, + "learning_rate": 3.402611746680857e-05, + "loss": 0.5894, + "step": 5217 + }, + { + "epoch": 0.3839411358933088, + "grad_norm": 0.78125, + "learning_rate": 3.4020716281592974e-05, + "loss": 0.8356, + "step": 5218 + }, + { + "epoch": 0.3840147160266728, + "grad_norm": 0.84375, + "learning_rate": 3.401531461225258e-05, + "loss": 0.8682, + "step": 5219 + }, + { + "epoch": 0.3840882961600368, + "grad_norm": 0.8984375, + "learning_rate": 3.400991245907729e-05, + "loss": 1.2747, + "step": 5220 + }, + { + "epoch": 0.38416187629340076, + "grad_norm": 0.88671875, + "learning_rate": 3.400450982235701e-05, + "loss": 1.047, + "step": 5221 + }, + { + "epoch": 0.38423545642676477, + "grad_norm": 1.0390625, + "learning_rate": 3.39991067023817e-05, + "loss": 1.0059, + "step": 5222 + }, + { + "epoch": 0.38430903656012877, + "grad_norm": 0.859375, + "learning_rate": 3.3993703099441323e-05, + "loss": 0.7447, + "step": 5223 + }, + { + "epoch": 0.3843826166934928, + "grad_norm": 0.9609375, + "learning_rate": 3.3988299013825894e-05, + "loss": 1.1768, + "step": 5224 + }, + { + "epoch": 0.3844561968268567, + "grad_norm": 0.828125, + "learning_rate": 3.398289444582542e-05, + "loss": 0.8489, + "step": 5225 + }, + { + "epoch": 0.3845297769602207, + "grad_norm": 0.796875, + "learning_rate": 3.397748939572998e-05, + "loss": 0.9217, + "step": 5226 + }, + { + "epoch": 0.38460335709358473, + "grad_norm": 0.921875, + "learning_rate": 3.397208386382963e-05, + "loss": 1.3784, + "step": 5227 + }, + { + "epoch": 0.38467693722694873, + "grad_norm": 0.99609375, + "learning_rate": 3.396667785041449e-05, + "loss": 0.9415, + "step": 5228 + }, + { + "epoch": 0.38475051736031274, + "grad_norm": 0.6796875, + "learning_rate": 3.396127135577469e-05, + "loss": 0.7007, + "step": 5229 + }, + { + "epoch": 0.3848240974936767, + "grad_norm": 0.8984375, + "learning_rate": 3.3955864380200374e-05, + "loss": 1.1998, + "step": 5230 + }, + { + "epoch": 0.3848976776270407, + "grad_norm": 0.80078125, + "learning_rate": 3.3950456923981736e-05, + "loss": 1.1884, + "step": 5231 + }, + { + "epoch": 0.3849712577604047, + "grad_norm": 0.87109375, + "learning_rate": 3.394504898740898e-05, + "loss": 0.924, + "step": 5232 + }, + { + "epoch": 0.3850448378937687, + "grad_norm": 0.91796875, + "learning_rate": 3.3939640570772344e-05, + "loss": 1.4184, + "step": 5233 + }, + { + "epoch": 0.38511841802713265, + "grad_norm": 0.84765625, + "learning_rate": 3.393423167436208e-05, + "loss": 0.9991, + "step": 5234 + }, + { + "epoch": 0.38519199816049665, + "grad_norm": 0.64453125, + "learning_rate": 3.392882229846847e-05, + "loss": 0.7082, + "step": 5235 + }, + { + "epoch": 0.38526557829386066, + "grad_norm": 0.83984375, + "learning_rate": 3.392341244338184e-05, + "loss": 0.6199, + "step": 5236 + }, + { + "epoch": 0.38533915842722466, + "grad_norm": 0.97265625, + "learning_rate": 3.391800210939251e-05, + "loss": 1.1754, + "step": 5237 + }, + { + "epoch": 0.38541273856058866, + "grad_norm": 0.79296875, + "learning_rate": 3.391259129679086e-05, + "loss": 1.2509, + "step": 5238 + }, + { + "epoch": 0.3854863186939526, + "grad_norm": 0.78515625, + "learning_rate": 3.390718000586725e-05, + "loss": 0.9425, + "step": 5239 + }, + { + "epoch": 0.3855598988273166, + "grad_norm": 0.95703125, + "learning_rate": 3.390176823691214e-05, + "loss": 0.893, + "step": 5240 + }, + { + "epoch": 0.3856334789606806, + "grad_norm": 0.75390625, + "learning_rate": 3.3896355990215925e-05, + "loss": 0.9134, + "step": 5241 + }, + { + "epoch": 0.3857070590940446, + "grad_norm": 0.8203125, + "learning_rate": 3.389094326606909e-05, + "loss": 0.7703, + "step": 5242 + }, + { + "epoch": 0.3857806392274086, + "grad_norm": 1.078125, + "learning_rate": 3.388553006476212e-05, + "loss": 1.2572, + "step": 5243 + }, + { + "epoch": 0.3858542193607726, + "grad_norm": 0.703125, + "learning_rate": 3.388011638658554e-05, + "loss": 0.9101, + "step": 5244 + }, + { + "epoch": 0.3859277994941366, + "grad_norm": 0.8515625, + "learning_rate": 3.3874702231829883e-05, + "loss": 0.7354, + "step": 5245 + }, + { + "epoch": 0.3860013796275006, + "grad_norm": 0.87109375, + "learning_rate": 3.386928760078571e-05, + "loss": 1.211, + "step": 5246 + }, + { + "epoch": 0.3860749597608646, + "grad_norm": 0.8515625, + "learning_rate": 3.386387249374364e-05, + "loss": 1.1345, + "step": 5247 + }, + { + "epoch": 0.38614853989422854, + "grad_norm": 0.69140625, + "learning_rate": 3.385845691099426e-05, + "loss": 0.6798, + "step": 5248 + }, + { + "epoch": 0.38622212002759254, + "grad_norm": 0.94140625, + "learning_rate": 3.385304085282824e-05, + "loss": 0.9198, + "step": 5249 + }, + { + "epoch": 0.38629570016095655, + "grad_norm": 0.83203125, + "learning_rate": 3.384762431953623e-05, + "loss": 0.7693, + "step": 5250 + }, + { + "epoch": 0.38636928029432055, + "grad_norm": 0.7734375, + "learning_rate": 3.384220731140894e-05, + "loss": 0.7477, + "step": 5251 + }, + { + "epoch": 0.3864428604276845, + "grad_norm": 0.73046875, + "learning_rate": 3.3836789828737074e-05, + "loss": 0.8239, + "step": 5252 + }, + { + "epoch": 0.3865164405610485, + "grad_norm": 0.77734375, + "learning_rate": 3.383137187181139e-05, + "loss": 0.8524, + "step": 5253 + }, + { + "epoch": 0.3865900206944125, + "grad_norm": 0.86328125, + "learning_rate": 3.382595344092267e-05, + "loss": 1.1263, + "step": 5254 + }, + { + "epoch": 0.3866636008277765, + "grad_norm": 0.94140625, + "learning_rate": 3.382053453636169e-05, + "loss": 1.233, + "step": 5255 + }, + { + "epoch": 0.3867371809611405, + "grad_norm": 0.953125, + "learning_rate": 3.3815115158419287e-05, + "loss": 1.3229, + "step": 5256 + }, + { + "epoch": 0.38681076109450446, + "grad_norm": 0.83203125, + "learning_rate": 3.3809695307386294e-05, + "loss": 0.9049, + "step": 5257 + }, + { + "epoch": 0.38688434122786847, + "grad_norm": 0.7578125, + "learning_rate": 3.38042749835536e-05, + "loss": 0.7716, + "step": 5258 + }, + { + "epoch": 0.38695792136123247, + "grad_norm": 0.96484375, + "learning_rate": 3.379885418721209e-05, + "loss": 1.0446, + "step": 5259 + }, + { + "epoch": 0.3870315014945965, + "grad_norm": 1.0625, + "learning_rate": 3.3793432918652695e-05, + "loss": 1.0852, + "step": 5260 + }, + { + "epoch": 0.3871050816279604, + "grad_norm": 0.88671875, + "learning_rate": 3.378801117816637e-05, + "loss": 0.8312, + "step": 5261 + }, + { + "epoch": 0.3871786617613244, + "grad_norm": 0.8203125, + "learning_rate": 3.378258896604408e-05, + "loss": 0.795, + "step": 5262 + }, + { + "epoch": 0.38725224189468843, + "grad_norm": 1.1796875, + "learning_rate": 3.377716628257683e-05, + "loss": 1.0622, + "step": 5263 + }, + { + "epoch": 0.38732582202805244, + "grad_norm": 0.91015625, + "learning_rate": 3.3771743128055645e-05, + "loss": 0.953, + "step": 5264 + }, + { + "epoch": 0.38739940216141644, + "grad_norm": 1.0078125, + "learning_rate": 3.3766319502771565e-05, + "loss": 0.901, + "step": 5265 + }, + { + "epoch": 0.3874729822947804, + "grad_norm": 0.8046875, + "learning_rate": 3.376089540701568e-05, + "loss": 0.9593, + "step": 5266 + }, + { + "epoch": 0.3875465624281444, + "grad_norm": 0.625, + "learning_rate": 3.375547084107908e-05, + "loss": 0.7164, + "step": 5267 + }, + { + "epoch": 0.3876201425615084, + "grad_norm": 1.0703125, + "learning_rate": 3.375004580525291e-05, + "loss": 1.3671, + "step": 5268 + }, + { + "epoch": 0.3876937226948724, + "grad_norm": 0.9609375, + "learning_rate": 3.37446202998283e-05, + "loss": 1.0811, + "step": 5269 + }, + { + "epoch": 0.38776730282823635, + "grad_norm": 1.0078125, + "learning_rate": 3.3739194325096436e-05, + "loss": 1.0696, + "step": 5270 + }, + { + "epoch": 0.38784088296160035, + "grad_norm": 0.9609375, + "learning_rate": 3.373376788134852e-05, + "loss": 1.0669, + "step": 5271 + }, + { + "epoch": 0.38791446309496436, + "grad_norm": 0.74609375, + "learning_rate": 3.3728340968875773e-05, + "loss": 0.7129, + "step": 5272 + }, + { + "epoch": 0.38798804322832836, + "grad_norm": 0.78125, + "learning_rate": 3.372291358796945e-05, + "loss": 0.8409, + "step": 5273 + }, + { + "epoch": 0.38806162336169236, + "grad_norm": 0.96875, + "learning_rate": 3.371748573892084e-05, + "loss": 1.2108, + "step": 5274 + }, + { + "epoch": 0.3881352034950563, + "grad_norm": 0.7578125, + "learning_rate": 3.3712057422021224e-05, + "loss": 0.7328, + "step": 5275 + }, + { + "epoch": 0.3882087836284203, + "grad_norm": 0.92578125, + "learning_rate": 3.370662863756194e-05, + "loss": 0.97, + "step": 5276 + }, + { + "epoch": 0.3882823637617843, + "grad_norm": 0.703125, + "learning_rate": 3.370119938583436e-05, + "loss": 0.8496, + "step": 5277 + }, + { + "epoch": 0.3883559438951483, + "grad_norm": 1.078125, + "learning_rate": 3.369576966712982e-05, + "loss": 0.8405, + "step": 5278 + }, + { + "epoch": 0.3884295240285123, + "grad_norm": 0.87109375, + "learning_rate": 3.369033948173976e-05, + "loss": 0.9884, + "step": 5279 + }, + { + "epoch": 0.3885031041618763, + "grad_norm": 0.875, + "learning_rate": 3.3684908829955586e-05, + "loss": 0.7494, + "step": 5280 + }, + { + "epoch": 0.3885766842952403, + "grad_norm": 0.92578125, + "learning_rate": 3.367947771206877e-05, + "loss": 0.7816, + "step": 5281 + }, + { + "epoch": 0.3886502644286043, + "grad_norm": 0.875, + "learning_rate": 3.3674046128370766e-05, + "loss": 0.6771, + "step": 5282 + }, + { + "epoch": 0.3887238445619683, + "grad_norm": 0.72265625, + "learning_rate": 3.366861407915309e-05, + "loss": 0.9264, + "step": 5283 + }, + { + "epoch": 0.38879742469533224, + "grad_norm": 0.83984375, + "learning_rate": 3.3663181564707286e-05, + "loss": 0.8988, + "step": 5284 + }, + { + "epoch": 0.38887100482869624, + "grad_norm": 0.80078125, + "learning_rate": 3.365774858532487e-05, + "loss": 1.3125, + "step": 5285 + }, + { + "epoch": 0.38894458496206025, + "grad_norm": 0.79296875, + "learning_rate": 3.365231514129745e-05, + "loss": 1.2386, + "step": 5286 + }, + { + "epoch": 0.38901816509542425, + "grad_norm": 0.953125, + "learning_rate": 3.364688123291662e-05, + "loss": 0.934, + "step": 5287 + }, + { + "epoch": 0.3890917452287882, + "grad_norm": 0.93359375, + "learning_rate": 3.3641446860474e-05, + "loss": 0.8637, + "step": 5288 + }, + { + "epoch": 0.3891653253621522, + "grad_norm": 0.79296875, + "learning_rate": 3.363601202426124e-05, + "loss": 0.9003, + "step": 5289 + }, + { + "epoch": 0.3892389054955162, + "grad_norm": 0.8359375, + "learning_rate": 3.3630576724570046e-05, + "loss": 0.9079, + "step": 5290 + }, + { + "epoch": 0.3893124856288802, + "grad_norm": 0.59765625, + "learning_rate": 3.3625140961692084e-05, + "loss": 0.6688, + "step": 5291 + }, + { + "epoch": 0.3893860657622442, + "grad_norm": 0.84765625, + "learning_rate": 3.361970473591911e-05, + "loss": 0.7819, + "step": 5292 + }, + { + "epoch": 0.38945964589560816, + "grad_norm": 0.703125, + "learning_rate": 3.361426804754285e-05, + "loss": 0.6088, + "step": 5293 + }, + { + "epoch": 0.38953322602897217, + "grad_norm": 0.95703125, + "learning_rate": 3.36088308968551e-05, + "loss": 1.0231, + "step": 5294 + }, + { + "epoch": 0.38960680616233617, + "grad_norm": 1.0703125, + "learning_rate": 3.3603393284147656e-05, + "loss": 1.3235, + "step": 5295 + }, + { + "epoch": 0.3896803862957002, + "grad_norm": 0.96875, + "learning_rate": 3.3597955209712337e-05, + "loss": 0.8804, + "step": 5296 + }, + { + "epoch": 0.3897539664290642, + "grad_norm": 0.84765625, + "learning_rate": 3.359251667384101e-05, + "loss": 1.1223, + "step": 5297 + }, + { + "epoch": 0.38982754656242813, + "grad_norm": 0.7109375, + "learning_rate": 3.358707767682554e-05, + "loss": 0.8199, + "step": 5298 + }, + { + "epoch": 0.38990112669579213, + "grad_norm": 0.85546875, + "learning_rate": 3.358163821895783e-05, + "loss": 0.661, + "step": 5299 + }, + { + "epoch": 0.38997470682915614, + "grad_norm": 1.296875, + "learning_rate": 3.3576198300529804e-05, + "loss": 1.1993, + "step": 5300 + }, + { + "epoch": 0.39004828696252014, + "grad_norm": 0.8203125, + "learning_rate": 3.357075792183341e-05, + "loss": 1.0676, + "step": 5301 + }, + { + "epoch": 0.3901218670958841, + "grad_norm": 0.8671875, + "learning_rate": 3.356531708316063e-05, + "loss": 1.09, + "step": 5302 + }, + { + "epoch": 0.3901954472292481, + "grad_norm": 0.7109375, + "learning_rate": 3.3559875784803465e-05, + "loss": 0.92, + "step": 5303 + }, + { + "epoch": 0.3902690273626121, + "grad_norm": 0.9140625, + "learning_rate": 3.3554434027053926e-05, + "loss": 1.0002, + "step": 5304 + }, + { + "epoch": 0.3903426074959761, + "grad_norm": 0.92578125, + "learning_rate": 3.354899181020407e-05, + "loss": 1.2513, + "step": 5305 + }, + { + "epoch": 0.3904161876293401, + "grad_norm": 0.6796875, + "learning_rate": 3.3543549134545975e-05, + "loss": 0.6741, + "step": 5306 + }, + { + "epoch": 0.39048976776270405, + "grad_norm": 0.7734375, + "learning_rate": 3.353810600037173e-05, + "loss": 1.0504, + "step": 5307 + }, + { + "epoch": 0.39056334789606806, + "grad_norm": 0.62890625, + "learning_rate": 3.353266240797346e-05, + "loss": 0.5393, + "step": 5308 + }, + { + "epoch": 0.39063692802943206, + "grad_norm": 0.75, + "learning_rate": 3.352721835764333e-05, + "loss": 1.1147, + "step": 5309 + }, + { + "epoch": 0.39071050816279607, + "grad_norm": 1.0625, + "learning_rate": 3.3521773849673475e-05, + "loss": 0.9932, + "step": 5310 + }, + { + "epoch": 0.39078408829616, + "grad_norm": 0.8203125, + "learning_rate": 3.351632888435613e-05, + "loss": 0.7278, + "step": 5311 + }, + { + "epoch": 0.390857668429524, + "grad_norm": 0.68359375, + "learning_rate": 3.351088346198349e-05, + "loss": 0.7127, + "step": 5312 + }, + { + "epoch": 0.390931248562888, + "grad_norm": 0.7890625, + "learning_rate": 3.350543758284781e-05, + "loss": 0.9187, + "step": 5313 + }, + { + "epoch": 0.391004828696252, + "grad_norm": 0.7734375, + "learning_rate": 3.349999124724136e-05, + "loss": 0.9182, + "step": 5314 + }, + { + "epoch": 0.39107840882961603, + "grad_norm": 0.65234375, + "learning_rate": 3.349454445545644e-05, + "loss": 0.7731, + "step": 5315 + }, + { + "epoch": 0.39115198896298, + "grad_norm": 1.640625, + "learning_rate": 3.348909720778535e-05, + "loss": 0.8143, + "step": 5316 + }, + { + "epoch": 0.391225569096344, + "grad_norm": 0.890625, + "learning_rate": 3.348364950452046e-05, + "loss": 1.1468, + "step": 5317 + }, + { + "epoch": 0.391299149229708, + "grad_norm": 0.94140625, + "learning_rate": 3.347820134595412e-05, + "loss": 0.9937, + "step": 5318 + }, + { + "epoch": 0.391372729363072, + "grad_norm": 0.86328125, + "learning_rate": 3.347275273237872e-05, + "loss": 1.2644, + "step": 5319 + }, + { + "epoch": 0.39144630949643594, + "grad_norm": 0.90234375, + "learning_rate": 3.346730366408669e-05, + "loss": 0.9775, + "step": 5320 + }, + { + "epoch": 0.39151988962979994, + "grad_norm": 0.91796875, + "learning_rate": 3.346185414137046e-05, + "loss": 0.8551, + "step": 5321 + }, + { + "epoch": 0.39159346976316395, + "grad_norm": 0.9609375, + "learning_rate": 3.345640416452251e-05, + "loss": 1.1624, + "step": 5322 + }, + { + "epoch": 0.39166704989652795, + "grad_norm": 0.828125, + "learning_rate": 3.345095373383531e-05, + "loss": 0.8431, + "step": 5323 + }, + { + "epoch": 0.39174063002989196, + "grad_norm": 0.69140625, + "learning_rate": 3.344550284960139e-05, + "loss": 0.5652, + "step": 5324 + }, + { + "epoch": 0.3918142101632559, + "grad_norm": 0.859375, + "learning_rate": 3.3440051512113285e-05, + "loss": 0.7254, + "step": 5325 + }, + { + "epoch": 0.3918877902966199, + "grad_norm": 0.87109375, + "learning_rate": 3.343459972166355e-05, + "loss": 0.899, + "step": 5326 + }, + { + "epoch": 0.3919613704299839, + "grad_norm": 0.91796875, + "learning_rate": 3.342914747854478e-05, + "loss": 1.1617, + "step": 5327 + }, + { + "epoch": 0.3920349505633479, + "grad_norm": 0.859375, + "learning_rate": 3.342369478304958e-05, + "loss": 1.0367, + "step": 5328 + }, + { + "epoch": 0.39210853069671187, + "grad_norm": 1.0078125, + "learning_rate": 3.341824163547061e-05, + "loss": 1.0766, + "step": 5329 + }, + { + "epoch": 0.39218211083007587, + "grad_norm": 0.8046875, + "learning_rate": 3.3412788036100504e-05, + "loss": 0.8493, + "step": 5330 + }, + { + "epoch": 0.3922556909634399, + "grad_norm": 0.87109375, + "learning_rate": 3.340733398523195e-05, + "loss": 1.1773, + "step": 5331 + }, + { + "epoch": 0.3923292710968039, + "grad_norm": 0.8125, + "learning_rate": 3.3401879483157655e-05, + "loss": 0.8706, + "step": 5332 + }, + { + "epoch": 0.3924028512301679, + "grad_norm": 1.015625, + "learning_rate": 3.339642453017036e-05, + "loss": 1.0878, + "step": 5333 + }, + { + "epoch": 0.39247643136353183, + "grad_norm": 0.81640625, + "learning_rate": 3.3390969126562824e-05, + "loss": 0.688, + "step": 5334 + }, + { + "epoch": 0.39255001149689583, + "grad_norm": 0.828125, + "learning_rate": 3.3385513272627816e-05, + "loss": 0.8408, + "step": 5335 + }, + { + "epoch": 0.39262359163025984, + "grad_norm": 1.1015625, + "learning_rate": 3.338005696865816e-05, + "loss": 1.3848, + "step": 5336 + }, + { + "epoch": 0.39269717176362384, + "grad_norm": 0.76953125, + "learning_rate": 3.337460021494666e-05, + "loss": 0.8371, + "step": 5337 + }, + { + "epoch": 0.3927707518969878, + "grad_norm": 0.9453125, + "learning_rate": 3.3369143011786195e-05, + "loss": 0.8894, + "step": 5338 + }, + { + "epoch": 0.3928443320303518, + "grad_norm": 0.87109375, + "learning_rate": 3.336368535946963e-05, + "loss": 0.9933, + "step": 5339 + }, + { + "epoch": 0.3929179121637158, + "grad_norm": 0.8203125, + "learning_rate": 3.3358227258289865e-05, + "loss": 0.8759, + "step": 5340 + }, + { + "epoch": 0.3929914922970798, + "grad_norm": 0.859375, + "learning_rate": 3.335276870853983e-05, + "loss": 1.1022, + "step": 5341 + }, + { + "epoch": 0.3930650724304438, + "grad_norm": 0.77734375, + "learning_rate": 3.334730971051247e-05, + "loss": 0.9794, + "step": 5342 + }, + { + "epoch": 0.39313865256380776, + "grad_norm": 0.76171875, + "learning_rate": 3.334185026450077e-05, + "loss": 0.8649, + "step": 5343 + }, + { + "epoch": 0.39321223269717176, + "grad_norm": 0.76171875, + "learning_rate": 3.333639037079772e-05, + "loss": 0.7994, + "step": 5344 + }, + { + "epoch": 0.39328581283053576, + "grad_norm": 0.859375, + "learning_rate": 3.333093002969635e-05, + "loss": 0.9591, + "step": 5345 + }, + { + "epoch": 0.39335939296389977, + "grad_norm": 0.78125, + "learning_rate": 3.3325469241489696e-05, + "loss": 0.8415, + "step": 5346 + }, + { + "epoch": 0.3934329730972637, + "grad_norm": 1.0703125, + "learning_rate": 3.3320008006470825e-05, + "loss": 1.1434, + "step": 5347 + }, + { + "epoch": 0.3935065532306277, + "grad_norm": 0.9296875, + "learning_rate": 3.331454632493284e-05, + "loss": 1.3751, + "step": 5348 + }, + { + "epoch": 0.3935801333639917, + "grad_norm": 0.6484375, + "learning_rate": 3.330908419716886e-05, + "loss": 0.9702, + "step": 5349 + }, + { + "epoch": 0.39365371349735573, + "grad_norm": 1.09375, + "learning_rate": 3.330362162347204e-05, + "loss": 1.0524, + "step": 5350 + }, + { + "epoch": 0.39372729363071973, + "grad_norm": 1.03125, + "learning_rate": 3.329815860413551e-05, + "loss": 0.8381, + "step": 5351 + }, + { + "epoch": 0.3938008737640837, + "grad_norm": 0.875, + "learning_rate": 3.329269513945249e-05, + "loss": 0.9965, + "step": 5352 + }, + { + "epoch": 0.3938744538974477, + "grad_norm": 0.84765625, + "learning_rate": 3.328723122971619e-05, + "loss": 1.1901, + "step": 5353 + }, + { + "epoch": 0.3939480340308117, + "grad_norm": 0.75390625, + "learning_rate": 3.328176687521983e-05, + "loss": 0.9843, + "step": 5354 + }, + { + "epoch": 0.3940216141641757, + "grad_norm": 0.79296875, + "learning_rate": 3.327630207625668e-05, + "loss": 0.827, + "step": 5355 + }, + { + "epoch": 0.39409519429753964, + "grad_norm": 0.89453125, + "learning_rate": 3.327083683312004e-05, + "loss": 0.8483, + "step": 5356 + }, + { + "epoch": 0.39416877443090365, + "grad_norm": 0.7578125, + "learning_rate": 3.326537114610321e-05, + "loss": 1.094, + "step": 5357 + }, + { + "epoch": 0.39424235456426765, + "grad_norm": 0.8828125, + "learning_rate": 3.325990501549952e-05, + "loss": 0.8335, + "step": 5358 + }, + { + "epoch": 0.39431593469763165, + "grad_norm": 0.81640625, + "learning_rate": 3.325443844160233e-05, + "loss": 0.8375, + "step": 5359 + }, + { + "epoch": 0.39438951483099566, + "grad_norm": 0.875, + "learning_rate": 3.324897142470502e-05, + "loss": 0.8763, + "step": 5360 + }, + { + "epoch": 0.3944630949643596, + "grad_norm": 1.390625, + "learning_rate": 3.324350396510099e-05, + "loss": 1.296, + "step": 5361 + }, + { + "epoch": 0.3945366750977236, + "grad_norm": 0.96484375, + "learning_rate": 3.323803606308367e-05, + "loss": 1.3584, + "step": 5362 + }, + { + "epoch": 0.3946102552310876, + "grad_norm": 0.75, + "learning_rate": 3.3232567718946516e-05, + "loss": 0.7868, + "step": 5363 + }, + { + "epoch": 0.3946838353644516, + "grad_norm": 0.84765625, + "learning_rate": 3.3227098932983e-05, + "loss": 0.9081, + "step": 5364 + }, + { + "epoch": 0.39475741549781557, + "grad_norm": 0.7265625, + "learning_rate": 3.3221629705486627e-05, + "loss": 0.8263, + "step": 5365 + }, + { + "epoch": 0.39483099563117957, + "grad_norm": 0.8671875, + "learning_rate": 3.321616003675091e-05, + "loss": 1.1382, + "step": 5366 + }, + { + "epoch": 0.3949045757645436, + "grad_norm": 1.078125, + "learning_rate": 3.32106899270694e-05, + "loss": 1.0956, + "step": 5367 + }, + { + "epoch": 0.3949781558979076, + "grad_norm": 0.80078125, + "learning_rate": 3.320521937673568e-05, + "loss": 0.9621, + "step": 5368 + }, + { + "epoch": 0.3950517360312716, + "grad_norm": 0.80859375, + "learning_rate": 3.319974838604333e-05, + "loss": 0.6868, + "step": 5369 + }, + { + "epoch": 0.39512531616463553, + "grad_norm": 0.6875, + "learning_rate": 3.3194276955285976e-05, + "loss": 1.1196, + "step": 5370 + }, + { + "epoch": 0.39519889629799954, + "grad_norm": 0.796875, + "learning_rate": 3.3188805084757244e-05, + "loss": 0.6754, + "step": 5371 + }, + { + "epoch": 0.39527247643136354, + "grad_norm": 0.703125, + "learning_rate": 3.318333277475081e-05, + "loss": 0.7185, + "step": 5372 + }, + { + "epoch": 0.39534605656472754, + "grad_norm": 0.8671875, + "learning_rate": 3.317786002556037e-05, + "loss": 0.7248, + "step": 5373 + }, + { + "epoch": 0.3954196366980915, + "grad_norm": 0.7578125, + "learning_rate": 3.3172386837479625e-05, + "loss": 1.2999, + "step": 5374 + }, + { + "epoch": 0.3954932168314555, + "grad_norm": 0.76953125, + "learning_rate": 3.316691321080231e-05, + "loss": 0.7879, + "step": 5375 + }, + { + "epoch": 0.3955667969648195, + "grad_norm": 0.90625, + "learning_rate": 3.3161439145822194e-05, + "loss": 0.7453, + "step": 5376 + }, + { + "epoch": 0.3956403770981835, + "grad_norm": 0.94140625, + "learning_rate": 3.315596464283306e-05, + "loss": 0.8616, + "step": 5377 + }, + { + "epoch": 0.3957139572315475, + "grad_norm": 1.078125, + "learning_rate": 3.315048970212869e-05, + "loss": 0.9999, + "step": 5378 + }, + { + "epoch": 0.39578753736491146, + "grad_norm": 1.0078125, + "learning_rate": 3.3145014324002944e-05, + "loss": 1.459, + "step": 5379 + }, + { + "epoch": 0.39586111749827546, + "grad_norm": 0.93359375, + "learning_rate": 3.313953850874966e-05, + "loss": 0.9809, + "step": 5380 + }, + { + "epoch": 0.39593469763163947, + "grad_norm": 1.1171875, + "learning_rate": 3.3134062256662714e-05, + "loss": 0.9415, + "step": 5381 + }, + { + "epoch": 0.39600827776500347, + "grad_norm": 0.86328125, + "learning_rate": 3.3128585568036014e-05, + "loss": 1.1217, + "step": 5382 + }, + { + "epoch": 0.3960818578983674, + "grad_norm": 0.84375, + "learning_rate": 3.3123108443163474e-05, + "loss": 0.6739, + "step": 5383 + }, + { + "epoch": 0.3961554380317314, + "grad_norm": 0.9921875, + "learning_rate": 3.3117630882339054e-05, + "loss": 1.3538, + "step": 5384 + }, + { + "epoch": 0.3962290181650954, + "grad_norm": 0.96875, + "learning_rate": 3.3112152885856716e-05, + "loss": 0.6795, + "step": 5385 + }, + { + "epoch": 0.39630259829845943, + "grad_norm": 1.0390625, + "learning_rate": 3.3106674454010454e-05, + "loss": 1.1909, + "step": 5386 + }, + { + "epoch": 0.39637617843182343, + "grad_norm": 0.8046875, + "learning_rate": 3.310119558709428e-05, + "loss": 0.9258, + "step": 5387 + }, + { + "epoch": 0.3964497585651874, + "grad_norm": 1.3515625, + "learning_rate": 3.309571628540224e-05, + "loss": 1.1506, + "step": 5388 + }, + { + "epoch": 0.3965233386985514, + "grad_norm": 0.890625, + "learning_rate": 3.30902365492284e-05, + "loss": 1.0797, + "step": 5389 + }, + { + "epoch": 0.3965969188319154, + "grad_norm": 0.80078125, + "learning_rate": 3.3084756378866845e-05, + "loss": 0.7262, + "step": 5390 + }, + { + "epoch": 0.3966704989652794, + "grad_norm": 0.65234375, + "learning_rate": 3.307927577461169e-05, + "loss": 0.5526, + "step": 5391 + }, + { + "epoch": 0.39674407909864334, + "grad_norm": 0.8515625, + "learning_rate": 3.3073794736757056e-05, + "loss": 0.8195, + "step": 5392 + }, + { + "epoch": 0.39681765923200735, + "grad_norm": 0.8828125, + "learning_rate": 3.306831326559712e-05, + "loss": 0.8256, + "step": 5393 + }, + { + "epoch": 0.39689123936537135, + "grad_norm": 0.89453125, + "learning_rate": 3.306283136142603e-05, + "loss": 1.4114, + "step": 5394 + }, + { + "epoch": 0.39696481949873536, + "grad_norm": 0.74609375, + "learning_rate": 3.305734902453802e-05, + "loss": 0.7751, + "step": 5395 + }, + { + "epoch": 0.39703839963209936, + "grad_norm": 0.88671875, + "learning_rate": 3.305186625522731e-05, + "loss": 0.7799, + "step": 5396 + }, + { + "epoch": 0.3971119797654633, + "grad_norm": 0.7734375, + "learning_rate": 3.304638305378814e-05, + "loss": 0.937, + "step": 5397 + }, + { + "epoch": 0.3971855598988273, + "grad_norm": 0.75390625, + "learning_rate": 3.3040899420514793e-05, + "loss": 0.6851, + "step": 5398 + }, + { + "epoch": 0.3972591400321913, + "grad_norm": 0.8828125, + "learning_rate": 3.303541535570156e-05, + "loss": 1.0003, + "step": 5399 + }, + { + "epoch": 0.3973327201655553, + "grad_norm": 0.83984375, + "learning_rate": 3.302993085964276e-05, + "loss": 1.0351, + "step": 5400 + }, + { + "epoch": 0.39740630029891927, + "grad_norm": 1.0078125, + "learning_rate": 3.302444593263273e-05, + "loss": 0.8263, + "step": 5401 + }, + { + "epoch": 0.3974798804322833, + "grad_norm": 0.84375, + "learning_rate": 3.3018960574965864e-05, + "loss": 0.7955, + "step": 5402 + }, + { + "epoch": 0.3975534605656473, + "grad_norm": 0.79296875, + "learning_rate": 3.301347478693651e-05, + "loss": 0.8923, + "step": 5403 + }, + { + "epoch": 0.3976270406990113, + "grad_norm": 0.91015625, + "learning_rate": 3.3007988568839104e-05, + "loss": 0.7069, + "step": 5404 + }, + { + "epoch": 0.3977006208323753, + "grad_norm": 0.83203125, + "learning_rate": 3.300250192096808e-05, + "loss": 0.979, + "step": 5405 + }, + { + "epoch": 0.39777420096573923, + "grad_norm": 0.87890625, + "learning_rate": 3.299701484361789e-05, + "loss": 0.9844, + "step": 5406 + }, + { + "epoch": 0.39784778109910324, + "grad_norm": 0.79296875, + "learning_rate": 3.299152733708303e-05, + "loss": 0.7956, + "step": 5407 + }, + { + "epoch": 0.39792136123246724, + "grad_norm": 1.140625, + "learning_rate": 3.298603940165797e-05, + "loss": 1.2359, + "step": 5408 + }, + { + "epoch": 0.39799494136583125, + "grad_norm": 1.140625, + "learning_rate": 3.298055103763727e-05, + "loss": 1.1349, + "step": 5409 + }, + { + "epoch": 0.3980685214991952, + "grad_norm": 0.8359375, + "learning_rate": 3.297506224531547e-05, + "loss": 0.852, + "step": 5410 + }, + { + "epoch": 0.3981421016325592, + "grad_norm": 0.75390625, + "learning_rate": 3.2969573024987136e-05, + "loss": 0.706, + "step": 5411 + }, + { + "epoch": 0.3982156817659232, + "grad_norm": 0.73828125, + "learning_rate": 3.296408337694687e-05, + "loss": 0.7747, + "step": 5412 + }, + { + "epoch": 0.3982892618992872, + "grad_norm": 0.7265625, + "learning_rate": 3.2958593301489296e-05, + "loss": 0.7966, + "step": 5413 + }, + { + "epoch": 0.3983628420326512, + "grad_norm": 1.0546875, + "learning_rate": 3.2953102798909055e-05, + "loss": 1.4578, + "step": 5414 + }, + { + "epoch": 0.39843642216601516, + "grad_norm": 0.859375, + "learning_rate": 3.29476118695008e-05, + "loss": 0.6992, + "step": 5415 + }, + { + "epoch": 0.39851000229937916, + "grad_norm": 1.0234375, + "learning_rate": 3.294212051355923e-05, + "loss": 1.0596, + "step": 5416 + }, + { + "epoch": 0.39858358243274317, + "grad_norm": 1.1484375, + "learning_rate": 3.293662873137906e-05, + "loss": 1.3676, + "step": 5417 + }, + { + "epoch": 0.39865716256610717, + "grad_norm": 0.671875, + "learning_rate": 3.293113652325501e-05, + "loss": 0.8509, + "step": 5418 + }, + { + "epoch": 0.3987307426994711, + "grad_norm": 0.90234375, + "learning_rate": 3.292564388948184e-05, + "loss": 0.7004, + "step": 5419 + }, + { + "epoch": 0.3988043228328351, + "grad_norm": 0.80078125, + "learning_rate": 3.292015083035433e-05, + "loss": 0.8331, + "step": 5420 + }, + { + "epoch": 0.3988779029661991, + "grad_norm": 0.81640625, + "learning_rate": 3.291465734616729e-05, + "loss": 0.736, + "step": 5421 + }, + { + "epoch": 0.39895148309956313, + "grad_norm": 1.0546875, + "learning_rate": 3.290916343721553e-05, + "loss": 1.1495, + "step": 5422 + }, + { + "epoch": 0.39902506323292714, + "grad_norm": 0.79296875, + "learning_rate": 3.2903669103793916e-05, + "loss": 0.5683, + "step": 5423 + }, + { + "epoch": 0.3990986433662911, + "grad_norm": 0.8515625, + "learning_rate": 3.2898174346197306e-05, + "loss": 1.2228, + "step": 5424 + }, + { + "epoch": 0.3991722234996551, + "grad_norm": 1.3359375, + "learning_rate": 3.289267916472059e-05, + "loss": 1.1784, + "step": 5425 + }, + { + "epoch": 0.3992458036330191, + "grad_norm": 0.9453125, + "learning_rate": 3.288718355965871e-05, + "loss": 0.9894, + "step": 5426 + }, + { + "epoch": 0.3993193837663831, + "grad_norm": 0.75390625, + "learning_rate": 3.288168753130657e-05, + "loss": 0.9713, + "step": 5427 + }, + { + "epoch": 0.39939296389974704, + "grad_norm": 0.6875, + "learning_rate": 3.287619107995914e-05, + "loss": 0.9314, + "step": 5428 + }, + { + "epoch": 0.39946654403311105, + "grad_norm": 0.9140625, + "learning_rate": 3.287069420591142e-05, + "loss": 1.343, + "step": 5429 + }, + { + "epoch": 0.39954012416647505, + "grad_norm": 0.65625, + "learning_rate": 3.286519690945841e-05, + "loss": 0.7085, + "step": 5430 + }, + { + "epoch": 0.39961370429983906, + "grad_norm": 0.9296875, + "learning_rate": 3.2859699190895135e-05, + "loss": 0.9537, + "step": 5431 + }, + { + "epoch": 0.39968728443320306, + "grad_norm": 0.84375, + "learning_rate": 3.285420105051665e-05, + "loss": 0.7191, + "step": 5432 + }, + { + "epoch": 0.399760864566567, + "grad_norm": 0.625, + "learning_rate": 3.2848702488618025e-05, + "loss": 0.4963, + "step": 5433 + }, + { + "epoch": 0.399834444699931, + "grad_norm": 0.76953125, + "learning_rate": 3.284320350549436e-05, + "loss": 0.707, + "step": 5434 + }, + { + "epoch": 0.399908024833295, + "grad_norm": 1.078125, + "learning_rate": 3.283770410144078e-05, + "loss": 1.1303, + "step": 5435 + }, + { + "epoch": 0.399981604966659, + "grad_norm": 0.9921875, + "learning_rate": 3.2832204276752435e-05, + "loss": 1.1261, + "step": 5436 + }, + { + "epoch": 0.40005518510002297, + "grad_norm": 0.82421875, + "learning_rate": 3.282670403172447e-05, + "loss": 1.2074, + "step": 5437 + }, + { + "epoch": 0.400128765233387, + "grad_norm": 0.859375, + "learning_rate": 3.282120336665207e-05, + "loss": 0.7685, + "step": 5438 + }, + { + "epoch": 0.400202345366751, + "grad_norm": 0.9765625, + "learning_rate": 3.281570228183047e-05, + "loss": 1.1608, + "step": 5439 + }, + { + "epoch": 0.400275925500115, + "grad_norm": 0.84765625, + "learning_rate": 3.2810200777554887e-05, + "loss": 0.7845, + "step": 5440 + }, + { + "epoch": 0.400349505633479, + "grad_norm": 0.96484375, + "learning_rate": 3.280469885412058e-05, + "loss": 0.9856, + "step": 5441 + }, + { + "epoch": 0.40042308576684293, + "grad_norm": 0.875, + "learning_rate": 3.2799196511822835e-05, + "loss": 0.8395, + "step": 5442 + }, + { + "epoch": 0.40049666590020694, + "grad_norm": 0.87109375, + "learning_rate": 3.279369375095694e-05, + "loss": 0.8938, + "step": 5443 + }, + { + "epoch": 0.40057024603357094, + "grad_norm": 0.87109375, + "learning_rate": 3.278819057181821e-05, + "loss": 1.1752, + "step": 5444 + }, + { + "epoch": 0.40064382616693495, + "grad_norm": 1.0078125, + "learning_rate": 3.278268697470202e-05, + "loss": 1.0623, + "step": 5445 + }, + { + "epoch": 0.4007174063002989, + "grad_norm": 0.890625, + "learning_rate": 3.2777182959903705e-05, + "loss": 0.7103, + "step": 5446 + }, + { + "epoch": 0.4007909864336629, + "grad_norm": 0.859375, + "learning_rate": 3.277167852771867e-05, + "loss": 0.6163, + "step": 5447 + }, + { + "epoch": 0.4008645665670269, + "grad_norm": 0.8671875, + "learning_rate": 3.276617367844233e-05, + "loss": 0.9038, + "step": 5448 + }, + { + "epoch": 0.4009381467003909, + "grad_norm": 0.6953125, + "learning_rate": 3.276066841237012e-05, + "loss": 0.7005, + "step": 5449 + }, + { + "epoch": 0.4010117268337549, + "grad_norm": 0.8984375, + "learning_rate": 3.275516272979749e-05, + "loss": 1.1644, + "step": 5450 + }, + { + "epoch": 0.40108530696711886, + "grad_norm": 0.90234375, + "learning_rate": 3.274965663101993e-05, + "loss": 1.0017, + "step": 5451 + }, + { + "epoch": 0.40115888710048286, + "grad_norm": 0.984375, + "learning_rate": 3.274415011633294e-05, + "loss": 1.1346, + "step": 5452 + }, + { + "epoch": 0.40123246723384687, + "grad_norm": 0.890625, + "learning_rate": 3.2738643186032026e-05, + "loss": 0.9945, + "step": 5453 + }, + { + "epoch": 0.40130604736721087, + "grad_norm": 0.8671875, + "learning_rate": 3.273313584041276e-05, + "loss": 0.8145, + "step": 5454 + }, + { + "epoch": 0.4013796275005748, + "grad_norm": 1.046875, + "learning_rate": 3.272762807977068e-05, + "loss": 1.1186, + "step": 5455 + }, + { + "epoch": 0.4014532076339388, + "grad_norm": 0.91796875, + "learning_rate": 3.272211990440141e-05, + "loss": 1.1701, + "step": 5456 + }, + { + "epoch": 0.40152678776730283, + "grad_norm": 0.8671875, + "learning_rate": 3.2716611314600554e-05, + "loss": 0.8724, + "step": 5457 + }, + { + "epoch": 0.40160036790066683, + "grad_norm": 0.765625, + "learning_rate": 3.271110231066374e-05, + "loss": 0.9826, + "step": 5458 + }, + { + "epoch": 0.40167394803403084, + "grad_norm": 1.09375, + "learning_rate": 3.270559289288663e-05, + "loss": 0.9174, + "step": 5459 + }, + { + "epoch": 0.4017475281673948, + "grad_norm": 0.7265625, + "learning_rate": 3.27000830615649e-05, + "loss": 1.2029, + "step": 5460 + }, + { + "epoch": 0.4018211083007588, + "grad_norm": 0.796875, + "learning_rate": 3.269457281699426e-05, + "loss": 0.7479, + "step": 5461 + }, + { + "epoch": 0.4018946884341228, + "grad_norm": 0.70703125, + "learning_rate": 3.268906215947042e-05, + "loss": 0.7568, + "step": 5462 + }, + { + "epoch": 0.4019682685674868, + "grad_norm": 0.8359375, + "learning_rate": 3.2683551089289144e-05, + "loss": 0.8404, + "step": 5463 + }, + { + "epoch": 0.40204184870085075, + "grad_norm": 1.015625, + "learning_rate": 3.267803960674619e-05, + "loss": 1.1092, + "step": 5464 + }, + { + "epoch": 0.40211542883421475, + "grad_norm": 0.9765625, + "learning_rate": 3.2672527712137356e-05, + "loss": 1.0652, + "step": 5465 + }, + { + "epoch": 0.40218900896757875, + "grad_norm": 0.8828125, + "learning_rate": 3.2667015405758453e-05, + "loss": 1.0759, + "step": 5466 + }, + { + "epoch": 0.40226258910094276, + "grad_norm": 0.8125, + "learning_rate": 3.266150268790531e-05, + "loss": 1.0637, + "step": 5467 + }, + { + "epoch": 0.40233616923430676, + "grad_norm": 1.046875, + "learning_rate": 3.265598955887379e-05, + "loss": 1.2914, + "step": 5468 + }, + { + "epoch": 0.4024097493676707, + "grad_norm": 0.8828125, + "learning_rate": 3.2650476018959765e-05, + "loss": 0.7047, + "step": 5469 + }, + { + "epoch": 0.4024833295010347, + "grad_norm": 1.0234375, + "learning_rate": 3.264496206845915e-05, + "loss": 1.0054, + "step": 5470 + }, + { + "epoch": 0.4025569096343987, + "grad_norm": 0.921875, + "learning_rate": 3.263944770766785e-05, + "loss": 1.0365, + "step": 5471 + }, + { + "epoch": 0.4026304897677627, + "grad_norm": 0.98046875, + "learning_rate": 3.2633932936881825e-05, + "loss": 1.5661, + "step": 5472 + }, + { + "epoch": 0.40270406990112667, + "grad_norm": 0.90234375, + "learning_rate": 3.262841775639705e-05, + "loss": 1.0951, + "step": 5473 + }, + { + "epoch": 0.4027776500344907, + "grad_norm": 0.78515625, + "learning_rate": 3.262290216650948e-05, + "loss": 0.7258, + "step": 5474 + }, + { + "epoch": 0.4028512301678547, + "grad_norm": 0.765625, + "learning_rate": 3.2617386167515164e-05, + "loss": 0.8595, + "step": 5475 + }, + { + "epoch": 0.4029248103012187, + "grad_norm": 0.7578125, + "learning_rate": 3.261186975971011e-05, + "loss": 0.6583, + "step": 5476 + }, + { + "epoch": 0.4029983904345827, + "grad_norm": 0.828125, + "learning_rate": 3.260635294339039e-05, + "loss": 0.9518, + "step": 5477 + }, + { + "epoch": 0.40307197056794664, + "grad_norm": 1.0, + "learning_rate": 3.260083571885207e-05, + "loss": 1.1576, + "step": 5478 + }, + { + "epoch": 0.40314555070131064, + "grad_norm": 0.8828125, + "learning_rate": 3.259531808639126e-05, + "loss": 0.9954, + "step": 5479 + }, + { + "epoch": 0.40321913083467464, + "grad_norm": 0.83984375, + "learning_rate": 3.258980004630407e-05, + "loss": 1.269, + "step": 5480 + }, + { + "epoch": 0.40329271096803865, + "grad_norm": 1.1953125, + "learning_rate": 3.258428159888664e-05, + "loss": 1.5506, + "step": 5481 + }, + { + "epoch": 0.4033662911014026, + "grad_norm": 1.0625, + "learning_rate": 3.257876274443515e-05, + "loss": 1.0325, + "step": 5482 + }, + { + "epoch": 0.4034398712347666, + "grad_norm": 0.8359375, + "learning_rate": 3.2573243483245774e-05, + "loss": 0.778, + "step": 5483 + }, + { + "epoch": 0.4035134513681306, + "grad_norm": 0.8515625, + "learning_rate": 3.2567723815614726e-05, + "loss": 0.7232, + "step": 5484 + }, + { + "epoch": 0.4035870315014946, + "grad_norm": 1.0, + "learning_rate": 3.256220374183823e-05, + "loss": 1.0646, + "step": 5485 + }, + { + "epoch": 0.4036606116348586, + "grad_norm": 0.796875, + "learning_rate": 3.255668326221255e-05, + "loss": 1.1485, + "step": 5486 + }, + { + "epoch": 0.40373419176822256, + "grad_norm": 0.91796875, + "learning_rate": 3.2551162377033945e-05, + "loss": 1.0045, + "step": 5487 + }, + { + "epoch": 0.40380777190158657, + "grad_norm": 0.73046875, + "learning_rate": 3.254564108659872e-05, + "loss": 0.6102, + "step": 5488 + }, + { + "epoch": 0.40388135203495057, + "grad_norm": 0.9609375, + "learning_rate": 3.2540119391203186e-05, + "loss": 1.3953, + "step": 5489 + }, + { + "epoch": 0.4039549321683146, + "grad_norm": 0.84765625, + "learning_rate": 3.253459729114368e-05, + "loss": 0.9125, + "step": 5490 + }, + { + "epoch": 0.4040285123016785, + "grad_norm": 1.0078125, + "learning_rate": 3.252907478671659e-05, + "loss": 1.0289, + "step": 5491 + }, + { + "epoch": 0.4041020924350425, + "grad_norm": 0.7734375, + "learning_rate": 3.252355187821825e-05, + "loss": 0.8511, + "step": 5492 + }, + { + "epoch": 0.40417567256840653, + "grad_norm": 1.015625, + "learning_rate": 3.2518028565945104e-05, + "loss": 1.0297, + "step": 5493 + }, + { + "epoch": 0.40424925270177053, + "grad_norm": 0.76953125, + "learning_rate": 3.251250485019357e-05, + "loss": 0.7107, + "step": 5494 + }, + { + "epoch": 0.40432283283513454, + "grad_norm": 0.72265625, + "learning_rate": 3.250698073126008e-05, + "loss": 0.6151, + "step": 5495 + }, + { + "epoch": 0.4043964129684985, + "grad_norm": 0.859375, + "learning_rate": 3.2501456209441114e-05, + "loss": 0.717, + "step": 5496 + }, + { + "epoch": 0.4044699931018625, + "grad_norm": 0.8359375, + "learning_rate": 3.2495931285033166e-05, + "loss": 0.842, + "step": 5497 + }, + { + "epoch": 0.4045435732352265, + "grad_norm": 0.91015625, + "learning_rate": 3.249040595833274e-05, + "loss": 0.8624, + "step": 5498 + }, + { + "epoch": 0.4046171533685905, + "grad_norm": 0.7734375, + "learning_rate": 3.2484880229636375e-05, + "loss": 0.7128, + "step": 5499 + }, + { + "epoch": 0.40469073350195445, + "grad_norm": 0.85546875, + "learning_rate": 3.247935409924063e-05, + "loss": 0.6546, + "step": 5500 + }, + { + "epoch": 0.40476431363531845, + "grad_norm": 0.7421875, + "learning_rate": 3.247382756744207e-05, + "loss": 0.7038, + "step": 5501 + }, + { + "epoch": 0.40483789376868246, + "grad_norm": 1.1171875, + "learning_rate": 3.24683006345373e-05, + "loss": 1.0853, + "step": 5502 + }, + { + "epoch": 0.40491147390204646, + "grad_norm": 0.984375, + "learning_rate": 3.246277330082295e-05, + "loss": 0.7791, + "step": 5503 + }, + { + "epoch": 0.40498505403541046, + "grad_norm": 0.67578125, + "learning_rate": 3.245724556659564e-05, + "loss": 0.7281, + "step": 5504 + }, + { + "epoch": 0.4050586341687744, + "grad_norm": 1.0234375, + "learning_rate": 3.245171743215205e-05, + "loss": 0.8079, + "step": 5505 + }, + { + "epoch": 0.4051322143021384, + "grad_norm": 0.96875, + "learning_rate": 3.244618889778886e-05, + "loss": 0.9249, + "step": 5506 + }, + { + "epoch": 0.4052057944355024, + "grad_norm": 0.625, + "learning_rate": 3.2440659963802785e-05, + "loss": 0.7506, + "step": 5507 + }, + { + "epoch": 0.4052793745688664, + "grad_norm": 0.84375, + "learning_rate": 3.243513063049053e-05, + "loss": 0.8894, + "step": 5508 + }, + { + "epoch": 0.4053529547022304, + "grad_norm": 0.84375, + "learning_rate": 3.242960089814886e-05, + "loss": 1.2063, + "step": 5509 + }, + { + "epoch": 0.4054265348355944, + "grad_norm": 0.87890625, + "learning_rate": 3.2424070767074546e-05, + "loss": 1.0831, + "step": 5510 + }, + { + "epoch": 0.4055001149689584, + "grad_norm": 0.89453125, + "learning_rate": 3.241854023756437e-05, + "loss": 0.9957, + "step": 5511 + }, + { + "epoch": 0.4055736951023224, + "grad_norm": 0.8359375, + "learning_rate": 3.241300930991515e-05, + "loss": 0.7813, + "step": 5512 + }, + { + "epoch": 0.4056472752356864, + "grad_norm": 0.73046875, + "learning_rate": 3.2407477984423726e-05, + "loss": 0.9488, + "step": 5513 + }, + { + "epoch": 0.40572085536905034, + "grad_norm": 1.109375, + "learning_rate": 3.240194626138696e-05, + "loss": 1.2827, + "step": 5514 + }, + { + "epoch": 0.40579443550241434, + "grad_norm": 0.90625, + "learning_rate": 3.239641414110171e-05, + "loss": 1.1106, + "step": 5515 + }, + { + "epoch": 0.40586801563577835, + "grad_norm": 0.96875, + "learning_rate": 3.239088162386487e-05, + "loss": 1.4845, + "step": 5516 + }, + { + "epoch": 0.40594159576914235, + "grad_norm": 0.80078125, + "learning_rate": 3.238534870997338e-05, + "loss": 0.9536, + "step": 5517 + }, + { + "epoch": 0.4060151759025063, + "grad_norm": 0.828125, + "learning_rate": 3.237981539972418e-05, + "loss": 1.1186, + "step": 5518 + }, + { + "epoch": 0.4060887560358703, + "grad_norm": 0.890625, + "learning_rate": 3.2374281693414226e-05, + "loss": 0.8825, + "step": 5519 + }, + { + "epoch": 0.4061623361692343, + "grad_norm": 0.79296875, + "learning_rate": 3.2368747591340496e-05, + "loss": 0.6731, + "step": 5520 + }, + { + "epoch": 0.4062359163025983, + "grad_norm": 0.84375, + "learning_rate": 3.2363213093800006e-05, + "loss": 1.0177, + "step": 5521 + }, + { + "epoch": 0.4063094964359623, + "grad_norm": 1.0625, + "learning_rate": 3.235767820108977e-05, + "loss": 0.8926, + "step": 5522 + }, + { + "epoch": 0.40638307656932626, + "grad_norm": 0.7421875, + "learning_rate": 3.2352142913506846e-05, + "loss": 0.6149, + "step": 5523 + }, + { + "epoch": 0.40645665670269027, + "grad_norm": 0.93359375, + "learning_rate": 3.23466072313483e-05, + "loss": 1.2331, + "step": 5524 + }, + { + "epoch": 0.40653023683605427, + "grad_norm": 0.859375, + "learning_rate": 3.2341071154911215e-05, + "loss": 1.1672, + "step": 5525 + }, + { + "epoch": 0.4066038169694183, + "grad_norm": 0.734375, + "learning_rate": 3.233553468449271e-05, + "loss": 0.7292, + "step": 5526 + }, + { + "epoch": 0.4066773971027822, + "grad_norm": 0.8125, + "learning_rate": 3.232999782038991e-05, + "loss": 0.6549, + "step": 5527 + }, + { + "epoch": 0.4067509772361462, + "grad_norm": 0.94921875, + "learning_rate": 3.232446056289997e-05, + "loss": 1.1656, + "step": 5528 + }, + { + "epoch": 0.40682455736951023, + "grad_norm": 1.046875, + "learning_rate": 3.231892291232007e-05, + "loss": 1.3971, + "step": 5529 + }, + { + "epoch": 0.40689813750287424, + "grad_norm": 0.86328125, + "learning_rate": 3.23133848689474e-05, + "loss": 1.2544, + "step": 5530 + }, + { + "epoch": 0.40697171763623824, + "grad_norm": 0.87109375, + "learning_rate": 3.230784643307917e-05, + "loss": 0.8279, + "step": 5531 + }, + { + "epoch": 0.4070452977696022, + "grad_norm": 0.73046875, + "learning_rate": 3.230230760501264e-05, + "loss": 0.7329, + "step": 5532 + }, + { + "epoch": 0.4071188779029662, + "grad_norm": 0.95703125, + "learning_rate": 3.229676838504504e-05, + "loss": 1.2462, + "step": 5533 + }, + { + "epoch": 0.4071924580363302, + "grad_norm": 0.8515625, + "learning_rate": 3.229122877347366e-05, + "loss": 1.0062, + "step": 5534 + }, + { + "epoch": 0.4072660381696942, + "grad_norm": 0.80078125, + "learning_rate": 3.2285688770595815e-05, + "loss": 0.9326, + "step": 5535 + }, + { + "epoch": 0.40733961830305815, + "grad_norm": 0.94140625, + "learning_rate": 3.228014837670881e-05, + "loss": 0.763, + "step": 5536 + }, + { + "epoch": 0.40741319843642215, + "grad_norm": 0.73828125, + "learning_rate": 3.227460759210999e-05, + "loss": 0.7049, + "step": 5537 + }, + { + "epoch": 0.40748677856978616, + "grad_norm": 0.86328125, + "learning_rate": 3.2269066417096715e-05, + "loss": 0.78, + "step": 5538 + }, + { + "epoch": 0.40756035870315016, + "grad_norm": 0.91015625, + "learning_rate": 3.2263524851966385e-05, + "loss": 0.9355, + "step": 5539 + }, + { + "epoch": 0.40763393883651416, + "grad_norm": 0.85546875, + "learning_rate": 3.2257982897016395e-05, + "loss": 0.9373, + "step": 5540 + }, + { + "epoch": 0.4077075189698781, + "grad_norm": 0.71484375, + "learning_rate": 3.225244055254416e-05, + "loss": 0.722, + "step": 5541 + }, + { + "epoch": 0.4077810991032421, + "grad_norm": 0.83984375, + "learning_rate": 3.224689781884715e-05, + "loss": 0.9293, + "step": 5542 + }, + { + "epoch": 0.4078546792366061, + "grad_norm": 0.8515625, + "learning_rate": 3.224135469622282e-05, + "loss": 1.1588, + "step": 5543 + }, + { + "epoch": 0.4079282593699701, + "grad_norm": 0.7265625, + "learning_rate": 3.223581118496865e-05, + "loss": 0.8876, + "step": 5544 + }, + { + "epoch": 0.4080018395033341, + "grad_norm": 0.953125, + "learning_rate": 3.223026728538216e-05, + "loss": 1.0028, + "step": 5545 + }, + { + "epoch": 0.4080754196366981, + "grad_norm": 0.859375, + "learning_rate": 3.22247229977609e-05, + "loss": 1.0211, + "step": 5546 + }, + { + "epoch": 0.4081489997700621, + "grad_norm": 0.70703125, + "learning_rate": 3.221917832240239e-05, + "loss": 0.8526, + "step": 5547 + }, + { + "epoch": 0.4082225799034261, + "grad_norm": 0.86328125, + "learning_rate": 3.221363325960422e-05, + "loss": 0.8572, + "step": 5548 + }, + { + "epoch": 0.4082961600367901, + "grad_norm": 0.6875, + "learning_rate": 3.2208087809663966e-05, + "loss": 0.5536, + "step": 5549 + }, + { + "epoch": 0.40836974017015404, + "grad_norm": 0.8359375, + "learning_rate": 3.2202541972879256e-05, + "loss": 0.7928, + "step": 5550 + }, + { + "epoch": 0.40844332030351804, + "grad_norm": 0.70703125, + "learning_rate": 3.2196995749547725e-05, + "loss": 0.8852, + "step": 5551 + }, + { + "epoch": 0.40851690043688205, + "grad_norm": 0.91015625, + "learning_rate": 3.219144913996702e-05, + "loss": 0.8989, + "step": 5552 + }, + { + "epoch": 0.40859048057024605, + "grad_norm": 0.78125, + "learning_rate": 3.218590214443484e-05, + "loss": 0.8728, + "step": 5553 + }, + { + "epoch": 0.40866406070361, + "grad_norm": 1.0, + "learning_rate": 3.218035476324885e-05, + "loss": 1.4778, + "step": 5554 + }, + { + "epoch": 0.408737640836974, + "grad_norm": 0.91796875, + "learning_rate": 3.2174806996706785e-05, + "loss": 1.132, + "step": 5555 + }, + { + "epoch": 0.408811220970338, + "grad_norm": 0.6875, + "learning_rate": 3.216925884510638e-05, + "loss": 0.6525, + "step": 5556 + }, + { + "epoch": 0.408884801103702, + "grad_norm": 1.0546875, + "learning_rate": 3.2163710308745385e-05, + "loss": 1.2691, + "step": 5557 + }, + { + "epoch": 0.408958381237066, + "grad_norm": 1.0078125, + "learning_rate": 3.21581613879216e-05, + "loss": 1.1924, + "step": 5558 + }, + { + "epoch": 0.40903196137042996, + "grad_norm": 0.8203125, + "learning_rate": 3.215261208293281e-05, + "loss": 1.179, + "step": 5559 + }, + { + "epoch": 0.40910554150379397, + "grad_norm": 0.78515625, + "learning_rate": 3.214706239407684e-05, + "loss": 0.7966, + "step": 5560 + }, + { + "epoch": 0.40917912163715797, + "grad_norm": 0.7734375, + "learning_rate": 3.214151232165152e-05, + "loss": 0.7218, + "step": 5561 + }, + { + "epoch": 0.409252701770522, + "grad_norm": 1.15625, + "learning_rate": 3.213596186595474e-05, + "loss": 1.6786, + "step": 5562 + }, + { + "epoch": 0.4093262819038859, + "grad_norm": 0.78515625, + "learning_rate": 3.213041102728435e-05, + "loss": 0.7659, + "step": 5563 + }, + { + "epoch": 0.40939986203724993, + "grad_norm": 0.97265625, + "learning_rate": 3.212485980593827e-05, + "loss": 0.9232, + "step": 5564 + }, + { + "epoch": 0.40947344217061393, + "grad_norm": 1.0859375, + "learning_rate": 3.211930820221443e-05, + "loss": 1.1072, + "step": 5565 + }, + { + "epoch": 0.40954702230397794, + "grad_norm": 0.73046875, + "learning_rate": 3.211375621641075e-05, + "loss": 0.8378, + "step": 5566 + }, + { + "epoch": 0.40962060243734194, + "grad_norm": 0.86328125, + "learning_rate": 3.210820384882522e-05, + "loss": 0.8205, + "step": 5567 + }, + { + "epoch": 0.4096941825707059, + "grad_norm": 0.87890625, + "learning_rate": 3.2102651099755815e-05, + "loss": 1.1259, + "step": 5568 + }, + { + "epoch": 0.4097677627040699, + "grad_norm": 0.8125, + "learning_rate": 3.209709796950054e-05, + "loss": 1.0587, + "step": 5569 + }, + { + "epoch": 0.4098413428374339, + "grad_norm": 0.7421875, + "learning_rate": 3.209154445835742e-05, + "loss": 0.7716, + "step": 5570 + }, + { + "epoch": 0.4099149229707979, + "grad_norm": 1.03125, + "learning_rate": 3.20859905666245e-05, + "loss": 1.0481, + "step": 5571 + }, + { + "epoch": 0.40998850310416185, + "grad_norm": 0.6484375, + "learning_rate": 3.208043629459986e-05, + "loss": 0.5851, + "step": 5572 + }, + { + "epoch": 0.41006208323752585, + "grad_norm": 0.86328125, + "learning_rate": 3.2074881642581564e-05, + "loss": 1.0272, + "step": 5573 + }, + { + "epoch": 0.41013566337088986, + "grad_norm": 0.7421875, + "learning_rate": 3.206932661086775e-05, + "loss": 0.9631, + "step": 5574 + }, + { + "epoch": 0.41020924350425386, + "grad_norm": 0.71875, + "learning_rate": 3.206377119975651e-05, + "loss": 0.612, + "step": 5575 + }, + { + "epoch": 0.41028282363761787, + "grad_norm": 0.84765625, + "learning_rate": 3.205821540954602e-05, + "loss": 0.769, + "step": 5576 + }, + { + "epoch": 0.4103564037709818, + "grad_norm": 0.734375, + "learning_rate": 3.205265924053443e-05, + "loss": 0.9553, + "step": 5577 + }, + { + "epoch": 0.4104299839043458, + "grad_norm": 0.87890625, + "learning_rate": 3.204710269301996e-05, + "loss": 0.867, + "step": 5578 + }, + { + "epoch": 0.4105035640377098, + "grad_norm": 0.7890625, + "learning_rate": 3.2041545767300785e-05, + "loss": 0.8351, + "step": 5579 + }, + { + "epoch": 0.4105771441710738, + "grad_norm": 0.70703125, + "learning_rate": 3.203598846367515e-05, + "loss": 0.8639, + "step": 5580 + }, + { + "epoch": 0.4106507243044378, + "grad_norm": 0.87890625, + "learning_rate": 3.20304307824413e-05, + "loss": 0.6448, + "step": 5581 + }, + { + "epoch": 0.4107243044378018, + "grad_norm": 0.78125, + "learning_rate": 3.2024872723897514e-05, + "loss": 0.5816, + "step": 5582 + }, + { + "epoch": 0.4107978845711658, + "grad_norm": 0.828125, + "learning_rate": 3.2019314288342075e-05, + "loss": 0.8489, + "step": 5583 + }, + { + "epoch": 0.4108714647045298, + "grad_norm": 1.1015625, + "learning_rate": 3.2013755476073294e-05, + "loss": 0.9564, + "step": 5584 + }, + { + "epoch": 0.4109450448378938, + "grad_norm": 0.87890625, + "learning_rate": 3.200819628738951e-05, + "loss": 1.0322, + "step": 5585 + }, + { + "epoch": 0.41101862497125774, + "grad_norm": 0.828125, + "learning_rate": 3.200263672258906e-05, + "loss": 0.9438, + "step": 5586 + }, + { + "epoch": 0.41109220510462174, + "grad_norm": 0.8671875, + "learning_rate": 3.199707678197033e-05, + "loss": 0.9212, + "step": 5587 + }, + { + "epoch": 0.41116578523798575, + "grad_norm": 0.96875, + "learning_rate": 3.1991516465831696e-05, + "loss": 1.0347, + "step": 5588 + }, + { + "epoch": 0.41123936537134975, + "grad_norm": 0.8359375, + "learning_rate": 3.198595577447159e-05, + "loss": 1.0681, + "step": 5589 + }, + { + "epoch": 0.4113129455047137, + "grad_norm": 0.8515625, + "learning_rate": 3.1980394708188425e-05, + "loss": 0.8626, + "step": 5590 + }, + { + "epoch": 0.4113865256380777, + "grad_norm": 4.09375, + "learning_rate": 3.197483326728065e-05, + "loss": 0.9472, + "step": 5591 + }, + { + "epoch": 0.4114601057714417, + "grad_norm": 0.8203125, + "learning_rate": 3.196927145204676e-05, + "loss": 0.9151, + "step": 5592 + }, + { + "epoch": 0.4115336859048057, + "grad_norm": 1.125, + "learning_rate": 3.1963709262785234e-05, + "loss": 1.0061, + "step": 5593 + }, + { + "epoch": 0.4116072660381697, + "grad_norm": 0.6953125, + "learning_rate": 3.1958146699794586e-05, + "loss": 0.7624, + "step": 5594 + }, + { + "epoch": 0.41168084617153367, + "grad_norm": 0.98046875, + "learning_rate": 3.195258376337334e-05, + "loss": 1.3725, + "step": 5595 + }, + { + "epoch": 0.41175442630489767, + "grad_norm": 0.78125, + "learning_rate": 3.1947020453820066e-05, + "loss": 0.8691, + "step": 5596 + }, + { + "epoch": 0.4118280064382617, + "grad_norm": 0.95703125, + "learning_rate": 3.194145677143331e-05, + "loss": 1.0105, + "step": 5597 + }, + { + "epoch": 0.4119015865716257, + "grad_norm": 0.75390625, + "learning_rate": 3.193589271651169e-05, + "loss": 0.9358, + "step": 5598 + }, + { + "epoch": 0.4119751667049896, + "grad_norm": 0.890625, + "learning_rate": 3.1930328289353805e-05, + "loss": 1.1158, + "step": 5599 + }, + { + "epoch": 0.41204874683835363, + "grad_norm": 0.890625, + "learning_rate": 3.192476349025829e-05, + "loss": 0.7623, + "step": 5600 + }, + { + "epoch": 0.41212232697171763, + "grad_norm": 0.86328125, + "learning_rate": 3.1919198319523805e-05, + "loss": 0.918, + "step": 5601 + }, + { + "epoch": 0.41219590710508164, + "grad_norm": 0.8359375, + "learning_rate": 3.1913632777449e-05, + "loss": 0.7094, + "step": 5602 + }, + { + "epoch": 0.41226948723844564, + "grad_norm": 1.03125, + "learning_rate": 3.190806686433259e-05, + "loss": 0.9955, + "step": 5603 + }, + { + "epoch": 0.4123430673718096, + "grad_norm": 1.0859375, + "learning_rate": 3.190250058047328e-05, + "loss": 1.1954, + "step": 5604 + }, + { + "epoch": 0.4124166475051736, + "grad_norm": 1.03125, + "learning_rate": 3.18969339261698e-05, + "loss": 1.2955, + "step": 5605 + }, + { + "epoch": 0.4124902276385376, + "grad_norm": 0.94140625, + "learning_rate": 3.1891366901720896e-05, + "loss": 1.2679, + "step": 5606 + }, + { + "epoch": 0.4125638077719016, + "grad_norm": 0.85546875, + "learning_rate": 3.188579950742535e-05, + "loss": 1.3088, + "step": 5607 + }, + { + "epoch": 0.41263738790526555, + "grad_norm": 0.95703125, + "learning_rate": 3.188023174358196e-05, + "loss": 1.1992, + "step": 5608 + }, + { + "epoch": 0.41271096803862956, + "grad_norm": 0.89453125, + "learning_rate": 3.187466361048952e-05, + "loss": 1.1695, + "step": 5609 + }, + { + "epoch": 0.41278454817199356, + "grad_norm": 1.0078125, + "learning_rate": 3.1869095108446864e-05, + "loss": 1.1898, + "step": 5610 + }, + { + "epoch": 0.41285812830535756, + "grad_norm": 0.78125, + "learning_rate": 3.186352623775285e-05, + "loss": 0.981, + "step": 5611 + }, + { + "epoch": 0.41293170843872157, + "grad_norm": 0.73046875, + "learning_rate": 3.185795699870635e-05, + "loss": 0.73, + "step": 5612 + }, + { + "epoch": 0.4130052885720855, + "grad_norm": 0.921875, + "learning_rate": 3.185238739160624e-05, + "loss": 0.8553, + "step": 5613 + }, + { + "epoch": 0.4130788687054495, + "grad_norm": 0.88671875, + "learning_rate": 3.184681741675145e-05, + "loss": 0.7428, + "step": 5614 + }, + { + "epoch": 0.4131524488388135, + "grad_norm": 1.1171875, + "learning_rate": 3.1841247074440906e-05, + "loss": 0.9858, + "step": 5615 + }, + { + "epoch": 0.41322602897217753, + "grad_norm": 0.96484375, + "learning_rate": 3.1835676364973546e-05, + "loss": 0.9968, + "step": 5616 + }, + { + "epoch": 0.4132996091055415, + "grad_norm": 1.0234375, + "learning_rate": 3.183010528864835e-05, + "loss": 0.9381, + "step": 5617 + }, + { + "epoch": 0.4133731892389055, + "grad_norm": 0.77734375, + "learning_rate": 3.182453384576429e-05, + "loss": 0.7072, + "step": 5618 + }, + { + "epoch": 0.4134467693722695, + "grad_norm": 0.78515625, + "learning_rate": 3.18189620366204e-05, + "loss": 0.8605, + "step": 5619 + }, + { + "epoch": 0.4135203495056335, + "grad_norm": 0.6484375, + "learning_rate": 3.1813389861515706e-05, + "loss": 0.9305, + "step": 5620 + }, + { + "epoch": 0.4135939296389975, + "grad_norm": 0.74609375, + "learning_rate": 3.1807817320749236e-05, + "loss": 0.6096, + "step": 5621 + }, + { + "epoch": 0.41366750977236144, + "grad_norm": 0.85546875, + "learning_rate": 3.1802244414620065e-05, + "loss": 1.0141, + "step": 5622 + }, + { + "epoch": 0.41374108990572545, + "grad_norm": 1.078125, + "learning_rate": 3.179667114342729e-05, + "loss": 0.7475, + "step": 5623 + }, + { + "epoch": 0.41381467003908945, + "grad_norm": 0.7890625, + "learning_rate": 3.179109750747e-05, + "loss": 1.0894, + "step": 5624 + }, + { + "epoch": 0.41388825017245345, + "grad_norm": 0.94921875, + "learning_rate": 3.1785523507047345e-05, + "loss": 1.0363, + "step": 5625 + }, + { + "epoch": 0.4139618303058174, + "grad_norm": 0.90625, + "learning_rate": 3.177994914245846e-05, + "loss": 0.9267, + "step": 5626 + }, + { + "epoch": 0.4140354104391814, + "grad_norm": 0.8359375, + "learning_rate": 3.1774374414002505e-05, + "loss": 0.8263, + "step": 5627 + }, + { + "epoch": 0.4141089905725454, + "grad_norm": 0.7265625, + "learning_rate": 3.1768799321978674e-05, + "loss": 0.5761, + "step": 5628 + }, + { + "epoch": 0.4141825707059094, + "grad_norm": 0.73828125, + "learning_rate": 3.176322386668617e-05, + "loss": 0.7389, + "step": 5629 + }, + { + "epoch": 0.4142561508392734, + "grad_norm": 0.8046875, + "learning_rate": 3.17576480484242e-05, + "loss": 0.6863, + "step": 5630 + }, + { + "epoch": 0.41432973097263737, + "grad_norm": 0.8046875, + "learning_rate": 3.1752071867492037e-05, + "loss": 0.8422, + "step": 5631 + }, + { + "epoch": 0.41440331110600137, + "grad_norm": 0.8828125, + "learning_rate": 3.174649532418893e-05, + "loss": 1.3692, + "step": 5632 + }, + { + "epoch": 0.4144768912393654, + "grad_norm": 1.03125, + "learning_rate": 3.1740918418814156e-05, + "loss": 1.0136, + "step": 5633 + }, + { + "epoch": 0.4145504713727294, + "grad_norm": 1.1640625, + "learning_rate": 3.173534115166702e-05, + "loss": 1.1615, + "step": 5634 + }, + { + "epoch": 0.4146240515060933, + "grad_norm": 0.80078125, + "learning_rate": 3.1729763523046864e-05, + "loss": 0.5981, + "step": 5635 + }, + { + "epoch": 0.41469763163945733, + "grad_norm": 0.96484375, + "learning_rate": 3.172418553325299e-05, + "loss": 1.1503, + "step": 5636 + }, + { + "epoch": 0.41477121177282134, + "grad_norm": 0.86328125, + "learning_rate": 3.171860718258478e-05, + "loss": 0.7839, + "step": 5637 + }, + { + "epoch": 0.41484479190618534, + "grad_norm": 0.93359375, + "learning_rate": 3.1713028471341624e-05, + "loss": 1.2209, + "step": 5638 + }, + { + "epoch": 0.41491837203954934, + "grad_norm": 0.6640625, + "learning_rate": 3.17074493998229e-05, + "loss": 0.5049, + "step": 5639 + }, + { + "epoch": 0.4149919521729133, + "grad_norm": 0.8359375, + "learning_rate": 3.1701869968328036e-05, + "loss": 0.8211, + "step": 5640 + }, + { + "epoch": 0.4150655323062773, + "grad_norm": 0.9296875, + "learning_rate": 3.169629017715647e-05, + "loss": 0.8303, + "step": 5641 + }, + { + "epoch": 0.4151391124396413, + "grad_norm": 0.78515625, + "learning_rate": 3.169071002660766e-05, + "loss": 0.6896, + "step": 5642 + }, + { + "epoch": 0.4152126925730053, + "grad_norm": 1.078125, + "learning_rate": 3.1685129516981076e-05, + "loss": 1.0469, + "step": 5643 + }, + { + "epoch": 0.41528627270636925, + "grad_norm": 0.83203125, + "learning_rate": 3.1679548648576216e-05, + "loss": 0.9086, + "step": 5644 + }, + { + "epoch": 0.41535985283973326, + "grad_norm": 0.9921875, + "learning_rate": 3.167396742169259e-05, + "loss": 1.2287, + "step": 5645 + }, + { + "epoch": 0.41543343297309726, + "grad_norm": 0.7421875, + "learning_rate": 3.166838583662975e-05, + "loss": 0.5987, + "step": 5646 + }, + { + "epoch": 0.41550701310646126, + "grad_norm": 0.91015625, + "learning_rate": 3.1662803893687244e-05, + "loss": 1.229, + "step": 5647 + }, + { + "epoch": 0.41558059323982527, + "grad_norm": 0.9296875, + "learning_rate": 3.165722159316462e-05, + "loss": 0.9456, + "step": 5648 + }, + { + "epoch": 0.4156541733731892, + "grad_norm": 0.83984375, + "learning_rate": 3.16516389353615e-05, + "loss": 0.9693, + "step": 5649 + }, + { + "epoch": 0.4157277535065532, + "grad_norm": 0.79296875, + "learning_rate": 3.164605592057747e-05, + "loss": 0.6719, + "step": 5650 + }, + { + "epoch": 0.4158013336399172, + "grad_norm": 0.765625, + "learning_rate": 3.1640472549112174e-05, + "loss": 0.6725, + "step": 5651 + }, + { + "epoch": 0.41587491377328123, + "grad_norm": 0.80859375, + "learning_rate": 3.163488882126526e-05, + "loss": 0.6966, + "step": 5652 + }, + { + "epoch": 0.4159484939066452, + "grad_norm": 1.0859375, + "learning_rate": 3.1629304737336404e-05, + "loss": 1.3396, + "step": 5653 + }, + { + "epoch": 0.4160220740400092, + "grad_norm": 0.69921875, + "learning_rate": 3.162372029762527e-05, + "loss": 0.647, + "step": 5654 + }, + { + "epoch": 0.4160956541733732, + "grad_norm": 0.80859375, + "learning_rate": 3.161813550243158e-05, + "loss": 0.958, + "step": 5655 + }, + { + "epoch": 0.4161692343067372, + "grad_norm": 0.93359375, + "learning_rate": 3.161255035205507e-05, + "loss": 1.1894, + "step": 5656 + }, + { + "epoch": 0.4162428144401012, + "grad_norm": 0.859375, + "learning_rate": 3.160696484679546e-05, + "loss": 0.9956, + "step": 5657 + }, + { + "epoch": 0.41631639457346514, + "grad_norm": 0.83984375, + "learning_rate": 3.160137898695252e-05, + "loss": 0.8505, + "step": 5658 + }, + { + "epoch": 0.41638997470682915, + "grad_norm": 1.03125, + "learning_rate": 3.159579277282605e-05, + "loss": 0.9437, + "step": 5659 + }, + { + "epoch": 0.41646355484019315, + "grad_norm": 0.9296875, + "learning_rate": 3.159020620471584e-05, + "loss": 0.9145, + "step": 5660 + }, + { + "epoch": 0.41653713497355715, + "grad_norm": 0.91015625, + "learning_rate": 3.1584619282921704e-05, + "loss": 1.3915, + "step": 5661 + }, + { + "epoch": 0.41661071510692116, + "grad_norm": 0.8515625, + "learning_rate": 3.1579032007743485e-05, + "loss": 0.8081, + "step": 5662 + }, + { + "epoch": 0.4166842952402851, + "grad_norm": 0.77734375, + "learning_rate": 3.157344437948105e-05, + "loss": 0.8382, + "step": 5663 + }, + { + "epoch": 0.4167578753736491, + "grad_norm": 0.90625, + "learning_rate": 3.156785639843427e-05, + "loss": 0.824, + "step": 5664 + }, + { + "epoch": 0.4168314555070131, + "grad_norm": 0.890625, + "learning_rate": 3.156226806490304e-05, + "loss": 1.2242, + "step": 5665 + }, + { + "epoch": 0.4169050356403771, + "grad_norm": 0.75, + "learning_rate": 3.1556679379187274e-05, + "loss": 1.0485, + "step": 5666 + }, + { + "epoch": 0.41697861577374107, + "grad_norm": 1.0234375, + "learning_rate": 3.1551090341586917e-05, + "loss": 1.1262, + "step": 5667 + }, + { + "epoch": 0.41705219590710507, + "grad_norm": 0.87890625, + "learning_rate": 3.154550095240191e-05, + "loss": 0.8812, + "step": 5668 + }, + { + "epoch": 0.4171257760404691, + "grad_norm": 0.703125, + "learning_rate": 3.153991121193224e-05, + "loss": 0.686, + "step": 5669 + }, + { + "epoch": 0.4171993561738331, + "grad_norm": 1.0546875, + "learning_rate": 3.153432112047787e-05, + "loss": 0.8766, + "step": 5670 + }, + { + "epoch": 0.4172729363071971, + "grad_norm": 0.9921875, + "learning_rate": 3.152873067833884e-05, + "loss": 0.9772, + "step": 5671 + }, + { + "epoch": 0.41734651644056103, + "grad_norm": 1.046875, + "learning_rate": 3.1523139885815154e-05, + "loss": 1.5448, + "step": 5672 + }, + { + "epoch": 0.41742009657392504, + "grad_norm": 3.03125, + "learning_rate": 3.151754874320688e-05, + "loss": 1.0638, + "step": 5673 + }, + { + "epoch": 0.41749367670728904, + "grad_norm": 1.0234375, + "learning_rate": 3.151195725081408e-05, + "loss": 1.1067, + "step": 5674 + }, + { + "epoch": 0.41756725684065304, + "grad_norm": 0.97265625, + "learning_rate": 3.150636540893682e-05, + "loss": 1.1005, + "step": 5675 + }, + { + "epoch": 0.417640836974017, + "grad_norm": 0.80859375, + "learning_rate": 3.150077321787523e-05, + "loss": 0.9598, + "step": 5676 + }, + { + "epoch": 0.417714417107381, + "grad_norm": 0.890625, + "learning_rate": 3.149518067792941e-05, + "loss": 1.0019, + "step": 5677 + }, + { + "epoch": 0.417787997240745, + "grad_norm": 0.8203125, + "learning_rate": 3.148958778939951e-05, + "loss": 0.9, + "step": 5678 + }, + { + "epoch": 0.417861577374109, + "grad_norm": 0.92578125, + "learning_rate": 3.1483994552585696e-05, + "loss": 1.0215, + "step": 5679 + }, + { + "epoch": 0.417935157507473, + "grad_norm": 0.9296875, + "learning_rate": 3.147840096778813e-05, + "loss": 0.9514, + "step": 5680 + }, + { + "epoch": 0.41800873764083696, + "grad_norm": 0.99609375, + "learning_rate": 3.147280703530704e-05, + "loss": 0.9191, + "step": 5681 + }, + { + "epoch": 0.41808231777420096, + "grad_norm": 0.7734375, + "learning_rate": 3.146721275544261e-05, + "loss": 0.6177, + "step": 5682 + }, + { + "epoch": 0.41815589790756497, + "grad_norm": 0.85546875, + "learning_rate": 3.146161812849509e-05, + "loss": 0.9278, + "step": 5683 + }, + { + "epoch": 0.41822947804092897, + "grad_norm": 0.9453125, + "learning_rate": 3.1456023154764725e-05, + "loss": 1.3063, + "step": 5684 + }, + { + "epoch": 0.4183030581742929, + "grad_norm": 0.62890625, + "learning_rate": 3.1450427834551795e-05, + "loss": 0.8765, + "step": 5685 + }, + { + "epoch": 0.4183766383076569, + "grad_norm": 1.234375, + "learning_rate": 3.144483216815658e-05, + "loss": 1.439, + "step": 5686 + }, + { + "epoch": 0.4184502184410209, + "grad_norm": 1.0078125, + "learning_rate": 3.14392361558794e-05, + "loss": 1.2136, + "step": 5687 + }, + { + "epoch": 0.41852379857438493, + "grad_norm": 0.890625, + "learning_rate": 3.1433639798020575e-05, + "loss": 0.8733, + "step": 5688 + }, + { + "epoch": 0.41859737870774893, + "grad_norm": 0.9296875, + "learning_rate": 3.142804309488045e-05, + "loss": 0.8334, + "step": 5689 + }, + { + "epoch": 0.4186709588411129, + "grad_norm": 0.73046875, + "learning_rate": 3.142244604675941e-05, + "loss": 0.7069, + "step": 5690 + }, + { + "epoch": 0.4187445389744769, + "grad_norm": 0.89453125, + "learning_rate": 3.14168486539578e-05, + "loss": 0.8814, + "step": 5691 + }, + { + "epoch": 0.4188181191078409, + "grad_norm": 0.8046875, + "learning_rate": 3.141125091677605e-05, + "loss": 0.6196, + "step": 5692 + }, + { + "epoch": 0.4188916992412049, + "grad_norm": 0.86328125, + "learning_rate": 3.140565283551458e-05, + "loss": 0.9069, + "step": 5693 + }, + { + "epoch": 0.41896527937456884, + "grad_norm": 0.7890625, + "learning_rate": 3.140005441047381e-05, + "loss": 1.2138, + "step": 5694 + }, + { + "epoch": 0.41903885950793285, + "grad_norm": 1.015625, + "learning_rate": 3.139445564195421e-05, + "loss": 1.1058, + "step": 5695 + }, + { + "epoch": 0.41911243964129685, + "grad_norm": 0.8828125, + "learning_rate": 3.138885653025626e-05, + "loss": 0.955, + "step": 5696 + }, + { + "epoch": 0.41918601977466086, + "grad_norm": 0.8046875, + "learning_rate": 3.1383257075680446e-05, + "loss": 0.722, + "step": 5697 + }, + { + "epoch": 0.41925959990802486, + "grad_norm": 0.921875, + "learning_rate": 3.1377657278527274e-05, + "loss": 1.0323, + "step": 5698 + }, + { + "epoch": 0.4193331800413888, + "grad_norm": 0.7578125, + "learning_rate": 3.1372057139097284e-05, + "loss": 0.8831, + "step": 5699 + }, + { + "epoch": 0.4194067601747528, + "grad_norm": 0.84765625, + "learning_rate": 3.136645665769102e-05, + "loss": 1.2314, + "step": 5700 + }, + { + "epoch": 0.4194803403081168, + "grad_norm": 0.890625, + "learning_rate": 3.136085583460906e-05, + "loss": 1.176, + "step": 5701 + }, + { + "epoch": 0.4195539204414808, + "grad_norm": 0.78515625, + "learning_rate": 3.135525467015197e-05, + "loss": 0.8508, + "step": 5702 + }, + { + "epoch": 0.41962750057484477, + "grad_norm": 0.91015625, + "learning_rate": 3.134965316462037e-05, + "loss": 0.9366, + "step": 5703 + }, + { + "epoch": 0.4197010807082088, + "grad_norm": 0.76953125, + "learning_rate": 3.134405131831489e-05, + "loss": 0.623, + "step": 5704 + }, + { + "epoch": 0.4197746608415728, + "grad_norm": 0.8203125, + "learning_rate": 3.133844913153614e-05, + "loss": 0.7714, + "step": 5705 + }, + { + "epoch": 0.4198482409749368, + "grad_norm": 0.84765625, + "learning_rate": 3.133284660458481e-05, + "loss": 1.3795, + "step": 5706 + }, + { + "epoch": 0.4199218211083008, + "grad_norm": 0.93359375, + "learning_rate": 3.1327243737761555e-05, + "loss": 0.888, + "step": 5707 + }, + { + "epoch": 0.41999540124166473, + "grad_norm": 0.92578125, + "learning_rate": 3.132164053136708e-05, + "loss": 0.8543, + "step": 5708 + }, + { + "epoch": 0.42006898137502874, + "grad_norm": 0.8671875, + "learning_rate": 3.13160369857021e-05, + "loss": 0.7844, + "step": 5709 + }, + { + "epoch": 0.42014256150839274, + "grad_norm": 0.953125, + "learning_rate": 3.131043310106735e-05, + "loss": 1.0513, + "step": 5710 + }, + { + "epoch": 0.42021614164175675, + "grad_norm": 0.92578125, + "learning_rate": 3.130482887776356e-05, + "loss": 1.0161, + "step": 5711 + }, + { + "epoch": 0.4202897217751207, + "grad_norm": 0.9765625, + "learning_rate": 3.129922431609152e-05, + "loss": 0.8555, + "step": 5712 + }, + { + "epoch": 0.4203633019084847, + "grad_norm": 0.80078125, + "learning_rate": 3.1293619416352005e-05, + "loss": 0.7085, + "step": 5713 + }, + { + "epoch": 0.4204368820418487, + "grad_norm": 0.875, + "learning_rate": 3.1288014178845824e-05, + "loss": 0.7576, + "step": 5714 + }, + { + "epoch": 0.4205104621752127, + "grad_norm": 0.63671875, + "learning_rate": 3.128240860387381e-05, + "loss": 0.6418, + "step": 5715 + }, + { + "epoch": 0.4205840423085767, + "grad_norm": 0.69140625, + "learning_rate": 3.127680269173678e-05, + "loss": 0.6519, + "step": 5716 + }, + { + "epoch": 0.42065762244194066, + "grad_norm": 1.1796875, + "learning_rate": 3.1271196442735606e-05, + "loss": 1.6772, + "step": 5717 + }, + { + "epoch": 0.42073120257530466, + "grad_norm": 0.78125, + "learning_rate": 3.126558985717117e-05, + "loss": 0.9215, + "step": 5718 + }, + { + "epoch": 0.42080478270866867, + "grad_norm": 0.859375, + "learning_rate": 3.125998293534435e-05, + "loss": 0.8935, + "step": 5719 + }, + { + "epoch": 0.42087836284203267, + "grad_norm": 1.1328125, + "learning_rate": 3.1254375677556076e-05, + "loss": 1.3714, + "step": 5720 + }, + { + "epoch": 0.4209519429753966, + "grad_norm": 0.76171875, + "learning_rate": 3.1248768084107265e-05, + "loss": 0.8136, + "step": 5721 + }, + { + "epoch": 0.4210255231087606, + "grad_norm": 0.953125, + "learning_rate": 3.1243160155298886e-05, + "loss": 1.1025, + "step": 5722 + }, + { + "epoch": 0.42109910324212463, + "grad_norm": 1.0859375, + "learning_rate": 3.123755189143188e-05, + "loss": 1.3903, + "step": 5723 + }, + { + "epoch": 0.42117268337548863, + "grad_norm": 0.7890625, + "learning_rate": 3.123194329280726e-05, + "loss": 1.1325, + "step": 5724 + }, + { + "epoch": 0.42124626350885264, + "grad_norm": 0.98828125, + "learning_rate": 3.1226334359726e-05, + "loss": 1.3851, + "step": 5725 + }, + { + "epoch": 0.4213198436422166, + "grad_norm": 0.89453125, + "learning_rate": 3.122072509248914e-05, + "loss": 0.7743, + "step": 5726 + }, + { + "epoch": 0.4213934237755806, + "grad_norm": 0.8828125, + "learning_rate": 3.1215115491397714e-05, + "loss": 1.2776, + "step": 5727 + }, + { + "epoch": 0.4214670039089446, + "grad_norm": 0.83203125, + "learning_rate": 3.120950555675277e-05, + "loss": 1.1721, + "step": 5728 + }, + { + "epoch": 0.4215405840423086, + "grad_norm": 0.8671875, + "learning_rate": 3.120389528885541e-05, + "loss": 1.3343, + "step": 5729 + }, + { + "epoch": 0.42161416417567255, + "grad_norm": 0.8359375, + "learning_rate": 3.119828468800669e-05, + "loss": 0.9707, + "step": 5730 + }, + { + "epoch": 0.42168774430903655, + "grad_norm": 1.109375, + "learning_rate": 3.1192673754507754e-05, + "loss": 1.2327, + "step": 5731 + }, + { + "epoch": 0.42176132444240055, + "grad_norm": 0.93359375, + "learning_rate": 3.11870624886597e-05, + "loss": 0.8943, + "step": 5732 + }, + { + "epoch": 0.42183490457576456, + "grad_norm": 0.93359375, + "learning_rate": 3.118145089076369e-05, + "loss": 1.0463, + "step": 5733 + }, + { + "epoch": 0.42190848470912856, + "grad_norm": 1.0, + "learning_rate": 3.117583896112088e-05, + "loss": 0.8832, + "step": 5734 + }, + { + "epoch": 0.4219820648424925, + "grad_norm": 1.1015625, + "learning_rate": 3.117022670003246e-05, + "loss": 1.1644, + "step": 5735 + }, + { + "epoch": 0.4220556449758565, + "grad_norm": 0.86328125, + "learning_rate": 3.116461410779964e-05, + "loss": 0.916, + "step": 5736 + }, + { + "epoch": 0.4221292251092205, + "grad_norm": 0.953125, + "learning_rate": 3.1159001184723615e-05, + "loss": 1.1364, + "step": 5737 + }, + { + "epoch": 0.4222028052425845, + "grad_norm": 1.1171875, + "learning_rate": 3.115338793110563e-05, + "loss": 1.3442, + "step": 5738 + }, + { + "epoch": 0.42227638537594847, + "grad_norm": 0.6953125, + "learning_rate": 3.114777434724694e-05, + "loss": 0.7471, + "step": 5739 + }, + { + "epoch": 0.4223499655093125, + "grad_norm": 1.0078125, + "learning_rate": 3.11421604334488e-05, + "loss": 1.0077, + "step": 5740 + }, + { + "epoch": 0.4224235456426765, + "grad_norm": 0.8359375, + "learning_rate": 3.1136546190012524e-05, + "loss": 0.7534, + "step": 5741 + }, + { + "epoch": 0.4224971257760405, + "grad_norm": 0.9453125, + "learning_rate": 3.11309316172394e-05, + "loss": 0.9952, + "step": 5742 + }, + { + "epoch": 0.4225707059094045, + "grad_norm": 1.234375, + "learning_rate": 3.1125316715430755e-05, + "loss": 1.0639, + "step": 5743 + }, + { + "epoch": 0.42264428604276844, + "grad_norm": 0.859375, + "learning_rate": 3.111970148488793e-05, + "loss": 0.8631, + "step": 5744 + }, + { + "epoch": 0.42271786617613244, + "grad_norm": 1.03125, + "learning_rate": 3.111408592591229e-05, + "loss": 1.0575, + "step": 5745 + }, + { + "epoch": 0.42279144630949644, + "grad_norm": 0.76953125, + "learning_rate": 3.11084700388052e-05, + "loss": 0.7257, + "step": 5746 + }, + { + "epoch": 0.42286502644286045, + "grad_norm": 0.73828125, + "learning_rate": 3.110285382386806e-05, + "loss": 0.6819, + "step": 5747 + }, + { + "epoch": 0.4229386065762244, + "grad_norm": 0.78125, + "learning_rate": 3.109723728140229e-05, + "loss": 0.7694, + "step": 5748 + }, + { + "epoch": 0.4230121867095884, + "grad_norm": 0.83984375, + "learning_rate": 3.10916204117093e-05, + "loss": 1.0799, + "step": 5749 + }, + { + "epoch": 0.4230857668429524, + "grad_norm": 1.0703125, + "learning_rate": 3.108600321509056e-05, + "loss": 0.9085, + "step": 5750 + }, + { + "epoch": 0.4231593469763164, + "grad_norm": 0.78515625, + "learning_rate": 3.108038569184751e-05, + "loss": 1.1044, + "step": 5751 + }, + { + "epoch": 0.4232329271096804, + "grad_norm": 0.9921875, + "learning_rate": 3.1074767842281654e-05, + "loss": 1.2157, + "step": 5752 + }, + { + "epoch": 0.42330650724304436, + "grad_norm": 0.7265625, + "learning_rate": 3.1069149666694474e-05, + "loss": 1.0037, + "step": 5753 + }, + { + "epoch": 0.42338008737640836, + "grad_norm": 0.9375, + "learning_rate": 3.10635311653875e-05, + "loss": 0.9024, + "step": 5754 + }, + { + "epoch": 0.42345366750977237, + "grad_norm": 1.03125, + "learning_rate": 3.105791233866226e-05, + "loss": 1.5471, + "step": 5755 + }, + { + "epoch": 0.4235272476431364, + "grad_norm": 0.89453125, + "learning_rate": 3.10522931868203e-05, + "loss": 1.5793, + "step": 5756 + }, + { + "epoch": 0.4236008277765003, + "grad_norm": 0.8359375, + "learning_rate": 3.1046673710163206e-05, + "loss": 0.7626, + "step": 5757 + }, + { + "epoch": 0.4236744079098643, + "grad_norm": 0.91015625, + "learning_rate": 3.104105390899255e-05, + "loss": 0.947, + "step": 5758 + }, + { + "epoch": 0.42374798804322833, + "grad_norm": 0.9140625, + "learning_rate": 3.103543378360994e-05, + "loss": 0.9181, + "step": 5759 + }, + { + "epoch": 0.42382156817659233, + "grad_norm": 0.875, + "learning_rate": 3.1029813334316994e-05, + "loss": 0.784, + "step": 5760 + }, + { + "epoch": 0.42389514830995634, + "grad_norm": 0.8984375, + "learning_rate": 3.102419256141536e-05, + "loss": 0.951, + "step": 5761 + }, + { + "epoch": 0.4239687284433203, + "grad_norm": 0.81640625, + "learning_rate": 3.101857146520669e-05, + "loss": 0.9759, + "step": 5762 + }, + { + "epoch": 0.4240423085766843, + "grad_norm": 0.84765625, + "learning_rate": 3.101295004599266e-05, + "loss": 1.1313, + "step": 5763 + }, + { + "epoch": 0.4241158887100483, + "grad_norm": 1.5703125, + "learning_rate": 3.100732830407495e-05, + "loss": 1.0304, + "step": 5764 + }, + { + "epoch": 0.4241894688434123, + "grad_norm": 0.9140625, + "learning_rate": 3.1001706239755286e-05, + "loss": 0.9478, + "step": 5765 + }, + { + "epoch": 0.42426304897677625, + "grad_norm": 0.81640625, + "learning_rate": 3.0996083853335374e-05, + "loss": 0.8076, + "step": 5766 + }, + { + "epoch": 0.42433662911014025, + "grad_norm": 0.84765625, + "learning_rate": 3.099046114511696e-05, + "loss": 0.5262, + "step": 5767 + }, + { + "epoch": 0.42441020924350426, + "grad_norm": 1.1640625, + "learning_rate": 3.098483811540183e-05, + "loss": 1.2619, + "step": 5768 + }, + { + "epoch": 0.42448378937686826, + "grad_norm": 0.828125, + "learning_rate": 3.097921476449173e-05, + "loss": 0.7225, + "step": 5769 + }, + { + "epoch": 0.42455736951023226, + "grad_norm": 1.046875, + "learning_rate": 3.097359109268847e-05, + "loss": 1.2377, + "step": 5770 + }, + { + "epoch": 0.4246309496435962, + "grad_norm": 0.953125, + "learning_rate": 3.096796710029386e-05, + "loss": 1.4803, + "step": 5771 + }, + { + "epoch": 0.4247045297769602, + "grad_norm": 0.8203125, + "learning_rate": 3.0962342787609725e-05, + "loss": 1.1074, + "step": 5772 + }, + { + "epoch": 0.4247781099103242, + "grad_norm": 0.81640625, + "learning_rate": 3.0956718154937915e-05, + "loss": 1.1368, + "step": 5773 + }, + { + "epoch": 0.4248516900436882, + "grad_norm": 1.0859375, + "learning_rate": 3.095109320258029e-05, + "loss": 1.1014, + "step": 5774 + }, + { + "epoch": 0.4249252701770522, + "grad_norm": 1.015625, + "learning_rate": 3.094546793083873e-05, + "loss": 1.9882, + "step": 5775 + }, + { + "epoch": 0.4249988503104162, + "grad_norm": 0.82421875, + "learning_rate": 3.0939842340015145e-05, + "loss": 1.282, + "step": 5776 + }, + { + "epoch": 0.4250724304437802, + "grad_norm": 0.74609375, + "learning_rate": 3.093421643041144e-05, + "loss": 0.8593, + "step": 5777 + }, + { + "epoch": 0.4251460105771442, + "grad_norm": 1.0, + "learning_rate": 3.092859020232954e-05, + "loss": 0.9462, + "step": 5778 + }, + { + "epoch": 0.4252195907105082, + "grad_norm": 0.84765625, + "learning_rate": 3.092296365607141e-05, + "loss": 0.8246, + "step": 5779 + }, + { + "epoch": 0.42529317084387214, + "grad_norm": 0.94140625, + "learning_rate": 3.091733679193899e-05, + "loss": 0.9693, + "step": 5780 + }, + { + "epoch": 0.42536675097723614, + "grad_norm": 0.79296875, + "learning_rate": 3.09117096102343e-05, + "loss": 0.7689, + "step": 5781 + }, + { + "epoch": 0.42544033111060015, + "grad_norm": 0.91015625, + "learning_rate": 3.090608211125931e-05, + "loss": 0.9306, + "step": 5782 + }, + { + "epoch": 0.42551391124396415, + "grad_norm": 0.96484375, + "learning_rate": 3.090045429531605e-05, + "loss": 1.2965, + "step": 5783 + }, + { + "epoch": 0.4255874913773281, + "grad_norm": 0.85546875, + "learning_rate": 3.089482616270656e-05, + "loss": 0.8974, + "step": 5784 + }, + { + "epoch": 0.4256610715106921, + "grad_norm": 0.98828125, + "learning_rate": 3.088919771373287e-05, + "loss": 1.1757, + "step": 5785 + }, + { + "epoch": 0.4257346516440561, + "grad_norm": 1.0546875, + "learning_rate": 3.0883568948697077e-05, + "loss": 1.1472, + "step": 5786 + }, + { + "epoch": 0.4258082317774201, + "grad_norm": 0.80859375, + "learning_rate": 3.0877939867901235e-05, + "loss": 0.823, + "step": 5787 + }, + { + "epoch": 0.4258818119107841, + "grad_norm": 0.8515625, + "learning_rate": 3.0872310471647474e-05, + "loss": 1.0921, + "step": 5788 + }, + { + "epoch": 0.42595539204414806, + "grad_norm": 0.80859375, + "learning_rate": 3.08666807602379e-05, + "loss": 0.9897, + "step": 5789 + }, + { + "epoch": 0.42602897217751207, + "grad_norm": 0.76171875, + "learning_rate": 3.086105073397465e-05, + "loss": 0.7113, + "step": 5790 + }, + { + "epoch": 0.42610255231087607, + "grad_norm": 0.68359375, + "learning_rate": 3.0855420393159874e-05, + "loss": 0.6338, + "step": 5791 + }, + { + "epoch": 0.4261761324442401, + "grad_norm": 0.828125, + "learning_rate": 3.084978973809574e-05, + "loss": 1.2566, + "step": 5792 + }, + { + "epoch": 0.426249712577604, + "grad_norm": 0.81640625, + "learning_rate": 3.084415876908444e-05, + "loss": 0.7628, + "step": 5793 + }, + { + "epoch": 0.426323292710968, + "grad_norm": 0.86328125, + "learning_rate": 3.083852748642818e-05, + "loss": 0.7678, + "step": 5794 + }, + { + "epoch": 0.42639687284433203, + "grad_norm": 0.81640625, + "learning_rate": 3.083289589042918e-05, + "loss": 0.9107, + "step": 5795 + }, + { + "epoch": 0.42647045297769604, + "grad_norm": 0.95703125, + "learning_rate": 3.082726398138968e-05, + "loss": 0.8132, + "step": 5796 + }, + { + "epoch": 0.42654403311106004, + "grad_norm": 0.671875, + "learning_rate": 3.0821631759611925e-05, + "loss": 0.616, + "step": 5797 + }, + { + "epoch": 0.426617613244424, + "grad_norm": 0.80078125, + "learning_rate": 3.0815999225398186e-05, + "loss": 1.4227, + "step": 5798 + }, + { + "epoch": 0.426691193377788, + "grad_norm": 1.0546875, + "learning_rate": 3.081036637905075e-05, + "loss": 1.1151, + "step": 5799 + }, + { + "epoch": 0.426764773511152, + "grad_norm": 0.79296875, + "learning_rate": 3.080473322087193e-05, + "loss": 1.221, + "step": 5800 + }, + { + "epoch": 0.426838353644516, + "grad_norm": 0.75, + "learning_rate": 3.0799099751164036e-05, + "loss": 0.8428, + "step": 5801 + }, + { + "epoch": 0.42691193377787995, + "grad_norm": 0.77734375, + "learning_rate": 3.079346597022942e-05, + "loss": 0.8761, + "step": 5802 + }, + { + "epoch": 0.42698551391124395, + "grad_norm": 0.88671875, + "learning_rate": 3.078783187837042e-05, + "loss": 1.2068, + "step": 5803 + }, + { + "epoch": 0.42705909404460796, + "grad_norm": 0.84375, + "learning_rate": 3.078219747588942e-05, + "loss": 0.7164, + "step": 5804 + }, + { + "epoch": 0.42713267417797196, + "grad_norm": 0.79296875, + "learning_rate": 3.077656276308879e-05, + "loss": 0.8161, + "step": 5805 + }, + { + "epoch": 0.42720625431133596, + "grad_norm": 0.9140625, + "learning_rate": 3.077092774027096e-05, + "loss": 1.4555, + "step": 5806 + }, + { + "epoch": 0.4272798344446999, + "grad_norm": 1.03125, + "learning_rate": 3.076529240773833e-05, + "loss": 0.9777, + "step": 5807 + }, + { + "epoch": 0.4273534145780639, + "grad_norm": 0.87109375, + "learning_rate": 3.0759656765793356e-05, + "loss": 1.1901, + "step": 5808 + }, + { + "epoch": 0.4274269947114279, + "grad_norm": 0.9765625, + "learning_rate": 3.075402081473848e-05, + "loss": 1.0205, + "step": 5809 + }, + { + "epoch": 0.4275005748447919, + "grad_norm": 0.82421875, + "learning_rate": 3.074838455487616e-05, + "loss": 1.1187, + "step": 5810 + }, + { + "epoch": 0.4275741549781559, + "grad_norm": 0.7890625, + "learning_rate": 3.074274798650892e-05, + "loss": 0.909, + "step": 5811 + }, + { + "epoch": 0.4276477351115199, + "grad_norm": 0.80859375, + "learning_rate": 3.073711110993923e-05, + "loss": 0.9265, + "step": 5812 + }, + { + "epoch": 0.4277213152448839, + "grad_norm": 0.7265625, + "learning_rate": 3.0731473925469616e-05, + "loss": 0.868, + "step": 5813 + }, + { + "epoch": 0.4277948953782479, + "grad_norm": 0.96875, + "learning_rate": 3.072583643340263e-05, + "loss": 1.1088, + "step": 5814 + }, + { + "epoch": 0.4278684755116119, + "grad_norm": 1.1484375, + "learning_rate": 3.072019863404082e-05, + "loss": 1.3737, + "step": 5815 + }, + { + "epoch": 0.42794205564497584, + "grad_norm": 0.984375, + "learning_rate": 3.071456052768675e-05, + "loss": 1.4669, + "step": 5816 + }, + { + "epoch": 0.42801563577833984, + "grad_norm": 1.203125, + "learning_rate": 3.0708922114643e-05, + "loss": 1.3207, + "step": 5817 + }, + { + "epoch": 0.42808921591170385, + "grad_norm": 1.3046875, + "learning_rate": 3.07032833952122e-05, + "loss": 1.0223, + "step": 5818 + }, + { + "epoch": 0.42816279604506785, + "grad_norm": 0.8203125, + "learning_rate": 3.0697644369696945e-05, + "loss": 0.7622, + "step": 5819 + }, + { + "epoch": 0.4282363761784318, + "grad_norm": 0.8828125, + "learning_rate": 3.069200503839988e-05, + "loss": 0.7641, + "step": 5820 + }, + { + "epoch": 0.4283099563117958, + "grad_norm": 0.80078125, + "learning_rate": 3.068636540162364e-05, + "loss": 0.8504, + "step": 5821 + }, + { + "epoch": 0.4283835364451598, + "grad_norm": 1.078125, + "learning_rate": 3.068072545967093e-05, + "loss": 1.165, + "step": 5822 + }, + { + "epoch": 0.4284571165785238, + "grad_norm": 0.98828125, + "learning_rate": 3.0675085212844403e-05, + "loss": 1.3198, + "step": 5823 + }, + { + "epoch": 0.4285306967118878, + "grad_norm": 1.015625, + "learning_rate": 3.066944466144677e-05, + "loss": 0.7301, + "step": 5824 + }, + { + "epoch": 0.42860427684525176, + "grad_norm": 0.81640625, + "learning_rate": 3.066380380578075e-05, + "loss": 0.8298, + "step": 5825 + }, + { + "epoch": 0.42867785697861577, + "grad_norm": 0.890625, + "learning_rate": 3.065816264614909e-05, + "loss": 0.9894, + "step": 5826 + }, + { + "epoch": 0.42875143711197977, + "grad_norm": 0.890625, + "learning_rate": 3.065252118285451e-05, + "loss": 0.8816, + "step": 5827 + }, + { + "epoch": 0.4288250172453438, + "grad_norm": 0.80859375, + "learning_rate": 3.0646879416199805e-05, + "loss": 1.0246, + "step": 5828 + }, + { + "epoch": 0.4288985973787077, + "grad_norm": 0.8359375, + "learning_rate": 3.0641237346487747e-05, + "loss": 0.8419, + "step": 5829 + }, + { + "epoch": 0.42897217751207173, + "grad_norm": 0.9375, + "learning_rate": 3.063559497402113e-05, + "loss": 0.8907, + "step": 5830 + }, + { + "epoch": 0.42904575764543573, + "grad_norm": 1.0703125, + "learning_rate": 3.062995229910278e-05, + "loss": 0.8183, + "step": 5831 + }, + { + "epoch": 0.42911933777879974, + "grad_norm": 0.796875, + "learning_rate": 3.062430932203552e-05, + "loss": 0.9873, + "step": 5832 + }, + { + "epoch": 0.42919291791216374, + "grad_norm": 0.89453125, + "learning_rate": 3.06186660431222e-05, + "loss": 1.1113, + "step": 5833 + }, + { + "epoch": 0.4292664980455277, + "grad_norm": 0.90625, + "learning_rate": 3.061302246266569e-05, + "loss": 1.0394, + "step": 5834 + }, + { + "epoch": 0.4293400781788917, + "grad_norm": 0.76171875, + "learning_rate": 3.060737858096886e-05, + "loss": 0.9237, + "step": 5835 + }, + { + "epoch": 0.4294136583122557, + "grad_norm": 0.859375, + "learning_rate": 3.0601734398334626e-05, + "loss": 0.8831, + "step": 5836 + }, + { + "epoch": 0.4294872384456197, + "grad_norm": 0.6953125, + "learning_rate": 3.0596089915065875e-05, + "loss": 0.5404, + "step": 5837 + }, + { + "epoch": 0.42956081857898365, + "grad_norm": 0.75, + "learning_rate": 3.059044513146555e-05, + "loss": 0.7114, + "step": 5838 + }, + { + "epoch": 0.42963439871234765, + "grad_norm": 0.7421875, + "learning_rate": 3.058480004783659e-05, + "loss": 0.7721, + "step": 5839 + }, + { + "epoch": 0.42970797884571166, + "grad_norm": 0.81640625, + "learning_rate": 3.0579154664481966e-05, + "loss": 0.8697, + "step": 5840 + }, + { + "epoch": 0.42978155897907566, + "grad_norm": 0.8125, + "learning_rate": 3.057350898170464e-05, + "loss": 0.8042, + "step": 5841 + }, + { + "epoch": 0.42985513911243967, + "grad_norm": 0.82421875, + "learning_rate": 3.056786299980763e-05, + "loss": 1.1312, + "step": 5842 + }, + { + "epoch": 0.4299287192458036, + "grad_norm": 1.0390625, + "learning_rate": 3.0562216719093913e-05, + "loss": 1.0338, + "step": 5843 + }, + { + "epoch": 0.4300022993791676, + "grad_norm": 0.77734375, + "learning_rate": 3.055657013986654e-05, + "loss": 1.1038, + "step": 5844 + }, + { + "epoch": 0.4300758795125316, + "grad_norm": 0.92578125, + "learning_rate": 3.055092326242854e-05, + "loss": 0.9665, + "step": 5845 + }, + { + "epoch": 0.4301494596458956, + "grad_norm": 1.7578125, + "learning_rate": 3.054527608708298e-05, + "loss": 1.4164, + "step": 5846 + }, + { + "epoch": 0.4302230397792596, + "grad_norm": 1.09375, + "learning_rate": 3.053962861413291e-05, + "loss": 1.3889, + "step": 5847 + }, + { + "epoch": 0.4302966199126236, + "grad_norm": 0.87890625, + "learning_rate": 3.053398084388144e-05, + "loss": 1.0581, + "step": 5848 + }, + { + "epoch": 0.4303702000459876, + "grad_norm": 0.82421875, + "learning_rate": 3.052833277663167e-05, + "loss": 0.7859, + "step": 5849 + }, + { + "epoch": 0.4304437801793516, + "grad_norm": 0.8125, + "learning_rate": 3.052268441268673e-05, + "loss": 0.7511, + "step": 5850 + }, + { + "epoch": 0.4305173603127156, + "grad_norm": 1.1796875, + "learning_rate": 3.0517035752349743e-05, + "loss": 1.367, + "step": 5851 + }, + { + "epoch": 0.43059094044607954, + "grad_norm": 1.09375, + "learning_rate": 3.051138679592387e-05, + "loss": 0.9445, + "step": 5852 + }, + { + "epoch": 0.43066452057944354, + "grad_norm": 1.0859375, + "learning_rate": 3.0505737543712275e-05, + "loss": 1.543, + "step": 5853 + }, + { + "epoch": 0.43073810071280755, + "grad_norm": 0.6796875, + "learning_rate": 3.050008799601814e-05, + "loss": 0.5615, + "step": 5854 + }, + { + "epoch": 0.43081168084617155, + "grad_norm": 1.0, + "learning_rate": 3.0494438153144676e-05, + "loss": 0.6474, + "step": 5855 + }, + { + "epoch": 0.4308852609795355, + "grad_norm": 1.03125, + "learning_rate": 3.0488788015395087e-05, + "loss": 0.8403, + "step": 5856 + }, + { + "epoch": 0.4309588411128995, + "grad_norm": 0.828125, + "learning_rate": 3.0483137583072623e-05, + "loss": 0.8642, + "step": 5857 + }, + { + "epoch": 0.4310324212462635, + "grad_norm": 0.91015625, + "learning_rate": 3.0477486856480515e-05, + "loss": 0.9637, + "step": 5858 + }, + { + "epoch": 0.4311060013796275, + "grad_norm": 0.734375, + "learning_rate": 3.0471835835922034e-05, + "loss": 0.6809, + "step": 5859 + }, + { + "epoch": 0.4311795815129915, + "grad_norm": 0.9296875, + "learning_rate": 3.046618452170046e-05, + "loss": 0.9921, + "step": 5860 + }, + { + "epoch": 0.43125316164635547, + "grad_norm": 0.9453125, + "learning_rate": 3.0460532914119083e-05, + "loss": 1.0586, + "step": 5861 + }, + { + "epoch": 0.43132674177971947, + "grad_norm": 1.078125, + "learning_rate": 3.0454881013481217e-05, + "loss": 1.4062, + "step": 5862 + }, + { + "epoch": 0.4314003219130835, + "grad_norm": 1.015625, + "learning_rate": 3.0449228820090198e-05, + "loss": 1.2889, + "step": 5863 + }, + { + "epoch": 0.4314739020464475, + "grad_norm": 0.73828125, + "learning_rate": 3.0443576334249357e-05, + "loss": 1.1536, + "step": 5864 + }, + { + "epoch": 0.4315474821798114, + "grad_norm": 0.6484375, + "learning_rate": 3.043792355626205e-05, + "loss": 0.5844, + "step": 5865 + }, + { + "epoch": 0.43162106231317543, + "grad_norm": 0.98828125, + "learning_rate": 3.0432270486431663e-05, + "loss": 0.7336, + "step": 5866 + }, + { + "epoch": 0.43169464244653943, + "grad_norm": 1.2421875, + "learning_rate": 3.042661712506158e-05, + "loss": 1.3385, + "step": 5867 + }, + { + "epoch": 0.43176822257990344, + "grad_norm": 0.90625, + "learning_rate": 3.04209634724552e-05, + "loss": 0.7551, + "step": 5868 + }, + { + "epoch": 0.43184180271326744, + "grad_norm": 0.8828125, + "learning_rate": 3.0415309528915946e-05, + "loss": 0.8322, + "step": 5869 + }, + { + "epoch": 0.4319153828466314, + "grad_norm": 0.875, + "learning_rate": 3.040965529474727e-05, + "loss": 0.7568, + "step": 5870 + }, + { + "epoch": 0.4319889629799954, + "grad_norm": 0.93359375, + "learning_rate": 3.04040007702526e-05, + "loss": 0.9458, + "step": 5871 + }, + { + "epoch": 0.4320625431133594, + "grad_norm": 0.84375, + "learning_rate": 3.039834595573542e-05, + "loss": 0.9895, + "step": 5872 + }, + { + "epoch": 0.4321361232467234, + "grad_norm": 0.953125, + "learning_rate": 3.039269085149921e-05, + "loss": 1.1591, + "step": 5873 + }, + { + "epoch": 0.43220970338008735, + "grad_norm": 0.73828125, + "learning_rate": 3.038703545784747e-05, + "loss": 0.8226, + "step": 5874 + }, + { + "epoch": 0.43228328351345136, + "grad_norm": 0.765625, + "learning_rate": 3.03813797750837e-05, + "loss": 0.9298, + "step": 5875 + }, + { + "epoch": 0.43235686364681536, + "grad_norm": 0.78125, + "learning_rate": 3.0375723803511447e-05, + "loss": 0.5958, + "step": 5876 + }, + { + "epoch": 0.43243044378017936, + "grad_norm": 0.765625, + "learning_rate": 3.0370067543434254e-05, + "loss": 0.8827, + "step": 5877 + }, + { + "epoch": 0.43250402391354337, + "grad_norm": 1.03125, + "learning_rate": 3.036441099515568e-05, + "loss": 1.133, + "step": 5878 + }, + { + "epoch": 0.4325776040469073, + "grad_norm": 0.859375, + "learning_rate": 3.0358754158979298e-05, + "loss": 0.796, + "step": 5879 + }, + { + "epoch": 0.4326511841802713, + "grad_norm": 0.8359375, + "learning_rate": 3.03530970352087e-05, + "loss": 0.7298, + "step": 5880 + }, + { + "epoch": 0.4327247643136353, + "grad_norm": 0.76171875, + "learning_rate": 3.0347439624147493e-05, + "loss": 0.6449, + "step": 5881 + }, + { + "epoch": 0.43279834444699933, + "grad_norm": 0.8125, + "learning_rate": 3.0341781926099305e-05, + "loss": 0.9, + "step": 5882 + }, + { + "epoch": 0.4328719245803633, + "grad_norm": 0.8828125, + "learning_rate": 3.0336123941367768e-05, + "loss": 1.0585, + "step": 5883 + }, + { + "epoch": 0.4329455047137273, + "grad_norm": 0.78125, + "learning_rate": 3.0330465670256542e-05, + "loss": 0.7246, + "step": 5884 + }, + { + "epoch": 0.4330190848470913, + "grad_norm": 0.87109375, + "learning_rate": 3.032480711306928e-05, + "loss": 1.0527, + "step": 5885 + }, + { + "epoch": 0.4330926649804553, + "grad_norm": 1.015625, + "learning_rate": 3.0319148270109693e-05, + "loss": 1.1721, + "step": 5886 + }, + { + "epoch": 0.4331662451138193, + "grad_norm": 1.1953125, + "learning_rate": 3.0313489141681456e-05, + "loss": 1.5106, + "step": 5887 + }, + { + "epoch": 0.43323982524718324, + "grad_norm": 0.734375, + "learning_rate": 3.030782972808829e-05, + "loss": 0.7848, + "step": 5888 + }, + { + "epoch": 0.43331340538054725, + "grad_norm": 0.90234375, + "learning_rate": 3.030217002963393e-05, + "loss": 1.0025, + "step": 5889 + }, + { + "epoch": 0.43338698551391125, + "grad_norm": 0.6875, + "learning_rate": 3.0296510046622118e-05, + "loss": 0.6883, + "step": 5890 + }, + { + "epoch": 0.43346056564727525, + "grad_norm": 0.8984375, + "learning_rate": 3.0290849779356623e-05, + "loss": 1.1494, + "step": 5891 + }, + { + "epoch": 0.4335341457806392, + "grad_norm": 0.73828125, + "learning_rate": 3.0285189228141202e-05, + "loss": 0.8172, + "step": 5892 + }, + { + "epoch": 0.4336077259140032, + "grad_norm": 0.734375, + "learning_rate": 3.0279528393279664e-05, + "loss": 0.7093, + "step": 5893 + }, + { + "epoch": 0.4336813060473672, + "grad_norm": 0.6171875, + "learning_rate": 3.0273867275075808e-05, + "loss": 0.5568, + "step": 5894 + }, + { + "epoch": 0.4337548861807312, + "grad_norm": 0.984375, + "learning_rate": 3.0268205873833456e-05, + "loss": 0.8622, + "step": 5895 + }, + { + "epoch": 0.4338284663140952, + "grad_norm": 0.8515625, + "learning_rate": 3.0262544189856446e-05, + "loss": 1.0844, + "step": 5896 + }, + { + "epoch": 0.43390204644745917, + "grad_norm": 1.046875, + "learning_rate": 3.0256882223448625e-05, + "loss": 1.2497, + "step": 5897 + }, + { + "epoch": 0.43397562658082317, + "grad_norm": 0.796875, + "learning_rate": 3.025121997491387e-05, + "loss": 0.7019, + "step": 5898 + }, + { + "epoch": 0.4340492067141872, + "grad_norm": 0.85546875, + "learning_rate": 3.024555744455605e-05, + "loss": 1.0185, + "step": 5899 + }, + { + "epoch": 0.4341227868475512, + "grad_norm": 0.7890625, + "learning_rate": 3.0239894632679078e-05, + "loss": 0.9628, + "step": 5900 + }, + { + "epoch": 0.4341963669809151, + "grad_norm": 0.80859375, + "learning_rate": 3.023423153958685e-05, + "loss": 0.612, + "step": 5901 + }, + { + "epoch": 0.43426994711427913, + "grad_norm": 0.81640625, + "learning_rate": 3.0228568165583303e-05, + "loss": 0.999, + "step": 5902 + }, + { + "epoch": 0.43434352724764314, + "grad_norm": 1.0078125, + "learning_rate": 3.0222904510972377e-05, + "loss": 1.065, + "step": 5903 + }, + { + "epoch": 0.43441710738100714, + "grad_norm": 0.796875, + "learning_rate": 3.0217240576058033e-05, + "loss": 0.9642, + "step": 5904 + }, + { + "epoch": 0.43449068751437114, + "grad_norm": 1.0078125, + "learning_rate": 3.0211576361144245e-05, + "loss": 1.1554, + "step": 5905 + }, + { + "epoch": 0.4345642676477351, + "grad_norm": 0.9296875, + "learning_rate": 3.020591186653499e-05, + "loss": 1.2682, + "step": 5906 + }, + { + "epoch": 0.4346378477810991, + "grad_norm": 0.921875, + "learning_rate": 3.0200247092534285e-05, + "loss": 0.7066, + "step": 5907 + }, + { + "epoch": 0.4347114279144631, + "grad_norm": 0.671875, + "learning_rate": 3.0194582039446136e-05, + "loss": 0.7009, + "step": 5908 + }, + { + "epoch": 0.4347850080478271, + "grad_norm": 0.84765625, + "learning_rate": 3.018891670757458e-05, + "loss": 0.9667, + "step": 5909 + }, + { + "epoch": 0.43485858818119105, + "grad_norm": 0.7890625, + "learning_rate": 3.0183251097223664e-05, + "loss": 0.8592, + "step": 5910 + }, + { + "epoch": 0.43493216831455506, + "grad_norm": 0.828125, + "learning_rate": 3.0177585208697456e-05, + "loss": 0.9382, + "step": 5911 + }, + { + "epoch": 0.43500574844791906, + "grad_norm": 0.796875, + "learning_rate": 3.0171919042300023e-05, + "loss": 1.0079, + "step": 5912 + }, + { + "epoch": 0.43507932858128306, + "grad_norm": 0.7265625, + "learning_rate": 3.0166252598335466e-05, + "loss": 0.8211, + "step": 5913 + }, + { + "epoch": 0.43515290871464707, + "grad_norm": 0.9296875, + "learning_rate": 3.016058587710789e-05, + "loss": 1.1583, + "step": 5914 + }, + { + "epoch": 0.435226488848011, + "grad_norm": 0.984375, + "learning_rate": 3.015491887892141e-05, + "loss": 1.0118, + "step": 5915 + }, + { + "epoch": 0.435300068981375, + "grad_norm": 0.7890625, + "learning_rate": 3.014925160408018e-05, + "loss": 1.1678, + "step": 5916 + }, + { + "epoch": 0.435373649114739, + "grad_norm": 0.98046875, + "learning_rate": 3.0143584052888334e-05, + "loss": 0.9329, + "step": 5917 + }, + { + "epoch": 0.43544722924810303, + "grad_norm": 0.73046875, + "learning_rate": 3.0137916225650054e-05, + "loss": 0.5739, + "step": 5918 + }, + { + "epoch": 0.435520809381467, + "grad_norm": 0.79296875, + "learning_rate": 3.0132248122669514e-05, + "loss": 0.8449, + "step": 5919 + }, + { + "epoch": 0.435594389514831, + "grad_norm": 1.0, + "learning_rate": 3.012657974425091e-05, + "loss": 0.9105, + "step": 5920 + }, + { + "epoch": 0.435667969648195, + "grad_norm": 0.875, + "learning_rate": 3.012091109069845e-05, + "loss": 0.821, + "step": 5921 + }, + { + "epoch": 0.435741549781559, + "grad_norm": 1.2890625, + "learning_rate": 3.0115242162316365e-05, + "loss": 1.7447, + "step": 5922 + }, + { + "epoch": 0.435815129914923, + "grad_norm": 0.92578125, + "learning_rate": 3.0109572959408894e-05, + "loss": 1.1237, + "step": 5923 + }, + { + "epoch": 0.43588871004828694, + "grad_norm": 0.8203125, + "learning_rate": 3.0103903482280292e-05, + "loss": 1.156, + "step": 5924 + }, + { + "epoch": 0.43596229018165095, + "grad_norm": 0.875, + "learning_rate": 3.009823373123484e-05, + "loss": 0.682, + "step": 5925 + }, + { + "epoch": 0.43603587031501495, + "grad_norm": 0.85546875, + "learning_rate": 3.0092563706576798e-05, + "loss": 0.8481, + "step": 5926 + }, + { + "epoch": 0.43610945044837895, + "grad_norm": 0.828125, + "learning_rate": 3.0086893408610495e-05, + "loss": 0.7081, + "step": 5927 + }, + { + "epoch": 0.4361830305817429, + "grad_norm": 0.890625, + "learning_rate": 3.0081222837640215e-05, + "loss": 1.0804, + "step": 5928 + }, + { + "epoch": 0.4362566107151069, + "grad_norm": 0.890625, + "learning_rate": 3.0075551993970314e-05, + "loss": 0.9245, + "step": 5929 + }, + { + "epoch": 0.4363301908484709, + "grad_norm": 0.75, + "learning_rate": 3.0069880877905116e-05, + "loss": 0.5928, + "step": 5930 + }, + { + "epoch": 0.4364037709818349, + "grad_norm": 1.0859375, + "learning_rate": 3.006420948974899e-05, + "loss": 0.9989, + "step": 5931 + }, + { + "epoch": 0.4364773511151989, + "grad_norm": 1.1640625, + "learning_rate": 3.005853782980631e-05, + "loss": 0.9612, + "step": 5932 + }, + { + "epoch": 0.43655093124856287, + "grad_norm": 0.87890625, + "learning_rate": 3.0052865898381456e-05, + "loss": 1.058, + "step": 5933 + }, + { + "epoch": 0.43662451138192687, + "grad_norm": 0.9296875, + "learning_rate": 3.0047193695778836e-05, + "loss": 1.1295, + "step": 5934 + }, + { + "epoch": 0.4366980915152909, + "grad_norm": 0.86328125, + "learning_rate": 3.0041521222302853e-05, + "loss": 0.966, + "step": 5935 + }, + { + "epoch": 0.4367716716486549, + "grad_norm": 0.83203125, + "learning_rate": 3.0035848478257956e-05, + "loss": 0.882, + "step": 5936 + }, + { + "epoch": 0.43684525178201883, + "grad_norm": 1.0390625, + "learning_rate": 3.003017546394858e-05, + "loss": 0.7737, + "step": 5937 + }, + { + "epoch": 0.43691883191538283, + "grad_norm": 1.0390625, + "learning_rate": 3.0024502179679187e-05, + "loss": 1.3886, + "step": 5938 + }, + { + "epoch": 0.43699241204874684, + "grad_norm": 0.93359375, + "learning_rate": 3.0018828625754258e-05, + "loss": 1.2207, + "step": 5939 + }, + { + "epoch": 0.43706599218211084, + "grad_norm": 0.875, + "learning_rate": 3.0013154802478266e-05, + "loss": 0.8199, + "step": 5940 + }, + { + "epoch": 0.43713957231547484, + "grad_norm": 0.78515625, + "learning_rate": 3.0007480710155732e-05, + "loss": 0.7809, + "step": 5941 + }, + { + "epoch": 0.4372131524488388, + "grad_norm": 0.84765625, + "learning_rate": 3.0001806349091166e-05, + "loss": 0.9857, + "step": 5942 + }, + { + "epoch": 0.4372867325822028, + "grad_norm": 1.2109375, + "learning_rate": 2.9996131719589092e-05, + "loss": 1.1154, + "step": 5943 + }, + { + "epoch": 0.4373603127155668, + "grad_norm": 1.1953125, + "learning_rate": 2.999045682195407e-05, + "loss": 1.4869, + "step": 5944 + }, + { + "epoch": 0.4374338928489308, + "grad_norm": 0.8125, + "learning_rate": 2.9984781656490656e-05, + "loss": 0.6608, + "step": 5945 + }, + { + "epoch": 0.43750747298229475, + "grad_norm": 0.76953125, + "learning_rate": 2.9979106223503435e-05, + "loss": 0.9823, + "step": 5946 + }, + { + "epoch": 0.43758105311565876, + "grad_norm": 1.25, + "learning_rate": 2.9973430523296974e-05, + "loss": 1.1349, + "step": 5947 + }, + { + "epoch": 0.43765463324902276, + "grad_norm": 1.03125, + "learning_rate": 2.9967754556175896e-05, + "loss": 1.3051, + "step": 5948 + }, + { + "epoch": 0.43772821338238677, + "grad_norm": 1.0, + "learning_rate": 2.9962078322444815e-05, + "loss": 1.2846, + "step": 5949 + }, + { + "epoch": 0.43780179351575077, + "grad_norm": 0.80078125, + "learning_rate": 2.9956401822408357e-05, + "loss": 0.8564, + "step": 5950 + }, + { + "epoch": 0.4378753736491147, + "grad_norm": 1.015625, + "learning_rate": 2.9950725056371177e-05, + "loss": 1.1169, + "step": 5951 + }, + { + "epoch": 0.4379489537824787, + "grad_norm": 0.9609375, + "learning_rate": 2.9945048024637935e-05, + "loss": 0.9506, + "step": 5952 + }, + { + "epoch": 0.4380225339158427, + "grad_norm": 0.74609375, + "learning_rate": 2.9939370727513306e-05, + "loss": 0.671, + "step": 5953 + }, + { + "epoch": 0.43809611404920673, + "grad_norm": 0.9140625, + "learning_rate": 2.9933693165301975e-05, + "loss": 1.1273, + "step": 5954 + }, + { + "epoch": 0.4381696941825707, + "grad_norm": 0.87109375, + "learning_rate": 2.9928015338308655e-05, + "loss": 0.9897, + "step": 5955 + }, + { + "epoch": 0.4382432743159347, + "grad_norm": 0.75390625, + "learning_rate": 2.992233724683805e-05, + "loss": 0.7975, + "step": 5956 + }, + { + "epoch": 0.4383168544492987, + "grad_norm": 0.9140625, + "learning_rate": 2.9916658891194917e-05, + "loss": 0.6962, + "step": 5957 + }, + { + "epoch": 0.4383904345826627, + "grad_norm": 0.8671875, + "learning_rate": 2.9910980271683975e-05, + "loss": 0.7968, + "step": 5958 + }, + { + "epoch": 0.4384640147160267, + "grad_norm": 0.8203125, + "learning_rate": 2.9905301388610008e-05, + "loss": 1.1579, + "step": 5959 + }, + { + "epoch": 0.43853759484939064, + "grad_norm": 0.80859375, + "learning_rate": 2.9899622242277773e-05, + "loss": 0.7132, + "step": 5960 + }, + { + "epoch": 0.43861117498275465, + "grad_norm": 0.78125, + "learning_rate": 2.989394283299207e-05, + "loss": 0.9599, + "step": 5961 + }, + { + "epoch": 0.43868475511611865, + "grad_norm": 0.8203125, + "learning_rate": 2.9888263161057696e-05, + "loss": 0.7488, + "step": 5962 + }, + { + "epoch": 0.43875833524948266, + "grad_norm": 0.66796875, + "learning_rate": 2.9882583226779466e-05, + "loss": 0.7828, + "step": 5963 + }, + { + "epoch": 0.4388319153828466, + "grad_norm": 0.86328125, + "learning_rate": 2.9876903030462222e-05, + "loss": 0.9261, + "step": 5964 + }, + { + "epoch": 0.4389054955162106, + "grad_norm": 0.765625, + "learning_rate": 2.9871222572410805e-05, + "loss": 0.8842, + "step": 5965 + }, + { + "epoch": 0.4389790756495746, + "grad_norm": 1.1015625, + "learning_rate": 2.9865541852930075e-05, + "loss": 1.0741, + "step": 5966 + }, + { + "epoch": 0.4390526557829386, + "grad_norm": 1.015625, + "learning_rate": 2.9859860872324896e-05, + "loss": 0.8826, + "step": 5967 + }, + { + "epoch": 0.4391262359163026, + "grad_norm": 0.734375, + "learning_rate": 2.985417963090017e-05, + "loss": 0.7303, + "step": 5968 + }, + { + "epoch": 0.43919981604966657, + "grad_norm": 0.83984375, + "learning_rate": 2.9848498128960785e-05, + "loss": 0.7794, + "step": 5969 + }, + { + "epoch": 0.4392733961830306, + "grad_norm": 1.2109375, + "learning_rate": 2.984281636681166e-05, + "loss": 1.0307, + "step": 5970 + }, + { + "epoch": 0.4393469763163946, + "grad_norm": 1.140625, + "learning_rate": 2.983713434475774e-05, + "loss": 1.1013, + "step": 5971 + }, + { + "epoch": 0.4394205564497586, + "grad_norm": 0.90625, + "learning_rate": 2.9831452063103944e-05, + "loss": 1.058, + "step": 5972 + }, + { + "epoch": 0.43949413658312253, + "grad_norm": 0.80859375, + "learning_rate": 2.982576952215525e-05, + "loss": 0.942, + "step": 5973 + }, + { + "epoch": 0.43956771671648653, + "grad_norm": 0.9296875, + "learning_rate": 2.9820086722216616e-05, + "loss": 0.873, + "step": 5974 + }, + { + "epoch": 0.43964129684985054, + "grad_norm": 0.83203125, + "learning_rate": 2.9814403663593036e-05, + "loss": 0.8835, + "step": 5975 + }, + { + "epoch": 0.43971487698321454, + "grad_norm": 0.74609375, + "learning_rate": 2.9808720346589493e-05, + "loss": 0.5536, + "step": 5976 + }, + { + "epoch": 0.43978845711657855, + "grad_norm": 0.90625, + "learning_rate": 2.9803036771511016e-05, + "loss": 0.9971, + "step": 5977 + }, + { + "epoch": 0.4398620372499425, + "grad_norm": 1.359375, + "learning_rate": 2.9797352938662633e-05, + "loss": 0.7169, + "step": 5978 + }, + { + "epoch": 0.4399356173833065, + "grad_norm": 0.72265625, + "learning_rate": 2.9791668848349376e-05, + "loss": 0.7012, + "step": 5979 + }, + { + "epoch": 0.4400091975166705, + "grad_norm": 0.7578125, + "learning_rate": 2.9785984500876303e-05, + "loss": 0.7088, + "step": 5980 + }, + { + "epoch": 0.4400827776500345, + "grad_norm": 0.87890625, + "learning_rate": 2.978029989654848e-05, + "loss": 0.962, + "step": 5981 + }, + { + "epoch": 0.44015635778339846, + "grad_norm": 1.0234375, + "learning_rate": 2.9774615035670996e-05, + "loss": 1.1626, + "step": 5982 + }, + { + "epoch": 0.44022993791676246, + "grad_norm": 0.7421875, + "learning_rate": 2.9768929918548926e-05, + "loss": 0.7757, + "step": 5983 + }, + { + "epoch": 0.44030351805012646, + "grad_norm": 0.6875, + "learning_rate": 2.9763244545487406e-05, + "loss": 0.5121, + "step": 5984 + }, + { + "epoch": 0.44037709818349047, + "grad_norm": 1.078125, + "learning_rate": 2.9757558916791545e-05, + "loss": 0.9457, + "step": 5985 + }, + { + "epoch": 0.44045067831685447, + "grad_norm": 0.69140625, + "learning_rate": 2.9751873032766482e-05, + "loss": 0.961, + "step": 5986 + }, + { + "epoch": 0.4405242584502184, + "grad_norm": 0.86328125, + "learning_rate": 2.974618689371737e-05, + "loss": 0.9199, + "step": 5987 + }, + { + "epoch": 0.4405978385835824, + "grad_norm": 0.83203125, + "learning_rate": 2.974050049994937e-05, + "loss": 0.6986, + "step": 5988 + }, + { + "epoch": 0.44067141871694643, + "grad_norm": 0.74609375, + "learning_rate": 2.9734813851767662e-05, + "loss": 0.6249, + "step": 5989 + }, + { + "epoch": 0.44074499885031043, + "grad_norm": 0.9609375, + "learning_rate": 2.9729126949477436e-05, + "loss": 1.158, + "step": 5990 + }, + { + "epoch": 0.4408185789836744, + "grad_norm": 0.79296875, + "learning_rate": 2.9723439793383907e-05, + "loss": 0.6793, + "step": 5991 + }, + { + "epoch": 0.4408921591170384, + "grad_norm": 0.9140625, + "learning_rate": 2.9717752383792276e-05, + "loss": 0.9961, + "step": 5992 + }, + { + "epoch": 0.4409657392504024, + "grad_norm": 0.92578125, + "learning_rate": 2.971206472100779e-05, + "loss": 0.7426, + "step": 5993 + }, + { + "epoch": 0.4410393193837664, + "grad_norm": 1.03125, + "learning_rate": 2.970637680533569e-05, + "loss": 0.8417, + "step": 5994 + }, + { + "epoch": 0.4411128995171304, + "grad_norm": 1.21875, + "learning_rate": 2.970068863708123e-05, + "loss": 1.532, + "step": 5995 + }, + { + "epoch": 0.44118647965049435, + "grad_norm": 0.81640625, + "learning_rate": 2.9695000216549695e-05, + "loss": 1.146, + "step": 5996 + }, + { + "epoch": 0.44126005978385835, + "grad_norm": 0.63671875, + "learning_rate": 2.9689311544046362e-05, + "loss": 0.5832, + "step": 5997 + }, + { + "epoch": 0.44133363991722235, + "grad_norm": 1.015625, + "learning_rate": 2.9683622619876544e-05, + "loss": 0.8079, + "step": 5998 + }, + { + "epoch": 0.44140722005058636, + "grad_norm": 0.734375, + "learning_rate": 2.9677933444345535e-05, + "loss": 0.5841, + "step": 5999 + }, + { + "epoch": 0.4414808001839503, + "grad_norm": 0.89453125, + "learning_rate": 2.9672244017758683e-05, + "loss": 0.9227, + "step": 6000 + }, + { + "epoch": 0.4415543803173143, + "grad_norm": 0.921875, + "learning_rate": 2.9666554340421314e-05, + "loss": 0.7703, + "step": 6001 + }, + { + "epoch": 0.4416279604506783, + "grad_norm": 0.73046875, + "learning_rate": 2.966086441263879e-05, + "loss": 0.6359, + "step": 6002 + }, + { + "epoch": 0.4417015405840423, + "grad_norm": 0.8203125, + "learning_rate": 2.965517423471647e-05, + "loss": 0.5752, + "step": 6003 + }, + { + "epoch": 0.4417751207174063, + "grad_norm": 0.7421875, + "learning_rate": 2.9649483806959742e-05, + "loss": 0.8808, + "step": 6004 + }, + { + "epoch": 0.44184870085077027, + "grad_norm": 0.80078125, + "learning_rate": 2.964379312967401e-05, + "loss": 0.8262, + "step": 6005 + }, + { + "epoch": 0.4419222809841343, + "grad_norm": 0.80859375, + "learning_rate": 2.9638102203164663e-05, + "loss": 1.0234, + "step": 6006 + }, + { + "epoch": 0.4419958611174983, + "grad_norm": 1.2734375, + "learning_rate": 2.9632411027737135e-05, + "loss": 1.6336, + "step": 6007 + }, + { + "epoch": 0.4420694412508623, + "grad_norm": 0.9296875, + "learning_rate": 2.9626719603696852e-05, + "loss": 1.2063, + "step": 6008 + }, + { + "epoch": 0.44214302138422623, + "grad_norm": 0.72265625, + "learning_rate": 2.9621027931349267e-05, + "loss": 1.0634, + "step": 6009 + }, + { + "epoch": 0.44221660151759024, + "grad_norm": 0.79296875, + "learning_rate": 2.9615336010999844e-05, + "loss": 1.0263, + "step": 6010 + }, + { + "epoch": 0.44229018165095424, + "grad_norm": 0.8359375, + "learning_rate": 2.960964384295405e-05, + "loss": 1.041, + "step": 6011 + }, + { + "epoch": 0.44236376178431824, + "grad_norm": 0.9453125, + "learning_rate": 2.9603951427517385e-05, + "loss": 0.983, + "step": 6012 + }, + { + "epoch": 0.44243734191768225, + "grad_norm": 1.0390625, + "learning_rate": 2.9598258764995336e-05, + "loss": 0.6677, + "step": 6013 + }, + { + "epoch": 0.4425109220510462, + "grad_norm": 0.96484375, + "learning_rate": 2.959256585569343e-05, + "loss": 1.0042, + "step": 6014 + }, + { + "epoch": 0.4425845021844102, + "grad_norm": 0.9765625, + "learning_rate": 2.9586872699917178e-05, + "loss": 0.783, + "step": 6015 + }, + { + "epoch": 0.4426580823177742, + "grad_norm": 0.88671875, + "learning_rate": 2.958117929797214e-05, + "loss": 0.7919, + "step": 6016 + }, + { + "epoch": 0.4427316624511382, + "grad_norm": 0.68359375, + "learning_rate": 2.957548565016386e-05, + "loss": 1.0166, + "step": 6017 + }, + { + "epoch": 0.44280524258450216, + "grad_norm": 0.9453125, + "learning_rate": 2.9569791756797905e-05, + "loss": 0.9252, + "step": 6018 + }, + { + "epoch": 0.44287882271786616, + "grad_norm": 1.03125, + "learning_rate": 2.956409761817986e-05, + "loss": 1.3885, + "step": 6019 + }, + { + "epoch": 0.44295240285123016, + "grad_norm": 1.109375, + "learning_rate": 2.955840323461532e-05, + "loss": 1.5966, + "step": 6020 + }, + { + "epoch": 0.44302598298459417, + "grad_norm": 0.9453125, + "learning_rate": 2.9552708606409885e-05, + "loss": 1.6081, + "step": 6021 + }, + { + "epoch": 0.4430995631179582, + "grad_norm": 0.88671875, + "learning_rate": 2.9547013733869172e-05, + "loss": 0.6657, + "step": 6022 + }, + { + "epoch": 0.4431731432513221, + "grad_norm": 1.171875, + "learning_rate": 2.9541318617298825e-05, + "loss": 0.9565, + "step": 6023 + }, + { + "epoch": 0.4432467233846861, + "grad_norm": 0.8984375, + "learning_rate": 2.953562325700448e-05, + "loss": 1.0535, + "step": 6024 + }, + { + "epoch": 0.44332030351805013, + "grad_norm": 0.8984375, + "learning_rate": 2.9529927653291806e-05, + "loss": 0.9568, + "step": 6025 + }, + { + "epoch": 0.44339388365141413, + "grad_norm": 1.1640625, + "learning_rate": 2.9524231806466475e-05, + "loss": 1.438, + "step": 6026 + }, + { + "epoch": 0.44346746378477814, + "grad_norm": 1.0078125, + "learning_rate": 2.9518535716834162e-05, + "loss": 0.9918, + "step": 6027 + }, + { + "epoch": 0.4435410439181421, + "grad_norm": 0.96484375, + "learning_rate": 2.9512839384700575e-05, + "loss": 0.8983, + "step": 6028 + }, + { + "epoch": 0.4436146240515061, + "grad_norm": 0.9921875, + "learning_rate": 2.950714281037142e-05, + "loss": 1.0855, + "step": 6029 + }, + { + "epoch": 0.4436882041848701, + "grad_norm": 1.0078125, + "learning_rate": 2.950144599415242e-05, + "loss": 1.0439, + "step": 6030 + }, + { + "epoch": 0.4437617843182341, + "grad_norm": 0.97265625, + "learning_rate": 2.9495748936349316e-05, + "loss": 1.113, + "step": 6031 + }, + { + "epoch": 0.44383536445159805, + "grad_norm": 0.9765625, + "learning_rate": 2.9490051637267864e-05, + "loss": 1.3181, + "step": 6032 + }, + { + "epoch": 0.44390894458496205, + "grad_norm": 0.7578125, + "learning_rate": 2.9484354097213818e-05, + "loss": 0.8374, + "step": 6033 + }, + { + "epoch": 0.44398252471832605, + "grad_norm": 0.81640625, + "learning_rate": 2.9478656316492952e-05, + "loss": 0.9774, + "step": 6034 + }, + { + "epoch": 0.44405610485169006, + "grad_norm": 0.75, + "learning_rate": 2.9472958295411067e-05, + "loss": 0.9186, + "step": 6035 + }, + { + "epoch": 0.44412968498505406, + "grad_norm": 0.8671875, + "learning_rate": 2.9467260034273957e-05, + "loss": 0.7875, + "step": 6036 + }, + { + "epoch": 0.444203265118418, + "grad_norm": 1.0859375, + "learning_rate": 2.9461561533387437e-05, + "loss": 1.146, + "step": 6037 + }, + { + "epoch": 0.444276845251782, + "grad_norm": 0.86328125, + "learning_rate": 2.9455862793057338e-05, + "loss": 1.0432, + "step": 6038 + }, + { + "epoch": 0.444350425385146, + "grad_norm": 0.7109375, + "learning_rate": 2.9450163813589498e-05, + "loss": 0.966, + "step": 6039 + }, + { + "epoch": 0.44442400551851, + "grad_norm": 0.8515625, + "learning_rate": 2.9444464595289772e-05, + "loss": 0.8573, + "step": 6040 + }, + { + "epoch": 0.44449758565187397, + "grad_norm": 1.1328125, + "learning_rate": 2.943876513846403e-05, + "loss": 1.5034, + "step": 6041 + }, + { + "epoch": 0.444571165785238, + "grad_norm": 0.765625, + "learning_rate": 2.9433065443418138e-05, + "loss": 0.7055, + "step": 6042 + }, + { + "epoch": 0.444644745918602, + "grad_norm": 0.89453125, + "learning_rate": 2.9427365510458e-05, + "loss": 0.6595, + "step": 6043 + }, + { + "epoch": 0.444718326051966, + "grad_norm": 0.76171875, + "learning_rate": 2.9421665339889515e-05, + "loss": 0.8481, + "step": 6044 + }, + { + "epoch": 0.44479190618533, + "grad_norm": 0.6875, + "learning_rate": 2.94159649320186e-05, + "loss": 0.5326, + "step": 6045 + }, + { + "epoch": 0.44486548631869394, + "grad_norm": 0.796875, + "learning_rate": 2.94102642871512e-05, + "loss": 0.8251, + "step": 6046 + }, + { + "epoch": 0.44493906645205794, + "grad_norm": 0.6875, + "learning_rate": 2.9404563405593235e-05, + "loss": 0.6753, + "step": 6047 + }, + { + "epoch": 0.44501264658542194, + "grad_norm": 0.7890625, + "learning_rate": 2.939886228765068e-05, + "loss": 0.8832, + "step": 6048 + }, + { + "epoch": 0.44508622671878595, + "grad_norm": 0.95703125, + "learning_rate": 2.939316093362948e-05, + "loss": 0.9699, + "step": 6049 + }, + { + "epoch": 0.4451598068521499, + "grad_norm": 0.703125, + "learning_rate": 2.9387459343835644e-05, + "loss": 0.6979, + "step": 6050 + }, + { + "epoch": 0.4452333869855139, + "grad_norm": 0.75390625, + "learning_rate": 2.9381757518575142e-05, + "loss": 0.8925, + "step": 6051 + }, + { + "epoch": 0.4453069671188779, + "grad_norm": 0.80859375, + "learning_rate": 2.9376055458153994e-05, + "loss": 0.821, + "step": 6052 + }, + { + "epoch": 0.4453805472522419, + "grad_norm": 0.82421875, + "learning_rate": 2.9370353162878216e-05, + "loss": 0.8584, + "step": 6053 + }, + { + "epoch": 0.4454541273856059, + "grad_norm": 0.83984375, + "learning_rate": 2.9364650633053838e-05, + "loss": 1.0352, + "step": 6054 + }, + { + "epoch": 0.44552770751896986, + "grad_norm": 0.921875, + "learning_rate": 2.935894786898691e-05, + "loss": 0.9812, + "step": 6055 + }, + { + "epoch": 0.44560128765233387, + "grad_norm": 0.66796875, + "learning_rate": 2.935324487098347e-05, + "loss": 0.9025, + "step": 6056 + }, + { + "epoch": 0.44567486778569787, + "grad_norm": 0.796875, + "learning_rate": 2.934754163934961e-05, + "loss": 1.0327, + "step": 6057 + }, + { + "epoch": 0.4457484479190619, + "grad_norm": 0.90625, + "learning_rate": 2.934183817439139e-05, + "loss": 0.8906, + "step": 6058 + }, + { + "epoch": 0.4458220280524258, + "grad_norm": 0.7890625, + "learning_rate": 2.9336134476414923e-05, + "loss": 0.7753, + "step": 6059 + }, + { + "epoch": 0.4458956081857898, + "grad_norm": 0.93359375, + "learning_rate": 2.933043054572631e-05, + "loss": 0.884, + "step": 6060 + }, + { + "epoch": 0.44596918831915383, + "grad_norm": 0.78515625, + "learning_rate": 2.932472638263167e-05, + "loss": 0.8998, + "step": 6061 + }, + { + "epoch": 0.44604276845251783, + "grad_norm": 0.9140625, + "learning_rate": 2.9319021987437134e-05, + "loss": 1.1624, + "step": 6062 + }, + { + "epoch": 0.44611634858588184, + "grad_norm": 0.99609375, + "learning_rate": 2.931331736044884e-05, + "loss": 1.1614, + "step": 6063 + }, + { + "epoch": 0.4461899287192458, + "grad_norm": 0.7421875, + "learning_rate": 2.9307612501972948e-05, + "loss": 0.7683, + "step": 6064 + }, + { + "epoch": 0.4462635088526098, + "grad_norm": 1.03125, + "learning_rate": 2.9301907412315632e-05, + "loss": 1.1088, + "step": 6065 + }, + { + "epoch": 0.4463370889859738, + "grad_norm": 0.8671875, + "learning_rate": 2.9296202091783072e-05, + "loss": 0.9251, + "step": 6066 + }, + { + "epoch": 0.4464106691193378, + "grad_norm": 0.859375, + "learning_rate": 2.929049654068146e-05, + "loss": 0.7168, + "step": 6067 + }, + { + "epoch": 0.44648424925270175, + "grad_norm": 0.90234375, + "learning_rate": 2.9284790759317004e-05, + "loss": 1.1251, + "step": 6068 + }, + { + "epoch": 0.44655782938606575, + "grad_norm": 1.046875, + "learning_rate": 2.9279084747995922e-05, + "loss": 1.3019, + "step": 6069 + }, + { + "epoch": 0.44663140951942976, + "grad_norm": 0.80078125, + "learning_rate": 2.9273378507024435e-05, + "loss": 0.6461, + "step": 6070 + }, + { + "epoch": 0.44670498965279376, + "grad_norm": 1.0703125, + "learning_rate": 2.92676720367088e-05, + "loss": 1.0662, + "step": 6071 + }, + { + "epoch": 0.44677856978615776, + "grad_norm": 0.921875, + "learning_rate": 2.926196533735527e-05, + "loss": 1.0884, + "step": 6072 + }, + { + "epoch": 0.4468521499195217, + "grad_norm": 0.98046875, + "learning_rate": 2.925625840927011e-05, + "loss": 1.2211, + "step": 6073 + }, + { + "epoch": 0.4469257300528857, + "grad_norm": 0.87890625, + "learning_rate": 2.9250551252759595e-05, + "loss": 1.1065, + "step": 6074 + }, + { + "epoch": 0.4469993101862497, + "grad_norm": 0.8515625, + "learning_rate": 2.9244843868130023e-05, + "loss": 0.94, + "step": 6075 + }, + { + "epoch": 0.4470728903196137, + "grad_norm": 0.9453125, + "learning_rate": 2.9239136255687698e-05, + "loss": 0.7705, + "step": 6076 + }, + { + "epoch": 0.4471464704529777, + "grad_norm": 0.84765625, + "learning_rate": 2.923342841573894e-05, + "loss": 1.1884, + "step": 6077 + }, + { + "epoch": 0.4472200505863417, + "grad_norm": 0.734375, + "learning_rate": 2.922772034859007e-05, + "loss": 0.9155, + "step": 6078 + }, + { + "epoch": 0.4472936307197057, + "grad_norm": 0.85546875, + "learning_rate": 2.9222012054547436e-05, + "loss": 0.9214, + "step": 6079 + }, + { + "epoch": 0.4473672108530697, + "grad_norm": 0.8671875, + "learning_rate": 2.9216303533917394e-05, + "loss": 1.0133, + "step": 6080 + }, + { + "epoch": 0.4474407909864337, + "grad_norm": 0.72265625, + "learning_rate": 2.9210594787006302e-05, + "loss": 0.9676, + "step": 6081 + }, + { + "epoch": 0.44751437111979764, + "grad_norm": 1.0703125, + "learning_rate": 2.9204885814120535e-05, + "loss": 1.2342, + "step": 6082 + }, + { + "epoch": 0.44758795125316164, + "grad_norm": 1.0703125, + "learning_rate": 2.9199176615566498e-05, + "loss": 1.4221, + "step": 6083 + }, + { + "epoch": 0.44766153138652565, + "grad_norm": 0.62890625, + "learning_rate": 2.9193467191650577e-05, + "loss": 0.6799, + "step": 6084 + }, + { + "epoch": 0.44773511151988965, + "grad_norm": 0.64453125, + "learning_rate": 2.918775754267919e-05, + "loss": 0.8441, + "step": 6085 + }, + { + "epoch": 0.4478086916532536, + "grad_norm": 0.90234375, + "learning_rate": 2.9182047668958766e-05, + "loss": 0.7616, + "step": 6086 + }, + { + "epoch": 0.4478822717866176, + "grad_norm": 0.86328125, + "learning_rate": 2.9176337570795752e-05, + "loss": 0.8768, + "step": 6087 + }, + { + "epoch": 0.4479558519199816, + "grad_norm": 0.74609375, + "learning_rate": 2.917062724849658e-05, + "loss": 0.6916, + "step": 6088 + }, + { + "epoch": 0.4480294320533456, + "grad_norm": 0.89453125, + "learning_rate": 2.9164916702367722e-05, + "loss": 0.6812, + "step": 6089 + }, + { + "epoch": 0.4481030121867096, + "grad_norm": 0.6953125, + "learning_rate": 2.915920593271565e-05, + "loss": 1.0242, + "step": 6090 + }, + { + "epoch": 0.44817659232007356, + "grad_norm": 0.90625, + "learning_rate": 2.9153494939846855e-05, + "loss": 0.8894, + "step": 6091 + }, + { + "epoch": 0.44825017245343757, + "grad_norm": 1.0546875, + "learning_rate": 2.914778372406782e-05, + "loss": 1.1701, + "step": 6092 + }, + { + "epoch": 0.44832375258680157, + "grad_norm": 0.9296875, + "learning_rate": 2.914207228568508e-05, + "loss": 1.1768, + "step": 6093 + }, + { + "epoch": 0.4483973327201656, + "grad_norm": 1.0625, + "learning_rate": 2.9136360625005142e-05, + "loss": 0.9539, + "step": 6094 + }, + { + "epoch": 0.4484709128535295, + "grad_norm": 0.82421875, + "learning_rate": 2.913064874233454e-05, + "loss": 1.1154, + "step": 6095 + }, + { + "epoch": 0.44854449298689353, + "grad_norm": 1.03125, + "learning_rate": 2.912493663797982e-05, + "loss": 1.4624, + "step": 6096 + }, + { + "epoch": 0.44861807312025753, + "grad_norm": 0.97265625, + "learning_rate": 2.911922431224754e-05, + "loss": 1.0469, + "step": 6097 + }, + { + "epoch": 0.44869165325362154, + "grad_norm": 0.83203125, + "learning_rate": 2.9113511765444272e-05, + "loss": 0.5635, + "step": 6098 + }, + { + "epoch": 0.44876523338698554, + "grad_norm": 1.2734375, + "learning_rate": 2.91077989978766e-05, + "loss": 1.336, + "step": 6099 + }, + { + "epoch": 0.4488388135203495, + "grad_norm": 0.96484375, + "learning_rate": 2.910208600985111e-05, + "loss": 1.1571, + "step": 6100 + }, + { + "epoch": 0.4489123936537135, + "grad_norm": 0.91796875, + "learning_rate": 2.9096372801674416e-05, + "loss": 0.7791, + "step": 6101 + }, + { + "epoch": 0.4489859737870775, + "grad_norm": 0.765625, + "learning_rate": 2.909065937365313e-05, + "loss": 0.8365, + "step": 6102 + }, + { + "epoch": 0.4490595539204415, + "grad_norm": 1.0234375, + "learning_rate": 2.908494572609388e-05, + "loss": 0.9837, + "step": 6103 + }, + { + "epoch": 0.44913313405380545, + "grad_norm": 0.86328125, + "learning_rate": 2.9079231859303306e-05, + "loss": 1.0402, + "step": 6104 + }, + { + "epoch": 0.44920671418716945, + "grad_norm": 0.90625, + "learning_rate": 2.9073517773588067e-05, + "loss": 1.1519, + "step": 6105 + }, + { + "epoch": 0.44928029432053346, + "grad_norm": 0.7421875, + "learning_rate": 2.9067803469254818e-05, + "loss": 1.0381, + "step": 6106 + }, + { + "epoch": 0.44935387445389746, + "grad_norm": 0.82421875, + "learning_rate": 2.9062088946610244e-05, + "loss": 0.8927, + "step": 6107 + }, + { + "epoch": 0.44942745458726147, + "grad_norm": 0.71484375, + "learning_rate": 2.905637420596103e-05, + "loss": 0.5416, + "step": 6108 + }, + { + "epoch": 0.4495010347206254, + "grad_norm": 1.1640625, + "learning_rate": 2.905065924761387e-05, + "loss": 1.0362, + "step": 6109 + }, + { + "epoch": 0.4495746148539894, + "grad_norm": 0.74609375, + "learning_rate": 2.904494407187548e-05, + "loss": 0.749, + "step": 6110 + }, + { + "epoch": 0.4496481949873534, + "grad_norm": 0.99609375, + "learning_rate": 2.9039228679052583e-05, + "loss": 0.9613, + "step": 6111 + }, + { + "epoch": 0.4497217751207174, + "grad_norm": 0.9765625, + "learning_rate": 2.9033513069451905e-05, + "loss": 1.0949, + "step": 6112 + }, + { + "epoch": 0.4497953552540814, + "grad_norm": 0.76953125, + "learning_rate": 2.9027797243380205e-05, + "loss": 0.7375, + "step": 6113 + }, + { + "epoch": 0.4498689353874454, + "grad_norm": 0.734375, + "learning_rate": 2.902208120114423e-05, + "loss": 0.6392, + "step": 6114 + }, + { + "epoch": 0.4499425155208094, + "grad_norm": 0.8671875, + "learning_rate": 2.9016364943050763e-05, + "loss": 1.0716, + "step": 6115 + }, + { + "epoch": 0.4500160956541734, + "grad_norm": 0.703125, + "learning_rate": 2.9010648469406565e-05, + "loss": 1.2349, + "step": 6116 + }, + { + "epoch": 0.4500896757875374, + "grad_norm": 0.65625, + "learning_rate": 2.900493178051845e-05, + "loss": 0.6422, + "step": 6117 + }, + { + "epoch": 0.45016325592090134, + "grad_norm": 0.984375, + "learning_rate": 2.8999214876693205e-05, + "loss": 1.2383, + "step": 6118 + }, + { + "epoch": 0.45023683605426534, + "grad_norm": 0.90234375, + "learning_rate": 2.899349775823764e-05, + "loss": 0.9533, + "step": 6119 + }, + { + "epoch": 0.45031041618762935, + "grad_norm": 1.2265625, + "learning_rate": 2.8987780425458604e-05, + "loss": 1.0481, + "step": 6120 + }, + { + "epoch": 0.45038399632099335, + "grad_norm": 0.8984375, + "learning_rate": 2.898206287866293e-05, + "loss": 1.1189, + "step": 6121 + }, + { + "epoch": 0.4504575764543573, + "grad_norm": 0.9921875, + "learning_rate": 2.897634511815745e-05, + "loss": 0.6942, + "step": 6122 + }, + { + "epoch": 0.4505311565877213, + "grad_norm": 0.91015625, + "learning_rate": 2.897062714424904e-05, + "loss": 1.0282, + "step": 6123 + }, + { + "epoch": 0.4506047367210853, + "grad_norm": 0.81640625, + "learning_rate": 2.896490895724458e-05, + "loss": 1.0714, + "step": 6124 + }, + { + "epoch": 0.4506783168544493, + "grad_norm": 0.84375, + "learning_rate": 2.8959190557450937e-05, + "loss": 0.9996, + "step": 6125 + }, + { + "epoch": 0.4507518969878133, + "grad_norm": 0.9453125, + "learning_rate": 2.8953471945175013e-05, + "loss": 1.0714, + "step": 6126 + }, + { + "epoch": 0.45082547712117726, + "grad_norm": 0.91796875, + "learning_rate": 2.894775312072372e-05, + "loss": 1.0969, + "step": 6127 + }, + { + "epoch": 0.45089905725454127, + "grad_norm": 1.2890625, + "learning_rate": 2.894203408440398e-05, + "loss": 1.3571, + "step": 6128 + }, + { + "epoch": 0.4509726373879053, + "grad_norm": 0.96875, + "learning_rate": 2.8936314836522706e-05, + "loss": 0.7812, + "step": 6129 + }, + { + "epoch": 0.4510462175212693, + "grad_norm": 0.94140625, + "learning_rate": 2.893059537738685e-05, + "loss": 0.8675, + "step": 6130 + }, + { + "epoch": 0.4511197976546332, + "grad_norm": 0.8125, + "learning_rate": 2.8924875707303366e-05, + "loss": 1.1733, + "step": 6131 + }, + { + "epoch": 0.45119337778799723, + "grad_norm": 1.046875, + "learning_rate": 2.8919155826579215e-05, + "loss": 1.1205, + "step": 6132 + }, + { + "epoch": 0.45126695792136123, + "grad_norm": 0.89453125, + "learning_rate": 2.8913435735521377e-05, + "loss": 0.9608, + "step": 6133 + }, + { + "epoch": 0.45134053805472524, + "grad_norm": 0.89453125, + "learning_rate": 2.8907715434436834e-05, + "loss": 0.8731, + "step": 6134 + }, + { + "epoch": 0.45141411818808924, + "grad_norm": 0.84375, + "learning_rate": 2.8901994923632582e-05, + "loss": 1.1364, + "step": 6135 + }, + { + "epoch": 0.4514876983214532, + "grad_norm": 0.84375, + "learning_rate": 2.889627420341563e-05, + "loss": 0.8074, + "step": 6136 + }, + { + "epoch": 0.4515612784548172, + "grad_norm": 0.828125, + "learning_rate": 2.8890553274093006e-05, + "loss": 0.909, + "step": 6137 + }, + { + "epoch": 0.4516348585881812, + "grad_norm": 0.69921875, + "learning_rate": 2.8884832135971735e-05, + "loss": 0.6877, + "step": 6138 + }, + { + "epoch": 0.4517084387215452, + "grad_norm": 1.0390625, + "learning_rate": 2.887911078935885e-05, + "loss": 0.9929, + "step": 6139 + }, + { + "epoch": 0.45178201885490915, + "grad_norm": 0.859375, + "learning_rate": 2.8873389234561428e-05, + "loss": 0.6979, + "step": 6140 + }, + { + "epoch": 0.45185559898827315, + "grad_norm": 0.82421875, + "learning_rate": 2.8867667471886518e-05, + "loss": 1.1729, + "step": 6141 + }, + { + "epoch": 0.45192917912163716, + "grad_norm": 0.86328125, + "learning_rate": 2.88619455016412e-05, + "loss": 0.9276, + "step": 6142 + }, + { + "epoch": 0.45200275925500116, + "grad_norm": 1.0, + "learning_rate": 2.885622332413256e-05, + "loss": 1.0578, + "step": 6143 + }, + { + "epoch": 0.45207633938836517, + "grad_norm": 1.0, + "learning_rate": 2.88505009396677e-05, + "loss": 1.0555, + "step": 6144 + }, + { + "epoch": 0.4521499195217291, + "grad_norm": 0.8125, + "learning_rate": 2.8844778348553724e-05, + "loss": 1.0048, + "step": 6145 + }, + { + "epoch": 0.4522234996550931, + "grad_norm": 0.84765625, + "learning_rate": 2.8839055551097755e-05, + "loss": 0.837, + "step": 6146 + }, + { + "epoch": 0.4522970797884571, + "grad_norm": 0.73046875, + "learning_rate": 2.8833332547606927e-05, + "loss": 0.6533, + "step": 6147 + }, + { + "epoch": 0.4523706599218211, + "grad_norm": 1.0078125, + "learning_rate": 2.8827609338388385e-05, + "loss": 1.1388, + "step": 6148 + }, + { + "epoch": 0.4524442400551851, + "grad_norm": 0.66796875, + "learning_rate": 2.8821885923749275e-05, + "loss": 0.8837, + "step": 6149 + }, + { + "epoch": 0.4525178201885491, + "grad_norm": 0.73046875, + "learning_rate": 2.881616230399677e-05, + "loss": 0.9821, + "step": 6150 + }, + { + "epoch": 0.4525914003219131, + "grad_norm": 1.015625, + "learning_rate": 2.881043847943804e-05, + "loss": 1.0957, + "step": 6151 + }, + { + "epoch": 0.4526649804552771, + "grad_norm": 0.8125, + "learning_rate": 2.8804714450380266e-05, + "loss": 0.7325, + "step": 6152 + }, + { + "epoch": 0.4527385605886411, + "grad_norm": 1.0, + "learning_rate": 2.879899021713066e-05, + "loss": 0.6788, + "step": 6153 + }, + { + "epoch": 0.45281214072200504, + "grad_norm": 0.890625, + "learning_rate": 2.8793265779996427e-05, + "loss": 0.6262, + "step": 6154 + }, + { + "epoch": 0.45288572085536904, + "grad_norm": 0.640625, + "learning_rate": 2.878754113928478e-05, + "loss": 0.8315, + "step": 6155 + }, + { + "epoch": 0.45295930098873305, + "grad_norm": 0.99609375, + "learning_rate": 2.878181629530296e-05, + "loss": 0.8807, + "step": 6156 + }, + { + "epoch": 0.45303288112209705, + "grad_norm": 0.71875, + "learning_rate": 2.8776091248358194e-05, + "loss": 0.78, + "step": 6157 + }, + { + "epoch": 0.453106461255461, + "grad_norm": 0.83203125, + "learning_rate": 2.8770365998757754e-05, + "loss": 0.7171, + "step": 6158 + }, + { + "epoch": 0.453180041388825, + "grad_norm": 0.87109375, + "learning_rate": 2.8764640546808873e-05, + "loss": 0.9987, + "step": 6159 + }, + { + "epoch": 0.453253621522189, + "grad_norm": 0.76171875, + "learning_rate": 2.875891489281886e-05, + "loss": 0.6616, + "step": 6160 + }, + { + "epoch": 0.453327201655553, + "grad_norm": 0.84375, + "learning_rate": 2.875318903709498e-05, + "loss": 0.9825, + "step": 6161 + }, + { + "epoch": 0.453400781788917, + "grad_norm": 0.84375, + "learning_rate": 2.8747462979944534e-05, + "loss": 0.7909, + "step": 6162 + }, + { + "epoch": 0.45347436192228097, + "grad_norm": 0.8515625, + "learning_rate": 2.8741736721674827e-05, + "loss": 0.7882, + "step": 6163 + }, + { + "epoch": 0.45354794205564497, + "grad_norm": 0.76953125, + "learning_rate": 2.8736010262593178e-05, + "loss": 0.8349, + "step": 6164 + }, + { + "epoch": 0.453621522189009, + "grad_norm": 0.76953125, + "learning_rate": 2.8730283603006907e-05, + "loss": 0.9046, + "step": 6165 + }, + { + "epoch": 0.453695102322373, + "grad_norm": 0.9140625, + "learning_rate": 2.872455674322337e-05, + "loss": 1.1353, + "step": 6166 + }, + { + "epoch": 0.4537686824557369, + "grad_norm": 0.81640625, + "learning_rate": 2.871882968354991e-05, + "loss": 0.9633, + "step": 6167 + }, + { + "epoch": 0.45384226258910093, + "grad_norm": 0.62109375, + "learning_rate": 2.8713102424293874e-05, + "loss": 0.6544, + "step": 6168 + }, + { + "epoch": 0.45391584272246494, + "grad_norm": 1.0625, + "learning_rate": 2.870737496576265e-05, + "loss": 1.5257, + "step": 6169 + }, + { + "epoch": 0.45398942285582894, + "grad_norm": 1.0078125, + "learning_rate": 2.8701647308263613e-05, + "loss": 0.6725, + "step": 6170 + }, + { + "epoch": 0.45406300298919294, + "grad_norm": 0.92578125, + "learning_rate": 2.8695919452104157e-05, + "loss": 1.0626, + "step": 6171 + }, + { + "epoch": 0.4541365831225569, + "grad_norm": 1.0078125, + "learning_rate": 2.8690191397591682e-05, + "loss": 0.9497, + "step": 6172 + }, + { + "epoch": 0.4542101632559209, + "grad_norm": 0.6953125, + "learning_rate": 2.86844631450336e-05, + "loss": 0.7885, + "step": 6173 + }, + { + "epoch": 0.4542837433892849, + "grad_norm": 0.83203125, + "learning_rate": 2.8678734694737352e-05, + "loss": 0.8002, + "step": 6174 + }, + { + "epoch": 0.4543573235226489, + "grad_norm": 0.90625, + "learning_rate": 2.8673006047010353e-05, + "loss": 0.8922, + "step": 6175 + }, + { + "epoch": 0.45443090365601285, + "grad_norm": 1.0078125, + "learning_rate": 2.8667277202160066e-05, + "loss": 1.1074, + "step": 6176 + }, + { + "epoch": 0.45450448378937686, + "grad_norm": 0.84765625, + "learning_rate": 2.8661548160493927e-05, + "loss": 0.7483, + "step": 6177 + }, + { + "epoch": 0.45457806392274086, + "grad_norm": 0.74609375, + "learning_rate": 2.865581892231942e-05, + "loss": 0.8306, + "step": 6178 + }, + { + "epoch": 0.45465164405610486, + "grad_norm": 0.90234375, + "learning_rate": 2.865008948794401e-05, + "loss": 1.1476, + "step": 6179 + }, + { + "epoch": 0.45472522418946887, + "grad_norm": 0.66796875, + "learning_rate": 2.8644359857675195e-05, + "loss": 0.7028, + "step": 6180 + }, + { + "epoch": 0.4547988043228328, + "grad_norm": 0.9140625, + "learning_rate": 2.8638630031820472e-05, + "loss": 1.1273, + "step": 6181 + }, + { + "epoch": 0.4548723844561968, + "grad_norm": 0.83203125, + "learning_rate": 2.863290001068735e-05, + "loss": 1.0302, + "step": 6182 + }, + { + "epoch": 0.4549459645895608, + "grad_norm": 0.828125, + "learning_rate": 2.862716979458334e-05, + "loss": 1.0111, + "step": 6183 + }, + { + "epoch": 0.45501954472292483, + "grad_norm": 0.80859375, + "learning_rate": 2.8621439383815985e-05, + "loss": 0.8821, + "step": 6184 + }, + { + "epoch": 0.4550931248562888, + "grad_norm": 0.93359375, + "learning_rate": 2.8615708778692807e-05, + "loss": 0.9798, + "step": 6185 + }, + { + "epoch": 0.4551667049896528, + "grad_norm": 0.9375, + "learning_rate": 2.8609977979521375e-05, + "loss": 1.1728, + "step": 6186 + }, + { + "epoch": 0.4552402851230168, + "grad_norm": 0.6953125, + "learning_rate": 2.860424698660924e-05, + "loss": 0.7231, + "step": 6187 + }, + { + "epoch": 0.4553138652563808, + "grad_norm": 0.7890625, + "learning_rate": 2.859851580026398e-05, + "loss": 0.9579, + "step": 6188 + }, + { + "epoch": 0.4553874453897448, + "grad_norm": 0.765625, + "learning_rate": 2.8592784420793173e-05, + "loss": 0.9078, + "step": 6189 + }, + { + "epoch": 0.45546102552310874, + "grad_norm": 1.03125, + "learning_rate": 2.8587052848504414e-05, + "loss": 1.1242, + "step": 6190 + }, + { + "epoch": 0.45553460565647275, + "grad_norm": 0.8125, + "learning_rate": 2.8581321083705298e-05, + "loss": 1.0109, + "step": 6191 + }, + { + "epoch": 0.45560818578983675, + "grad_norm": 0.72265625, + "learning_rate": 2.8575589126703452e-05, + "loss": 0.6212, + "step": 6192 + }, + { + "epoch": 0.45568176592320075, + "grad_norm": 0.88671875, + "learning_rate": 2.856985697780648e-05, + "loss": 0.9691, + "step": 6193 + }, + { + "epoch": 0.4557553460565647, + "grad_norm": 0.890625, + "learning_rate": 2.856412463732204e-05, + "loss": 0.9252, + "step": 6194 + }, + { + "epoch": 0.4558289261899287, + "grad_norm": 0.80859375, + "learning_rate": 2.8558392105557746e-05, + "loss": 0.978, + "step": 6195 + }, + { + "epoch": 0.4559025063232927, + "grad_norm": 0.75390625, + "learning_rate": 2.8552659382821277e-05, + "loss": 0.6545, + "step": 6196 + }, + { + "epoch": 0.4559760864566567, + "grad_norm": 0.93359375, + "learning_rate": 2.8546926469420293e-05, + "loss": 1.3891, + "step": 6197 + }, + { + "epoch": 0.4560496665900207, + "grad_norm": 0.9765625, + "learning_rate": 2.854119336566246e-05, + "loss": 0.9865, + "step": 6198 + }, + { + "epoch": 0.45612324672338467, + "grad_norm": 1.1015625, + "learning_rate": 2.8535460071855464e-05, + "loss": 1.1086, + "step": 6199 + }, + { + "epoch": 0.45619682685674867, + "grad_norm": 0.85546875, + "learning_rate": 2.8529726588307005e-05, + "loss": 1.3484, + "step": 6200 + }, + { + "epoch": 0.4562704069901127, + "grad_norm": 0.83984375, + "learning_rate": 2.8523992915324794e-05, + "loss": 0.8123, + "step": 6201 + }, + { + "epoch": 0.4563439871234767, + "grad_norm": 0.69921875, + "learning_rate": 2.8518259053216534e-05, + "loss": 0.9677, + "step": 6202 + }, + { + "epoch": 0.45641756725684063, + "grad_norm": 0.85546875, + "learning_rate": 2.8512525002289954e-05, + "loss": 1.1213, + "step": 6203 + }, + { + "epoch": 0.45649114739020463, + "grad_norm": 1.109375, + "learning_rate": 2.8506790762852796e-05, + "loss": 1.123, + "step": 6204 + }, + { + "epoch": 0.45656472752356864, + "grad_norm": 0.8984375, + "learning_rate": 2.8501056335212794e-05, + "loss": 1.2317, + "step": 6205 + }, + { + "epoch": 0.45663830765693264, + "grad_norm": 0.91015625, + "learning_rate": 2.8495321719677714e-05, + "loss": 1.1782, + "step": 6206 + }, + { + "epoch": 0.45671188779029664, + "grad_norm": 0.8984375, + "learning_rate": 2.8489586916555322e-05, + "loss": 0.6275, + "step": 6207 + }, + { + "epoch": 0.4567854679236606, + "grad_norm": 0.7109375, + "learning_rate": 2.8483851926153393e-05, + "loss": 0.6487, + "step": 6208 + }, + { + "epoch": 0.4568590480570246, + "grad_norm": 0.84375, + "learning_rate": 2.8478116748779705e-05, + "loss": 0.946, + "step": 6209 + }, + { + "epoch": 0.4569326281903886, + "grad_norm": 0.8359375, + "learning_rate": 2.847238138474207e-05, + "loss": 1.0903, + "step": 6210 + }, + { + "epoch": 0.4570062083237526, + "grad_norm": 0.73046875, + "learning_rate": 2.8466645834348272e-05, + "loss": 0.7167, + "step": 6211 + }, + { + "epoch": 0.45707978845711655, + "grad_norm": 1.1015625, + "learning_rate": 2.8460910097906148e-05, + "loss": 1.0082, + "step": 6212 + }, + { + "epoch": 0.45715336859048056, + "grad_norm": 0.86328125, + "learning_rate": 2.8455174175723508e-05, + "loss": 0.9808, + "step": 6213 + }, + { + "epoch": 0.45722694872384456, + "grad_norm": 0.8828125, + "learning_rate": 2.8449438068108198e-05, + "loss": 0.6252, + "step": 6214 + }, + { + "epoch": 0.45730052885720857, + "grad_norm": 0.8671875, + "learning_rate": 2.844370177536807e-05, + "loss": 0.8817, + "step": 6215 + }, + { + "epoch": 0.45737410899057257, + "grad_norm": 1.0078125, + "learning_rate": 2.8437965297810966e-05, + "loss": 1.0954, + "step": 6216 + }, + { + "epoch": 0.4574476891239365, + "grad_norm": 1.015625, + "learning_rate": 2.843222863574476e-05, + "loss": 1.0867, + "step": 6217 + }, + { + "epoch": 0.4575212692573005, + "grad_norm": 0.8515625, + "learning_rate": 2.842649178947732e-05, + "loss": 1.1105, + "step": 6218 + }, + { + "epoch": 0.4575948493906645, + "grad_norm": 0.8203125, + "learning_rate": 2.842075475931654e-05, + "loss": 0.9596, + "step": 6219 + }, + { + "epoch": 0.45766842952402853, + "grad_norm": 0.7734375, + "learning_rate": 2.8415017545570318e-05, + "loss": 1.0159, + "step": 6220 + }, + { + "epoch": 0.4577420096573925, + "grad_norm": 0.86328125, + "learning_rate": 2.8409280148546548e-05, + "loss": 1.4714, + "step": 6221 + }, + { + "epoch": 0.4578155897907565, + "grad_norm": 0.87109375, + "learning_rate": 2.840354256855316e-05, + "loss": 0.8535, + "step": 6222 + }, + { + "epoch": 0.4578891699241205, + "grad_norm": 0.9609375, + "learning_rate": 2.8397804805898058e-05, + "loss": 1.2393, + "step": 6223 + }, + { + "epoch": 0.4579627500574845, + "grad_norm": 0.7578125, + "learning_rate": 2.8392066860889206e-05, + "loss": 0.7248, + "step": 6224 + }, + { + "epoch": 0.4580363301908485, + "grad_norm": 0.87890625, + "learning_rate": 2.8386328733834522e-05, + "loss": 0.9474, + "step": 6225 + }, + { + "epoch": 0.45810991032421244, + "grad_norm": 0.953125, + "learning_rate": 2.838059042504197e-05, + "loss": 0.9051, + "step": 6226 + }, + { + "epoch": 0.45818349045757645, + "grad_norm": 0.8828125, + "learning_rate": 2.8374851934819514e-05, + "loss": 1.2669, + "step": 6227 + }, + { + "epoch": 0.45825707059094045, + "grad_norm": 0.67578125, + "learning_rate": 2.836911326347513e-05, + "loss": 0.6519, + "step": 6228 + }, + { + "epoch": 0.45833065072430446, + "grad_norm": 0.87890625, + "learning_rate": 2.8363374411316806e-05, + "loss": 1.4039, + "step": 6229 + }, + { + "epoch": 0.4584042308576684, + "grad_norm": 0.953125, + "learning_rate": 2.8357635378652527e-05, + "loss": 0.9145, + "step": 6230 + }, + { + "epoch": 0.4584778109910324, + "grad_norm": 0.8828125, + "learning_rate": 2.83518961657903e-05, + "loss": 1.0376, + "step": 6231 + }, + { + "epoch": 0.4585513911243964, + "grad_norm": 0.796875, + "learning_rate": 2.834615677303813e-05, + "loss": 0.8081, + "step": 6232 + }, + { + "epoch": 0.4586249712577604, + "grad_norm": 0.7890625, + "learning_rate": 2.834041720070405e-05, + "loss": 0.8228, + "step": 6233 + }, + { + "epoch": 0.4586985513911244, + "grad_norm": 0.67578125, + "learning_rate": 2.8334677449096088e-05, + "loss": 0.5794, + "step": 6234 + }, + { + "epoch": 0.45877213152448837, + "grad_norm": 0.99609375, + "learning_rate": 2.832893751852228e-05, + "loss": 0.7654, + "step": 6235 + }, + { + "epoch": 0.4588457116578524, + "grad_norm": 1.21875, + "learning_rate": 2.8323197409290696e-05, + "loss": 1.2892, + "step": 6236 + }, + { + "epoch": 0.4589192917912164, + "grad_norm": 0.98046875, + "learning_rate": 2.831745712170937e-05, + "loss": 1.0228, + "step": 6237 + }, + { + "epoch": 0.4589928719245804, + "grad_norm": 0.84765625, + "learning_rate": 2.83117166560864e-05, + "loss": 0.7601, + "step": 6238 + }, + { + "epoch": 0.45906645205794433, + "grad_norm": 0.7890625, + "learning_rate": 2.8305976012729845e-05, + "loss": 0.65, + "step": 6239 + }, + { + "epoch": 0.45914003219130833, + "grad_norm": 0.9296875, + "learning_rate": 2.83002351919478e-05, + "loss": 1.0663, + "step": 6240 + }, + { + "epoch": 0.45921361232467234, + "grad_norm": 0.86328125, + "learning_rate": 2.8294494194048365e-05, + "loss": 1.2407, + "step": 6241 + }, + { + "epoch": 0.45928719245803634, + "grad_norm": 0.8984375, + "learning_rate": 2.8288753019339653e-05, + "loss": 1.0828, + "step": 6242 + }, + { + "epoch": 0.45936077259140035, + "grad_norm": 0.86328125, + "learning_rate": 2.8283011668129777e-05, + "loss": 0.9263, + "step": 6243 + }, + { + "epoch": 0.4594343527247643, + "grad_norm": 0.73828125, + "learning_rate": 2.8277270140726863e-05, + "loss": 0.6277, + "step": 6244 + }, + { + "epoch": 0.4595079328581283, + "grad_norm": 0.84765625, + "learning_rate": 2.8271528437439054e-05, + "loss": 0.8775, + "step": 6245 + }, + { + "epoch": 0.4595815129914923, + "grad_norm": 0.87109375, + "learning_rate": 2.8265786558574486e-05, + "loss": 1.0949, + "step": 6246 + }, + { + "epoch": 0.4596550931248563, + "grad_norm": 0.96875, + "learning_rate": 2.826004450444133e-05, + "loss": 1.1995, + "step": 6247 + }, + { + "epoch": 0.45972867325822026, + "grad_norm": 1.15625, + "learning_rate": 2.825430227534773e-05, + "loss": 1.6563, + "step": 6248 + }, + { + "epoch": 0.45980225339158426, + "grad_norm": 0.890625, + "learning_rate": 2.8248559871601887e-05, + "loss": 0.9782, + "step": 6249 + }, + { + "epoch": 0.45987583352494826, + "grad_norm": 0.77734375, + "learning_rate": 2.824281729351197e-05, + "loss": 0.7899, + "step": 6250 + }, + { + "epoch": 0.45994941365831227, + "grad_norm": 0.7890625, + "learning_rate": 2.8237074541386167e-05, + "loss": 0.9182, + "step": 6251 + }, + { + "epoch": 0.46002299379167627, + "grad_norm": 0.68359375, + "learning_rate": 2.8231331615532697e-05, + "loss": 0.6859, + "step": 6252 + }, + { + "epoch": 0.4600965739250402, + "grad_norm": 0.85546875, + "learning_rate": 2.8225588516259748e-05, + "loss": 1.0853, + "step": 6253 + }, + { + "epoch": 0.4601701540584042, + "grad_norm": 1.03125, + "learning_rate": 2.8219845243875565e-05, + "loss": 1.3617, + "step": 6254 + }, + { + "epoch": 0.46024373419176823, + "grad_norm": 0.89453125, + "learning_rate": 2.8214101798688363e-05, + "loss": 1.1836, + "step": 6255 + }, + { + "epoch": 0.46031731432513223, + "grad_norm": 0.9375, + "learning_rate": 2.82083581810064e-05, + "loss": 0.8941, + "step": 6256 + }, + { + "epoch": 0.4603908944584962, + "grad_norm": 1.0390625, + "learning_rate": 2.8202614391137906e-05, + "loss": 0.7217, + "step": 6257 + }, + { + "epoch": 0.4604644745918602, + "grad_norm": 0.73046875, + "learning_rate": 2.8196870429391147e-05, + "loss": 0.6514, + "step": 6258 + }, + { + "epoch": 0.4605380547252242, + "grad_norm": 0.984375, + "learning_rate": 2.819112629607439e-05, + "loss": 0.9014, + "step": 6259 + }, + { + "epoch": 0.4606116348585882, + "grad_norm": 0.75390625, + "learning_rate": 2.8185381991495908e-05, + "loss": 0.708, + "step": 6260 + }, + { + "epoch": 0.4606852149919522, + "grad_norm": 0.8984375, + "learning_rate": 2.8179637515963996e-05, + "loss": 1.1131, + "step": 6261 + }, + { + "epoch": 0.46075879512531615, + "grad_norm": 1.078125, + "learning_rate": 2.817389286978694e-05, + "loss": 1.6942, + "step": 6262 + }, + { + "epoch": 0.46083237525868015, + "grad_norm": 1.25, + "learning_rate": 2.8168148053273053e-05, + "loss": 1.1356, + "step": 6263 + }, + { + "epoch": 0.46090595539204415, + "grad_norm": 0.80078125, + "learning_rate": 2.8162403066730643e-05, + "loss": 1.2119, + "step": 6264 + }, + { + "epoch": 0.46097953552540816, + "grad_norm": 0.875, + "learning_rate": 2.815665791046803e-05, + "loss": 1.3804, + "step": 6265 + }, + { + "epoch": 0.4610531156587721, + "grad_norm": 0.99609375, + "learning_rate": 2.8150912584793554e-05, + "loss": 1.095, + "step": 6266 + }, + { + "epoch": 0.4611266957921361, + "grad_norm": 1.03125, + "learning_rate": 2.8145167090015546e-05, + "loss": 1.2664, + "step": 6267 + }, + { + "epoch": 0.4612002759255001, + "grad_norm": 0.80859375, + "learning_rate": 2.8139421426442357e-05, + "loss": 0.993, + "step": 6268 + }, + { + "epoch": 0.4612738560588641, + "grad_norm": 0.78515625, + "learning_rate": 2.813367559438235e-05, + "loss": 0.9281, + "step": 6269 + }, + { + "epoch": 0.4613474361922281, + "grad_norm": 1.1328125, + "learning_rate": 2.8127929594143903e-05, + "loss": 1.0383, + "step": 6270 + }, + { + "epoch": 0.46142101632559207, + "grad_norm": 0.7421875, + "learning_rate": 2.8122183426035377e-05, + "loss": 0.9486, + "step": 6271 + }, + { + "epoch": 0.4614945964589561, + "grad_norm": 0.97265625, + "learning_rate": 2.8116437090365166e-05, + "loss": 1.5287, + "step": 6272 + }, + { + "epoch": 0.4615681765923201, + "grad_norm": 1.1171875, + "learning_rate": 2.8110690587441656e-05, + "loss": 1.0122, + "step": 6273 + }, + { + "epoch": 0.4616417567256841, + "grad_norm": 0.80859375, + "learning_rate": 2.8104943917573262e-05, + "loss": 1.0152, + "step": 6274 + }, + { + "epoch": 0.46171533685904803, + "grad_norm": 0.9765625, + "learning_rate": 2.809919708106839e-05, + "loss": 1.0841, + "step": 6275 + }, + { + "epoch": 0.46178891699241204, + "grad_norm": 0.8828125, + "learning_rate": 2.809345007823546e-05, + "loss": 1.2076, + "step": 6276 + }, + { + "epoch": 0.46186249712577604, + "grad_norm": 1.0234375, + "learning_rate": 2.8087702909382918e-05, + "loss": 1.175, + "step": 6277 + }, + { + "epoch": 0.46193607725914004, + "grad_norm": 0.89453125, + "learning_rate": 2.8081955574819184e-05, + "loss": 0.932, + "step": 6278 + }, + { + "epoch": 0.46200965739250405, + "grad_norm": 0.75390625, + "learning_rate": 2.8076208074852728e-05, + "loss": 0.9306, + "step": 6279 + }, + { + "epoch": 0.462083237525868, + "grad_norm": 1.1484375, + "learning_rate": 2.807046040979198e-05, + "loss": 1.2228, + "step": 6280 + }, + { + "epoch": 0.462156817659232, + "grad_norm": 0.82421875, + "learning_rate": 2.806471257994543e-05, + "loss": 0.7424, + "step": 6281 + }, + { + "epoch": 0.462230397792596, + "grad_norm": 0.68359375, + "learning_rate": 2.805896458562154e-05, + "loss": 0.6086, + "step": 6282 + }, + { + "epoch": 0.46230397792596, + "grad_norm": 0.69921875, + "learning_rate": 2.8053216427128796e-05, + "loss": 0.768, + "step": 6283 + }, + { + "epoch": 0.46237755805932396, + "grad_norm": 0.80859375, + "learning_rate": 2.8047468104775697e-05, + "loss": 0.8747, + "step": 6284 + }, + { + "epoch": 0.46245113819268796, + "grad_norm": 0.84375, + "learning_rate": 2.8041719618870737e-05, + "loss": 0.739, + "step": 6285 + }, + { + "epoch": 0.46252471832605196, + "grad_norm": 0.96484375, + "learning_rate": 2.8035970969722436e-05, + "loss": 0.9525, + "step": 6286 + }, + { + "epoch": 0.46259829845941597, + "grad_norm": 0.93359375, + "learning_rate": 2.8030222157639308e-05, + "loss": 1.1422, + "step": 6287 + }, + { + "epoch": 0.46267187859278, + "grad_norm": 0.8203125, + "learning_rate": 2.8024473182929872e-05, + "loss": 0.64, + "step": 6288 + }, + { + "epoch": 0.4627454587261439, + "grad_norm": 0.84375, + "learning_rate": 2.8018724045902673e-05, + "loss": 0.8409, + "step": 6289 + }, + { + "epoch": 0.4628190388595079, + "grad_norm": 0.78515625, + "learning_rate": 2.8012974746866266e-05, + "loss": 0.823, + "step": 6290 + }, + { + "epoch": 0.46289261899287193, + "grad_norm": 0.77734375, + "learning_rate": 2.8007225286129187e-05, + "loss": 0.7293, + "step": 6291 + }, + { + "epoch": 0.46296619912623593, + "grad_norm": 0.875, + "learning_rate": 2.8001475664000004e-05, + "loss": 0.8793, + "step": 6292 + }, + { + "epoch": 0.4630397792595999, + "grad_norm": 0.8671875, + "learning_rate": 2.79957258807873e-05, + "loss": 1.0189, + "step": 6293 + }, + { + "epoch": 0.4631133593929639, + "grad_norm": 0.9765625, + "learning_rate": 2.798997593679964e-05, + "loss": 0.839, + "step": 6294 + }, + { + "epoch": 0.4631869395263279, + "grad_norm": 0.7109375, + "learning_rate": 2.7984225832345624e-05, + "loss": 0.7649, + "step": 6295 + }, + { + "epoch": 0.4632605196596919, + "grad_norm": 0.98828125, + "learning_rate": 2.7978475567733836e-05, + "loss": 1.1753, + "step": 6296 + }, + { + "epoch": 0.4633340997930559, + "grad_norm": 0.84765625, + "learning_rate": 2.7972725143272898e-05, + "loss": 0.9838, + "step": 6297 + }, + { + "epoch": 0.46340767992641985, + "grad_norm": 1.0, + "learning_rate": 2.7966974559271407e-05, + "loss": 1.024, + "step": 6298 + }, + { + "epoch": 0.46348126005978385, + "grad_norm": 0.72265625, + "learning_rate": 2.7961223816038008e-05, + "loss": 0.7331, + "step": 6299 + }, + { + "epoch": 0.46355484019314785, + "grad_norm": 0.79296875, + "learning_rate": 2.795547291388131e-05, + "loss": 0.9146, + "step": 6300 + }, + { + "epoch": 0.46362842032651186, + "grad_norm": 0.6875, + "learning_rate": 2.794972185310996e-05, + "loss": 0.5683, + "step": 6301 + }, + { + "epoch": 0.4637020004598758, + "grad_norm": 0.9453125, + "learning_rate": 2.7943970634032618e-05, + "loss": 1.18, + "step": 6302 + }, + { + "epoch": 0.4637755805932398, + "grad_norm": 0.98828125, + "learning_rate": 2.7938219256957926e-05, + "loss": 0.6956, + "step": 6303 + }, + { + "epoch": 0.4638491607266038, + "grad_norm": 0.8828125, + "learning_rate": 2.7932467722194562e-05, + "loss": 0.8688, + "step": 6304 + }, + { + "epoch": 0.4639227408599678, + "grad_norm": 1.0625, + "learning_rate": 2.7926716030051193e-05, + "loss": 1.1108, + "step": 6305 + }, + { + "epoch": 0.4639963209933318, + "grad_norm": 0.55859375, + "learning_rate": 2.79209641808365e-05, + "loss": 0.4767, + "step": 6306 + }, + { + "epoch": 0.46406990112669577, + "grad_norm": 0.75, + "learning_rate": 2.7915212174859177e-05, + "loss": 0.7869, + "step": 6307 + }, + { + "epoch": 0.4641434812600598, + "grad_norm": 0.76171875, + "learning_rate": 2.7909460012427923e-05, + "loss": 1.1705, + "step": 6308 + }, + { + "epoch": 0.4642170613934238, + "grad_norm": 0.7265625, + "learning_rate": 2.7903707693851444e-05, + "loss": 0.7204, + "step": 6309 + }, + { + "epoch": 0.4642906415267878, + "grad_norm": 0.625, + "learning_rate": 2.7897955219438454e-05, + "loss": 0.7186, + "step": 6310 + }, + { + "epoch": 0.46436422166015173, + "grad_norm": 0.84765625, + "learning_rate": 2.789220258949769e-05, + "loss": 0.8165, + "step": 6311 + }, + { + "epoch": 0.46443780179351574, + "grad_norm": 0.734375, + "learning_rate": 2.7886449804337867e-05, + "loss": 0.8139, + "step": 6312 + }, + { + "epoch": 0.46451138192687974, + "grad_norm": 0.8125, + "learning_rate": 2.788069686426774e-05, + "loss": 1.2423, + "step": 6313 + }, + { + "epoch": 0.46458496206024374, + "grad_norm": 0.66796875, + "learning_rate": 2.787494376959605e-05, + "loss": 0.674, + "step": 6314 + }, + { + "epoch": 0.46465854219360775, + "grad_norm": 0.8828125, + "learning_rate": 2.786919052063156e-05, + "loss": 1.146, + "step": 6315 + }, + { + "epoch": 0.4647321223269717, + "grad_norm": 0.8671875, + "learning_rate": 2.7863437117683034e-05, + "loss": 1.2663, + "step": 6316 + }, + { + "epoch": 0.4648057024603357, + "grad_norm": 0.75390625, + "learning_rate": 2.7857683561059245e-05, + "loss": 0.5894, + "step": 6317 + }, + { + "epoch": 0.4648792825936997, + "grad_norm": 1.0546875, + "learning_rate": 2.785192985106898e-05, + "loss": 1.1378, + "step": 6318 + }, + { + "epoch": 0.4649528627270637, + "grad_norm": 1.0, + "learning_rate": 2.784617598802102e-05, + "loss": 0.7876, + "step": 6319 + }, + { + "epoch": 0.46502644286042766, + "grad_norm": 0.83984375, + "learning_rate": 2.784042197222418e-05, + "loss": 0.8514, + "step": 6320 + }, + { + "epoch": 0.46510002299379166, + "grad_norm": 1.0703125, + "learning_rate": 2.7834667803987247e-05, + "loss": 1.0082, + "step": 6321 + }, + { + "epoch": 0.46517360312715567, + "grad_norm": 0.94140625, + "learning_rate": 2.782891348361905e-05, + "loss": 1.0139, + "step": 6322 + }, + { + "epoch": 0.46524718326051967, + "grad_norm": 0.8359375, + "learning_rate": 2.7823159011428412e-05, + "loss": 0.634, + "step": 6323 + }, + { + "epoch": 0.4653207633938837, + "grad_norm": 0.734375, + "learning_rate": 2.7817404387724154e-05, + "loss": 0.6427, + "step": 6324 + }, + { + "epoch": 0.4653943435272476, + "grad_norm": 0.76171875, + "learning_rate": 2.7811649612815138e-05, + "loss": 0.7685, + "step": 6325 + }, + { + "epoch": 0.4654679236606116, + "grad_norm": 1.03125, + "learning_rate": 2.7805894687010186e-05, + "loss": 0.8788, + "step": 6326 + }, + { + "epoch": 0.46554150379397563, + "grad_norm": 0.6171875, + "learning_rate": 2.7800139610618176e-05, + "loss": 0.6293, + "step": 6327 + }, + { + "epoch": 0.46561508392733963, + "grad_norm": 0.66796875, + "learning_rate": 2.7794384383947945e-05, + "loss": 0.7029, + "step": 6328 + }, + { + "epoch": 0.4656886640607036, + "grad_norm": 0.828125, + "learning_rate": 2.7788629007308396e-05, + "loss": 1.1446, + "step": 6329 + }, + { + "epoch": 0.4657622441940676, + "grad_norm": 0.68359375, + "learning_rate": 2.77828734810084e-05, + "loss": 0.9256, + "step": 6330 + }, + { + "epoch": 0.4658358243274316, + "grad_norm": 1.0078125, + "learning_rate": 2.7777117805356834e-05, + "loss": 1.0913, + "step": 6331 + }, + { + "epoch": 0.4659094044607956, + "grad_norm": 1.046875, + "learning_rate": 2.7771361980662596e-05, + "loss": 1.1589, + "step": 6332 + }, + { + "epoch": 0.4659829845941596, + "grad_norm": 0.765625, + "learning_rate": 2.7765606007234597e-05, + "loss": 0.7919, + "step": 6333 + }, + { + "epoch": 0.46605656472752355, + "grad_norm": 0.94140625, + "learning_rate": 2.775984988538175e-05, + "loss": 0.9905, + "step": 6334 + }, + { + "epoch": 0.46613014486088755, + "grad_norm": 0.87109375, + "learning_rate": 2.7754093615412963e-05, + "loss": 0.6927, + "step": 6335 + }, + { + "epoch": 0.46620372499425156, + "grad_norm": 0.7890625, + "learning_rate": 2.7748337197637186e-05, + "loss": 0.8133, + "step": 6336 + }, + { + "epoch": 0.46627730512761556, + "grad_norm": 0.76171875, + "learning_rate": 2.7742580632363336e-05, + "loss": 0.9428, + "step": 6337 + }, + { + "epoch": 0.4663508852609795, + "grad_norm": 0.98828125, + "learning_rate": 2.7736823919900367e-05, + "loss": 0.9724, + "step": 6338 + }, + { + "epoch": 0.4664244653943435, + "grad_norm": 0.984375, + "learning_rate": 2.7731067060557224e-05, + "loss": 0.9285, + "step": 6339 + }, + { + "epoch": 0.4664980455277075, + "grad_norm": 0.859375, + "learning_rate": 2.7725310054642866e-05, + "loss": 1.1349, + "step": 6340 + }, + { + "epoch": 0.4665716256610715, + "grad_norm": 0.76171875, + "learning_rate": 2.7719552902466273e-05, + "loss": 0.9095, + "step": 6341 + }, + { + "epoch": 0.4666452057944355, + "grad_norm": 1.0, + "learning_rate": 2.7713795604336407e-05, + "loss": 1.5698, + "step": 6342 + }, + { + "epoch": 0.4667187859277995, + "grad_norm": 0.83984375, + "learning_rate": 2.770803816056226e-05, + "loss": 0.9831, + "step": 6343 + }, + { + "epoch": 0.4667923660611635, + "grad_norm": 0.99609375, + "learning_rate": 2.7702280571452817e-05, + "loss": 0.7587, + "step": 6344 + }, + { + "epoch": 0.4668659461945275, + "grad_norm": 0.87890625, + "learning_rate": 2.7696522837317085e-05, + "loss": 0.9243, + "step": 6345 + }, + { + "epoch": 0.4669395263278915, + "grad_norm": 1.015625, + "learning_rate": 2.769076495846406e-05, + "loss": 0.9187, + "step": 6346 + }, + { + "epoch": 0.46701310646125543, + "grad_norm": 0.8828125, + "learning_rate": 2.7685006935202768e-05, + "loss": 0.8834, + "step": 6347 + }, + { + "epoch": 0.46708668659461944, + "grad_norm": 0.83203125, + "learning_rate": 2.767924876784221e-05, + "loss": 0.8859, + "step": 6348 + }, + { + "epoch": 0.46716026672798344, + "grad_norm": 0.84375, + "learning_rate": 2.7673490456691442e-05, + "loss": 1.0394, + "step": 6349 + }, + { + "epoch": 0.46723384686134745, + "grad_norm": 0.8671875, + "learning_rate": 2.7667732002059493e-05, + "loss": 0.8982, + "step": 6350 + }, + { + "epoch": 0.46730742699471145, + "grad_norm": 0.8359375, + "learning_rate": 2.76619734042554e-05, + "loss": 0.9029, + "step": 6351 + }, + { + "epoch": 0.4673810071280754, + "grad_norm": 0.9296875, + "learning_rate": 2.7656214663588226e-05, + "loss": 0.7857, + "step": 6352 + }, + { + "epoch": 0.4674545872614394, + "grad_norm": 0.94921875, + "learning_rate": 2.765045578036703e-05, + "loss": 1.067, + "step": 6353 + }, + { + "epoch": 0.4675281673948034, + "grad_norm": 0.828125, + "learning_rate": 2.7644696754900878e-05, + "loss": 0.9241, + "step": 6354 + }, + { + "epoch": 0.4676017475281674, + "grad_norm": 0.765625, + "learning_rate": 2.7638937587498835e-05, + "loss": 0.723, + "step": 6355 + }, + { + "epoch": 0.46767532766153136, + "grad_norm": 0.8125, + "learning_rate": 2.763317827847e-05, + "loss": 1.4426, + "step": 6356 + }, + { + "epoch": 0.46774890779489536, + "grad_norm": 0.8515625, + "learning_rate": 2.762741882812347e-05, + "loss": 1.2153, + "step": 6357 + }, + { + "epoch": 0.46782248792825937, + "grad_norm": 0.8046875, + "learning_rate": 2.7621659236768326e-05, + "loss": 0.7467, + "step": 6358 + }, + { + "epoch": 0.46789606806162337, + "grad_norm": 1.0, + "learning_rate": 2.7615899504713687e-05, + "loss": 1.0205, + "step": 6359 + }, + { + "epoch": 0.4679696481949874, + "grad_norm": 0.70703125, + "learning_rate": 2.761013963226866e-05, + "loss": 0.689, + "step": 6360 + }, + { + "epoch": 0.4680432283283513, + "grad_norm": 0.79296875, + "learning_rate": 2.7604379619742378e-05, + "loss": 0.7143, + "step": 6361 + }, + { + "epoch": 0.46811680846171533, + "grad_norm": 0.7109375, + "learning_rate": 2.7598619467443943e-05, + "loss": 0.7523, + "step": 6362 + }, + { + "epoch": 0.46819038859507933, + "grad_norm": 0.796875, + "learning_rate": 2.7592859175682517e-05, + "loss": 0.6483, + "step": 6363 + }, + { + "epoch": 0.46826396872844334, + "grad_norm": 0.6875, + "learning_rate": 2.7587098744767238e-05, + "loss": 0.57, + "step": 6364 + }, + { + "epoch": 0.4683375488618073, + "grad_norm": 0.91015625, + "learning_rate": 2.7581338175007253e-05, + "loss": 0.8446, + "step": 6365 + }, + { + "epoch": 0.4684111289951713, + "grad_norm": 0.95703125, + "learning_rate": 2.7575577466711733e-05, + "loss": 1.0267, + "step": 6366 + }, + { + "epoch": 0.4684847091285353, + "grad_norm": 1.078125, + "learning_rate": 2.7569816620189825e-05, + "loss": 0.8569, + "step": 6367 + }, + { + "epoch": 0.4685582892618993, + "grad_norm": 0.8359375, + "learning_rate": 2.7564055635750712e-05, + "loss": 0.9914, + "step": 6368 + }, + { + "epoch": 0.4686318693952633, + "grad_norm": 0.734375, + "learning_rate": 2.7558294513703575e-05, + "loss": 0.8706, + "step": 6369 + }, + { + "epoch": 0.46870544952862725, + "grad_norm": 0.828125, + "learning_rate": 2.7552533254357615e-05, + "loss": 1.0821, + "step": 6370 + }, + { + "epoch": 0.46877902966199125, + "grad_norm": 0.765625, + "learning_rate": 2.7546771858022006e-05, + "loss": 0.7812, + "step": 6371 + }, + { + "epoch": 0.46885260979535526, + "grad_norm": 0.67578125, + "learning_rate": 2.7541010325005963e-05, + "loss": 0.8116, + "step": 6372 + }, + { + "epoch": 0.46892618992871926, + "grad_norm": 0.9765625, + "learning_rate": 2.7535248655618702e-05, + "loss": 0.9588, + "step": 6373 + }, + { + "epoch": 0.4689997700620832, + "grad_norm": 0.84375, + "learning_rate": 2.7529486850169422e-05, + "loss": 0.974, + "step": 6374 + }, + { + "epoch": 0.4690733501954472, + "grad_norm": 0.98828125, + "learning_rate": 2.7523724908967367e-05, + "loss": 1.1405, + "step": 6375 + }, + { + "epoch": 0.4691469303288112, + "grad_norm": 0.9140625, + "learning_rate": 2.7517962832321763e-05, + "loss": 1.1151, + "step": 6376 + }, + { + "epoch": 0.4692205104621752, + "grad_norm": 1.0078125, + "learning_rate": 2.751220062054185e-05, + "loss": 0.965, + "step": 6377 + }, + { + "epoch": 0.4692940905955392, + "grad_norm": 0.8828125, + "learning_rate": 2.7506438273936874e-05, + "loss": 0.6209, + "step": 6378 + }, + { + "epoch": 0.4693676707289032, + "grad_norm": 0.96484375, + "learning_rate": 2.7500675792816094e-05, + "loss": 1.389, + "step": 6379 + }, + { + "epoch": 0.4694412508622672, + "grad_norm": 1.0390625, + "learning_rate": 2.7494913177488756e-05, + "loss": 1.2503, + "step": 6380 + }, + { + "epoch": 0.4695148309956312, + "grad_norm": 0.85546875, + "learning_rate": 2.748915042826415e-05, + "loss": 1.0505, + "step": 6381 + }, + { + "epoch": 0.4695884111289952, + "grad_norm": 0.796875, + "learning_rate": 2.7483387545451535e-05, + "loss": 0.9022, + "step": 6382 + }, + { + "epoch": 0.46966199126235914, + "grad_norm": 0.83203125, + "learning_rate": 2.74776245293602e-05, + "loss": 0.6629, + "step": 6383 + }, + { + "epoch": 0.46973557139572314, + "grad_norm": 0.6953125, + "learning_rate": 2.7471861380299447e-05, + "loss": 0.7861, + "step": 6384 + }, + { + "epoch": 0.46980915152908714, + "grad_norm": 0.9296875, + "learning_rate": 2.7466098098578557e-05, + "loss": 1.0548, + "step": 6385 + }, + { + "epoch": 0.46988273166245115, + "grad_norm": 0.9140625, + "learning_rate": 2.746033468450684e-05, + "loss": 0.7586, + "step": 6386 + }, + { + "epoch": 0.46995631179581515, + "grad_norm": 0.86328125, + "learning_rate": 2.7454571138393603e-05, + "loss": 1.3519, + "step": 6387 + }, + { + "epoch": 0.4700298919291791, + "grad_norm": 0.94921875, + "learning_rate": 2.7448807460548174e-05, + "loss": 0.9955, + "step": 6388 + }, + { + "epoch": 0.4701034720625431, + "grad_norm": 0.85546875, + "learning_rate": 2.744304365127987e-05, + "loss": 1.0774, + "step": 6389 + }, + { + "epoch": 0.4701770521959071, + "grad_norm": 0.84375, + "learning_rate": 2.7437279710898027e-05, + "loss": 0.6787, + "step": 6390 + }, + { + "epoch": 0.4702506323292711, + "grad_norm": 0.8359375, + "learning_rate": 2.7431515639711992e-05, + "loss": 1.358, + "step": 6391 + }, + { + "epoch": 0.4703242124626351, + "grad_norm": 0.828125, + "learning_rate": 2.7425751438031098e-05, + "loss": 1.093, + "step": 6392 + }, + { + "epoch": 0.47039779259599906, + "grad_norm": 0.7109375, + "learning_rate": 2.7419987106164714e-05, + "loss": 0.5847, + "step": 6393 + }, + { + "epoch": 0.47047137272936307, + "grad_norm": 0.81640625, + "learning_rate": 2.741422264442218e-05, + "loss": 0.9338, + "step": 6394 + }, + { + "epoch": 0.4705449528627271, + "grad_norm": 1.0, + "learning_rate": 2.740845805311289e-05, + "loss": 1.2731, + "step": 6395 + }, + { + "epoch": 0.4706185329960911, + "grad_norm": 0.8828125, + "learning_rate": 2.7402693332546198e-05, + "loss": 1.114, + "step": 6396 + }, + { + "epoch": 0.470692113129455, + "grad_norm": 0.98828125, + "learning_rate": 2.739692848303149e-05, + "loss": 1.4891, + "step": 6397 + }, + { + "epoch": 0.47076569326281903, + "grad_norm": 1.0078125, + "learning_rate": 2.7391163504878164e-05, + "loss": 1.2744, + "step": 6398 + }, + { + "epoch": 0.47083927339618303, + "grad_norm": 0.984375, + "learning_rate": 2.7385398398395606e-05, + "loss": 0.7885, + "step": 6399 + }, + { + "epoch": 0.47091285352954704, + "grad_norm": 0.73046875, + "learning_rate": 2.7379633163893226e-05, + "loss": 0.6159, + "step": 6400 + }, + { + "epoch": 0.47098643366291104, + "grad_norm": 0.90234375, + "learning_rate": 2.737386780168042e-05, + "loss": 1.0149, + "step": 6401 + }, + { + "epoch": 0.471060013796275, + "grad_norm": 0.81640625, + "learning_rate": 2.7368102312066618e-05, + "loss": 0.9604, + "step": 6402 + }, + { + "epoch": 0.471133593929639, + "grad_norm": 0.734375, + "learning_rate": 2.7362336695361235e-05, + "loss": 0.6421, + "step": 6403 + }, + { + "epoch": 0.471207174063003, + "grad_norm": 1.0234375, + "learning_rate": 2.735657095187371e-05, + "loss": 1.1529, + "step": 6404 + }, + { + "epoch": 0.471280754196367, + "grad_norm": 0.79296875, + "learning_rate": 2.7350805081913466e-05, + "loss": 0.7504, + "step": 6405 + }, + { + "epoch": 0.47135433432973095, + "grad_norm": 0.9140625, + "learning_rate": 2.7345039085789953e-05, + "loss": 0.8606, + "step": 6406 + }, + { + "epoch": 0.47142791446309495, + "grad_norm": 0.8984375, + "learning_rate": 2.733927296381263e-05, + "loss": 0.7918, + "step": 6407 + }, + { + "epoch": 0.47150149459645896, + "grad_norm": 0.73828125, + "learning_rate": 2.7333506716290937e-05, + "loss": 0.9462, + "step": 6408 + }, + { + "epoch": 0.47157507472982296, + "grad_norm": 0.94921875, + "learning_rate": 2.7327740343534346e-05, + "loss": 1.1555, + "step": 6409 + }, + { + "epoch": 0.47164865486318697, + "grad_norm": 0.77734375, + "learning_rate": 2.732197384585233e-05, + "loss": 0.964, + "step": 6410 + }, + { + "epoch": 0.4717222349965509, + "grad_norm": 0.82421875, + "learning_rate": 2.7316207223554364e-05, + "loss": 0.6545, + "step": 6411 + }, + { + "epoch": 0.4717958151299149, + "grad_norm": 0.984375, + "learning_rate": 2.731044047694993e-05, + "loss": 1.1812, + "step": 6412 + }, + { + "epoch": 0.4718693952632789, + "grad_norm": 1.015625, + "learning_rate": 2.7304673606348514e-05, + "loss": 1.0098, + "step": 6413 + }, + { + "epoch": 0.4719429753966429, + "grad_norm": 0.828125, + "learning_rate": 2.7298906612059628e-05, + "loss": 0.5711, + "step": 6414 + }, + { + "epoch": 0.4720165555300069, + "grad_norm": 0.8359375, + "learning_rate": 2.729313949439276e-05, + "loss": 0.9497, + "step": 6415 + }, + { + "epoch": 0.4720901356633709, + "grad_norm": 0.83984375, + "learning_rate": 2.7287372253657422e-05, + "loss": 0.9081, + "step": 6416 + }, + { + "epoch": 0.4721637157967349, + "grad_norm": 0.875, + "learning_rate": 2.7281604890163144e-05, + "loss": 0.9944, + "step": 6417 + }, + { + "epoch": 0.4722372959300989, + "grad_norm": 0.9375, + "learning_rate": 2.727583740421944e-05, + "loss": 1.0096, + "step": 6418 + }, + { + "epoch": 0.4723108760634629, + "grad_norm": 0.84765625, + "learning_rate": 2.727006979613584e-05, + "loss": 1.1038, + "step": 6419 + }, + { + "epoch": 0.47238445619682684, + "grad_norm": 0.84765625, + "learning_rate": 2.726430206622188e-05, + "loss": 0.9111, + "step": 6420 + }, + { + "epoch": 0.47245803633019084, + "grad_norm": 0.71875, + "learning_rate": 2.7258534214787108e-05, + "loss": 0.6363, + "step": 6421 + }, + { + "epoch": 0.47253161646355485, + "grad_norm": 0.77734375, + "learning_rate": 2.7252766242141066e-05, + "loss": 0.745, + "step": 6422 + }, + { + "epoch": 0.47260519659691885, + "grad_norm": 0.96484375, + "learning_rate": 2.7246998148593313e-05, + "loss": 0.906, + "step": 6423 + }, + { + "epoch": 0.4726787767302828, + "grad_norm": 0.8359375, + "learning_rate": 2.7241229934453417e-05, + "loss": 0.6845, + "step": 6424 + }, + { + "epoch": 0.4727523568636468, + "grad_norm": 0.9921875, + "learning_rate": 2.7235461600030947e-05, + "loss": 0.7243, + "step": 6425 + }, + { + "epoch": 0.4728259369970108, + "grad_norm": 0.7109375, + "learning_rate": 2.7229693145635472e-05, + "loss": 0.8047, + "step": 6426 + }, + { + "epoch": 0.4728995171303748, + "grad_norm": 1.1015625, + "learning_rate": 2.7223924571576577e-05, + "loss": 1.1103, + "step": 6427 + }, + { + "epoch": 0.4729730972637388, + "grad_norm": 0.7578125, + "learning_rate": 2.7218155878163847e-05, + "loss": 0.7126, + "step": 6428 + }, + { + "epoch": 0.47304667739710277, + "grad_norm": 0.8828125, + "learning_rate": 2.7212387065706885e-05, + "loss": 0.7595, + "step": 6429 + }, + { + "epoch": 0.47312025753046677, + "grad_norm": 1.0078125, + "learning_rate": 2.7206618134515284e-05, + "loss": 1.0391, + "step": 6430 + }, + { + "epoch": 0.4731938376638308, + "grad_norm": 0.7734375, + "learning_rate": 2.7200849084898654e-05, + "loss": 1.1466, + "step": 6431 + }, + { + "epoch": 0.4732674177971948, + "grad_norm": 0.71875, + "learning_rate": 2.719507991716662e-05, + "loss": 0.9107, + "step": 6432 + }, + { + "epoch": 0.4733409979305587, + "grad_norm": 0.85546875, + "learning_rate": 2.718931063162879e-05, + "loss": 0.9272, + "step": 6433 + }, + { + "epoch": 0.47341457806392273, + "grad_norm": 0.80859375, + "learning_rate": 2.7183541228594796e-05, + "loss": 0.8903, + "step": 6434 + }, + { + "epoch": 0.47348815819728673, + "grad_norm": 0.97265625, + "learning_rate": 2.7177771708374257e-05, + "loss": 1.2076, + "step": 6435 + }, + { + "epoch": 0.47356173833065074, + "grad_norm": 0.8203125, + "learning_rate": 2.7172002071276832e-05, + "loss": 0.684, + "step": 6436 + }, + { + "epoch": 0.47363531846401474, + "grad_norm": 0.765625, + "learning_rate": 2.716623231761215e-05, + "loss": 1.1711, + "step": 6437 + }, + { + "epoch": 0.4737088985973787, + "grad_norm": 0.80078125, + "learning_rate": 2.7160462447689877e-05, + "loss": 0.6916, + "step": 6438 + }, + { + "epoch": 0.4737824787307427, + "grad_norm": 0.91796875, + "learning_rate": 2.715469246181967e-05, + "loss": 0.7315, + "step": 6439 + }, + { + "epoch": 0.4738560588641067, + "grad_norm": 0.8046875, + "learning_rate": 2.714892236031118e-05, + "loss": 1.0714, + "step": 6440 + }, + { + "epoch": 0.4739296389974707, + "grad_norm": 0.75390625, + "learning_rate": 2.7143152143474093e-05, + "loss": 0.6242, + "step": 6441 + }, + { + "epoch": 0.47400321913083465, + "grad_norm": 1.0390625, + "learning_rate": 2.7137381811618073e-05, + "loss": 1.0698, + "step": 6442 + }, + { + "epoch": 0.47407679926419866, + "grad_norm": 0.75, + "learning_rate": 2.7131611365052806e-05, + "loss": 0.6787, + "step": 6443 + }, + { + "epoch": 0.47415037939756266, + "grad_norm": 0.7578125, + "learning_rate": 2.7125840804087987e-05, + "loss": 0.756, + "step": 6444 + }, + { + "epoch": 0.47422395953092666, + "grad_norm": 0.93359375, + "learning_rate": 2.7120070129033302e-05, + "loss": 0.7813, + "step": 6445 + }, + { + "epoch": 0.47429753966429067, + "grad_norm": 1.0546875, + "learning_rate": 2.7114299340198467e-05, + "loss": 1.0591, + "step": 6446 + }, + { + "epoch": 0.4743711197976546, + "grad_norm": 0.81640625, + "learning_rate": 2.710852843789317e-05, + "loss": 0.7675, + "step": 6447 + }, + { + "epoch": 0.4744446999310186, + "grad_norm": 0.78125, + "learning_rate": 2.7102757422427145e-05, + "loss": 0.7435, + "step": 6448 + }, + { + "epoch": 0.4745182800643826, + "grad_norm": 0.71875, + "learning_rate": 2.7096986294110094e-05, + "loss": 0.8249, + "step": 6449 + }, + { + "epoch": 0.47459186019774663, + "grad_norm": 0.95703125, + "learning_rate": 2.7091215053251745e-05, + "loss": 0.8246, + "step": 6450 + }, + { + "epoch": 0.4746654403311106, + "grad_norm": 0.7890625, + "learning_rate": 2.708544370016184e-05, + "loss": 0.9574, + "step": 6451 + }, + { + "epoch": 0.4747390204644746, + "grad_norm": 0.7265625, + "learning_rate": 2.707967223515011e-05, + "loss": 0.989, + "step": 6452 + }, + { + "epoch": 0.4748126005978386, + "grad_norm": 0.76953125, + "learning_rate": 2.7073900658526295e-05, + "loss": 1.0891, + "step": 6453 + }, + { + "epoch": 0.4748861807312026, + "grad_norm": 0.8515625, + "learning_rate": 2.7068128970600152e-05, + "loss": 1.2637, + "step": 6454 + }, + { + "epoch": 0.4749597608645666, + "grad_norm": 0.69921875, + "learning_rate": 2.7062357171681435e-05, + "loss": 0.8857, + "step": 6455 + }, + { + "epoch": 0.47503334099793054, + "grad_norm": 0.96484375, + "learning_rate": 2.7056585262079902e-05, + "loss": 1.1733, + "step": 6456 + }, + { + "epoch": 0.47510692113129455, + "grad_norm": 1.03125, + "learning_rate": 2.705081324210532e-05, + "loss": 1.2251, + "step": 6457 + }, + { + "epoch": 0.47518050126465855, + "grad_norm": 1.1328125, + "learning_rate": 2.7045041112067465e-05, + "loss": 1.0351, + "step": 6458 + }, + { + "epoch": 0.47525408139802255, + "grad_norm": 0.80859375, + "learning_rate": 2.7039268872276124e-05, + "loss": 0.6846, + "step": 6459 + }, + { + "epoch": 0.4753276615313865, + "grad_norm": 0.8203125, + "learning_rate": 2.703349652304107e-05, + "loss": 0.6974, + "step": 6460 + }, + { + "epoch": 0.4754012416647505, + "grad_norm": 0.8046875, + "learning_rate": 2.7027724064672088e-05, + "loss": 0.8048, + "step": 6461 + }, + { + "epoch": 0.4754748217981145, + "grad_norm": 0.73828125, + "learning_rate": 2.7021951497479003e-05, + "loss": 0.6476, + "step": 6462 + }, + { + "epoch": 0.4755484019314785, + "grad_norm": 0.78125, + "learning_rate": 2.7016178821771586e-05, + "loss": 1.1329, + "step": 6463 + }, + { + "epoch": 0.4756219820648425, + "grad_norm": 0.703125, + "learning_rate": 2.7010406037859663e-05, + "loss": 0.6575, + "step": 6464 + }, + { + "epoch": 0.47569556219820647, + "grad_norm": 0.859375, + "learning_rate": 2.7004633146053043e-05, + "loss": 0.8041, + "step": 6465 + }, + { + "epoch": 0.47576914233157047, + "grad_norm": 0.87890625, + "learning_rate": 2.699886014666156e-05, + "loss": 1.2123, + "step": 6466 + }, + { + "epoch": 0.4758427224649345, + "grad_norm": 0.77734375, + "learning_rate": 2.6993087039995017e-05, + "loss": 0.9338, + "step": 6467 + }, + { + "epoch": 0.4759163025982985, + "grad_norm": 1.109375, + "learning_rate": 2.6987313826363264e-05, + "loss": 1.7897, + "step": 6468 + }, + { + "epoch": 0.47598988273166243, + "grad_norm": 1.0, + "learning_rate": 2.6981540506076126e-05, + "loss": 1.1717, + "step": 6469 + }, + { + "epoch": 0.47606346286502643, + "grad_norm": 0.88671875, + "learning_rate": 2.6975767079443454e-05, + "loss": 1.0878, + "step": 6470 + }, + { + "epoch": 0.47613704299839044, + "grad_norm": 0.80078125, + "learning_rate": 2.696999354677509e-05, + "loss": 0.8289, + "step": 6471 + }, + { + "epoch": 0.47621062313175444, + "grad_norm": 0.8984375, + "learning_rate": 2.6964219908380895e-05, + "loss": 0.9862, + "step": 6472 + }, + { + "epoch": 0.47628420326511844, + "grad_norm": 0.70703125, + "learning_rate": 2.6958446164570734e-05, + "loss": 0.7615, + "step": 6473 + }, + { + "epoch": 0.4763577833984824, + "grad_norm": 1.2421875, + "learning_rate": 2.6952672315654458e-05, + "loss": 1.434, + "step": 6474 + }, + { + "epoch": 0.4764313635318464, + "grad_norm": 1.0703125, + "learning_rate": 2.694689836194195e-05, + "loss": 1.4117, + "step": 6475 + }, + { + "epoch": 0.4765049436652104, + "grad_norm": 0.80859375, + "learning_rate": 2.694112430374309e-05, + "loss": 0.7745, + "step": 6476 + }, + { + "epoch": 0.4765785237985744, + "grad_norm": 0.83203125, + "learning_rate": 2.6935350141367742e-05, + "loss": 0.9303, + "step": 6477 + }, + { + "epoch": 0.47665210393193835, + "grad_norm": 0.88671875, + "learning_rate": 2.6929575875125817e-05, + "loss": 0.9707, + "step": 6478 + }, + { + "epoch": 0.47672568406530236, + "grad_norm": 0.8984375, + "learning_rate": 2.6923801505327196e-05, + "loss": 0.8115, + "step": 6479 + }, + { + "epoch": 0.47679926419866636, + "grad_norm": 0.72265625, + "learning_rate": 2.6918027032281784e-05, + "loss": 0.6961, + "step": 6480 + }, + { + "epoch": 0.47687284433203037, + "grad_norm": 0.87109375, + "learning_rate": 2.691225245629948e-05, + "loss": 0.7492, + "step": 6481 + }, + { + "epoch": 0.47694642446539437, + "grad_norm": 0.90625, + "learning_rate": 2.690647777769021e-05, + "loss": 1.1451, + "step": 6482 + }, + { + "epoch": 0.4770200045987583, + "grad_norm": 0.984375, + "learning_rate": 2.6900702996763866e-05, + "loss": 0.9732, + "step": 6483 + }, + { + "epoch": 0.4770935847321223, + "grad_norm": 1.125, + "learning_rate": 2.689492811383038e-05, + "loss": 1.2119, + "step": 6484 + }, + { + "epoch": 0.4771671648654863, + "grad_norm": 0.7109375, + "learning_rate": 2.688915312919969e-05, + "loss": 0.5543, + "step": 6485 + }, + { + "epoch": 0.47724074499885033, + "grad_norm": 0.828125, + "learning_rate": 2.6883378043181713e-05, + "loss": 1.4937, + "step": 6486 + }, + { + "epoch": 0.4773143251322143, + "grad_norm": 1.078125, + "learning_rate": 2.68776028560864e-05, + "loss": 1.0516, + "step": 6487 + }, + { + "epoch": 0.4773879052655783, + "grad_norm": 0.9453125, + "learning_rate": 2.6871827568223684e-05, + "loss": 1.3265, + "step": 6488 + }, + { + "epoch": 0.4774614853989423, + "grad_norm": 0.6875, + "learning_rate": 2.6866052179903522e-05, + "loss": 1.0175, + "step": 6489 + }, + { + "epoch": 0.4775350655323063, + "grad_norm": 0.9375, + "learning_rate": 2.686027669143586e-05, + "loss": 1.1962, + "step": 6490 + }, + { + "epoch": 0.4776086456656703, + "grad_norm": 0.859375, + "learning_rate": 2.6854501103130657e-05, + "loss": 0.9129, + "step": 6491 + }, + { + "epoch": 0.47768222579903424, + "grad_norm": 0.828125, + "learning_rate": 2.6848725415297887e-05, + "loss": 0.8448, + "step": 6492 + }, + { + "epoch": 0.47775580593239825, + "grad_norm": 0.94140625, + "learning_rate": 2.6842949628247516e-05, + "loss": 1.237, + "step": 6493 + }, + { + "epoch": 0.47782938606576225, + "grad_norm": 0.72265625, + "learning_rate": 2.6837173742289524e-05, + "loss": 0.9473, + "step": 6494 + }, + { + "epoch": 0.47790296619912626, + "grad_norm": 0.765625, + "learning_rate": 2.683139775773388e-05, + "loss": 0.8712, + "step": 6495 + }, + { + "epoch": 0.4779765463324902, + "grad_norm": 0.9140625, + "learning_rate": 2.6825621674890584e-05, + "loss": 0.7254, + "step": 6496 + }, + { + "epoch": 0.4780501264658542, + "grad_norm": 0.984375, + "learning_rate": 2.681984549406962e-05, + "loss": 1.2051, + "step": 6497 + }, + { + "epoch": 0.4781237065992182, + "grad_norm": 0.62890625, + "learning_rate": 2.6814069215580978e-05, + "loss": 0.8268, + "step": 6498 + }, + { + "epoch": 0.4781972867325822, + "grad_norm": 0.89453125, + "learning_rate": 2.680829283973467e-05, + "loss": 0.9537, + "step": 6499 + }, + { + "epoch": 0.4782708668659462, + "grad_norm": 0.859375, + "learning_rate": 2.6802516366840706e-05, + "loss": 0.7822, + "step": 6500 + }, + { + "epoch": 0.47834444699931017, + "grad_norm": 0.73046875, + "learning_rate": 2.6796739797209087e-05, + "loss": 0.791, + "step": 6501 + }, + { + "epoch": 0.4784180271326742, + "grad_norm": 0.828125, + "learning_rate": 2.679096313114984e-05, + "loss": 1.1303, + "step": 6502 + }, + { + "epoch": 0.4784916072660382, + "grad_norm": 0.9140625, + "learning_rate": 2.6785186368972987e-05, + "loss": 1.2051, + "step": 6503 + }, + { + "epoch": 0.4785651873994022, + "grad_norm": 0.71484375, + "learning_rate": 2.677940951098855e-05, + "loss": 1.0447, + "step": 6504 + }, + { + "epoch": 0.47863876753276613, + "grad_norm": 0.9140625, + "learning_rate": 2.677363255750656e-05, + "loss": 0.9259, + "step": 6505 + }, + { + "epoch": 0.47871234766613013, + "grad_norm": 0.9765625, + "learning_rate": 2.676785550883707e-05, + "loss": 0.8799, + "step": 6506 + }, + { + "epoch": 0.47878592779949414, + "grad_norm": 0.83203125, + "learning_rate": 2.676207836529011e-05, + "loss": 1.212, + "step": 6507 + }, + { + "epoch": 0.47885950793285814, + "grad_norm": 0.796875, + "learning_rate": 2.6756301127175732e-05, + "loss": 0.8355, + "step": 6508 + }, + { + "epoch": 0.47893308806622215, + "grad_norm": 0.6875, + "learning_rate": 2.6750523794803988e-05, + "loss": 0.687, + "step": 6509 + }, + { + "epoch": 0.4790066681995861, + "grad_norm": 0.81640625, + "learning_rate": 2.674474636848494e-05, + "loss": 0.7383, + "step": 6510 + }, + { + "epoch": 0.4790802483329501, + "grad_norm": 0.6640625, + "learning_rate": 2.6738968848528647e-05, + "loss": 0.742, + "step": 6511 + }, + { + "epoch": 0.4791538284663141, + "grad_norm": 0.7265625, + "learning_rate": 2.6733191235245185e-05, + "loss": 0.6763, + "step": 6512 + }, + { + "epoch": 0.4792274085996781, + "grad_norm": 0.9609375, + "learning_rate": 2.672741352894462e-05, + "loss": 0.8088, + "step": 6513 + }, + { + "epoch": 0.47930098873304205, + "grad_norm": 0.859375, + "learning_rate": 2.672163572993704e-05, + "loss": 1.2608, + "step": 6514 + }, + { + "epoch": 0.47937456886640606, + "grad_norm": 1.21875, + "learning_rate": 2.6715857838532516e-05, + "loss": 1.5293, + "step": 6515 + }, + { + "epoch": 0.47944814899977006, + "grad_norm": 0.7578125, + "learning_rate": 2.6710079855041142e-05, + "loss": 0.7698, + "step": 6516 + }, + { + "epoch": 0.47952172913313407, + "grad_norm": 0.69140625, + "learning_rate": 2.6704301779773016e-05, + "loss": 0.8382, + "step": 6517 + }, + { + "epoch": 0.47959530926649807, + "grad_norm": 0.9140625, + "learning_rate": 2.6698523613038223e-05, + "loss": 1.1378, + "step": 6518 + }, + { + "epoch": 0.479668889399862, + "grad_norm": 0.9609375, + "learning_rate": 2.6692745355146887e-05, + "loss": 1.0447, + "step": 6519 + }, + { + "epoch": 0.479742469533226, + "grad_norm": 1.078125, + "learning_rate": 2.66869670064091e-05, + "loss": 0.9409, + "step": 6520 + }, + { + "epoch": 0.47981604966659, + "grad_norm": 0.73828125, + "learning_rate": 2.668118856713498e-05, + "loss": 0.7465, + "step": 6521 + }, + { + "epoch": 0.47988962979995403, + "grad_norm": 0.6328125, + "learning_rate": 2.6675410037634646e-05, + "loss": 0.6655, + "step": 6522 + }, + { + "epoch": 0.479963209933318, + "grad_norm": 0.72265625, + "learning_rate": 2.6669631418218223e-05, + "loss": 0.7218, + "step": 6523 + }, + { + "epoch": 0.480036790066682, + "grad_norm": 0.80859375, + "learning_rate": 2.666385270919583e-05, + "loss": 0.81, + "step": 6524 + }, + { + "epoch": 0.480110370200046, + "grad_norm": 0.80078125, + "learning_rate": 2.6658073910877603e-05, + "loss": 0.8634, + "step": 6525 + }, + { + "epoch": 0.48018395033341, + "grad_norm": 0.7890625, + "learning_rate": 2.665229502357368e-05, + "loss": 0.8237, + "step": 6526 + }, + { + "epoch": 0.480257530466774, + "grad_norm": 0.86328125, + "learning_rate": 2.6646516047594206e-05, + "loss": 0.9938, + "step": 6527 + }, + { + "epoch": 0.48033111060013794, + "grad_norm": 0.80078125, + "learning_rate": 2.6640736983249332e-05, + "loss": 0.7124, + "step": 6528 + }, + { + "epoch": 0.48040469073350195, + "grad_norm": 0.8203125, + "learning_rate": 2.66349578308492e-05, + "loss": 0.9326, + "step": 6529 + }, + { + "epoch": 0.48047827086686595, + "grad_norm": 0.98828125, + "learning_rate": 2.6629178590703968e-05, + "loss": 1.2848, + "step": 6530 + }, + { + "epoch": 0.48055185100022996, + "grad_norm": 0.96484375, + "learning_rate": 2.6623399263123792e-05, + "loss": 1.0676, + "step": 6531 + }, + { + "epoch": 0.4806254311335939, + "grad_norm": 0.84375, + "learning_rate": 2.6617619848418852e-05, + "loss": 0.6388, + "step": 6532 + }, + { + "epoch": 0.4806990112669579, + "grad_norm": 0.86328125, + "learning_rate": 2.661184034689931e-05, + "loss": 0.8049, + "step": 6533 + }, + { + "epoch": 0.4807725914003219, + "grad_norm": 0.953125, + "learning_rate": 2.6606060758875333e-05, + "loss": 1.0493, + "step": 6534 + }, + { + "epoch": 0.4808461715336859, + "grad_norm": 0.77734375, + "learning_rate": 2.660028108465712e-05, + "loss": 0.8408, + "step": 6535 + }, + { + "epoch": 0.4809197516670499, + "grad_norm": 1.09375, + "learning_rate": 2.6594501324554833e-05, + "loss": 1.2574, + "step": 6536 + }, + { + "epoch": 0.48099333180041387, + "grad_norm": 0.9921875, + "learning_rate": 2.6588721478878682e-05, + "loss": 1.0273, + "step": 6537 + }, + { + "epoch": 0.4810669119337779, + "grad_norm": 0.9765625, + "learning_rate": 2.6582941547938832e-05, + "loss": 0.9924, + "step": 6538 + }, + { + "epoch": 0.4811404920671419, + "grad_norm": 0.9921875, + "learning_rate": 2.6577161532045515e-05, + "loss": 0.9943, + "step": 6539 + }, + { + "epoch": 0.4812140722005059, + "grad_norm": 0.90234375, + "learning_rate": 2.6571381431508913e-05, + "loss": 1.3333, + "step": 6540 + }, + { + "epoch": 0.48128765233386983, + "grad_norm": 0.8203125, + "learning_rate": 2.6565601246639245e-05, + "loss": 0.9713, + "step": 6541 + }, + { + "epoch": 0.48136123246723383, + "grad_norm": 0.77734375, + "learning_rate": 2.6559820977746703e-05, + "loss": 0.7901, + "step": 6542 + }, + { + "epoch": 0.48143481260059784, + "grad_norm": 0.76953125, + "learning_rate": 2.655404062514152e-05, + "loss": 0.9806, + "step": 6543 + }, + { + "epoch": 0.48150839273396184, + "grad_norm": 0.6796875, + "learning_rate": 2.6548260189133904e-05, + "loss": 0.6986, + "step": 6544 + }, + { + "epoch": 0.48158197286732585, + "grad_norm": 1.015625, + "learning_rate": 2.6542479670034098e-05, + "loss": 1.0739, + "step": 6545 + }, + { + "epoch": 0.4816555530006898, + "grad_norm": 0.703125, + "learning_rate": 2.6536699068152322e-05, + "loss": 0.6641, + "step": 6546 + }, + { + "epoch": 0.4817291331340538, + "grad_norm": 0.83984375, + "learning_rate": 2.6530918383798804e-05, + "loss": 1.1925, + "step": 6547 + }, + { + "epoch": 0.4818027132674178, + "grad_norm": 0.82421875, + "learning_rate": 2.6525137617283797e-05, + "loss": 0.7034, + "step": 6548 + }, + { + "epoch": 0.4818762934007818, + "grad_norm": 0.65234375, + "learning_rate": 2.6519356768917524e-05, + "loss": 0.6368, + "step": 6549 + }, + { + "epoch": 0.48194987353414576, + "grad_norm": 0.73046875, + "learning_rate": 2.6513575839010246e-05, + "loss": 0.8171, + "step": 6550 + }, + { + "epoch": 0.48202345366750976, + "grad_norm": 0.8125, + "learning_rate": 2.6507794827872212e-05, + "loss": 0.9612, + "step": 6551 + }, + { + "epoch": 0.48209703380087376, + "grad_norm": 0.8359375, + "learning_rate": 2.6502013735813676e-05, + "loss": 0.8156, + "step": 6552 + }, + { + "epoch": 0.48217061393423777, + "grad_norm": 0.7578125, + "learning_rate": 2.64962325631449e-05, + "loss": 0.9645, + "step": 6553 + }, + { + "epoch": 0.4822441940676018, + "grad_norm": 0.91796875, + "learning_rate": 2.649045131017615e-05, + "loss": 1.0374, + "step": 6554 + }, + { + "epoch": 0.4823177742009657, + "grad_norm": 0.7578125, + "learning_rate": 2.6484669977217696e-05, + "loss": 0.697, + "step": 6555 + }, + { + "epoch": 0.4823913543343297, + "grad_norm": 0.71875, + "learning_rate": 2.6478888564579808e-05, + "loss": 0.7014, + "step": 6556 + }, + { + "epoch": 0.48246493446769373, + "grad_norm": 1.0234375, + "learning_rate": 2.647310707257276e-05, + "loss": 1.2796, + "step": 6557 + }, + { + "epoch": 0.48253851460105773, + "grad_norm": 1.015625, + "learning_rate": 2.6467325501506834e-05, + "loss": 0.9344, + "step": 6558 + }, + { + "epoch": 0.4826120947344217, + "grad_norm": 0.734375, + "learning_rate": 2.646154385169232e-05, + "loss": 0.6295, + "step": 6559 + }, + { + "epoch": 0.4826856748677857, + "grad_norm": 0.85546875, + "learning_rate": 2.6455762123439522e-05, + "loss": 0.8593, + "step": 6560 + }, + { + "epoch": 0.4827592550011497, + "grad_norm": 0.86328125, + "learning_rate": 2.6449980317058708e-05, + "loss": 0.743, + "step": 6561 + }, + { + "epoch": 0.4828328351345137, + "grad_norm": 0.734375, + "learning_rate": 2.6444198432860197e-05, + "loss": 1.0652, + "step": 6562 + }, + { + "epoch": 0.4829064152678777, + "grad_norm": 0.890625, + "learning_rate": 2.6438416471154275e-05, + "loss": 0.6952, + "step": 6563 + }, + { + "epoch": 0.48297999540124165, + "grad_norm": 0.7890625, + "learning_rate": 2.643263443225126e-05, + "loss": 1.2082, + "step": 6564 + }, + { + "epoch": 0.48305357553460565, + "grad_norm": 0.94921875, + "learning_rate": 2.6426852316461465e-05, + "loss": 0.8351, + "step": 6565 + }, + { + "epoch": 0.48312715566796965, + "grad_norm": 0.8359375, + "learning_rate": 2.6421070124095194e-05, + "loss": 0.7706, + "step": 6566 + }, + { + "epoch": 0.48320073580133366, + "grad_norm": 0.76171875, + "learning_rate": 2.6415287855462784e-05, + "loss": 0.6269, + "step": 6567 + }, + { + "epoch": 0.4832743159346976, + "grad_norm": 1.1484375, + "learning_rate": 2.6409505510874538e-05, + "loss": 1.1887, + "step": 6568 + }, + { + "epoch": 0.4833478960680616, + "grad_norm": 0.85546875, + "learning_rate": 2.64037230906408e-05, + "loss": 0.6661, + "step": 6569 + }, + { + "epoch": 0.4834214762014256, + "grad_norm": 0.875, + "learning_rate": 2.639794059507189e-05, + "loss": 0.8258, + "step": 6570 + }, + { + "epoch": 0.4834950563347896, + "grad_norm": 0.9609375, + "learning_rate": 2.6392158024478154e-05, + "loss": 1.0068, + "step": 6571 + }, + { + "epoch": 0.4835686364681536, + "grad_norm": 0.9609375, + "learning_rate": 2.6386375379169918e-05, + "loss": 1.1187, + "step": 6572 + }, + { + "epoch": 0.48364221660151757, + "grad_norm": 0.8359375, + "learning_rate": 2.6380592659457543e-05, + "loss": 0.9549, + "step": 6573 + }, + { + "epoch": 0.4837157967348816, + "grad_norm": 0.828125, + "learning_rate": 2.6374809865651363e-05, + "loss": 1.1353, + "step": 6574 + }, + { + "epoch": 0.4837893768682456, + "grad_norm": 0.94140625, + "learning_rate": 2.6369026998061736e-05, + "loss": 1.303, + "step": 6575 + }, + { + "epoch": 0.4838629570016096, + "grad_norm": 0.93359375, + "learning_rate": 2.6363244056999014e-05, + "loss": 1.2945, + "step": 6576 + }, + { + "epoch": 0.48393653713497353, + "grad_norm": 0.921875, + "learning_rate": 2.635746104277356e-05, + "loss": 1.0926, + "step": 6577 + }, + { + "epoch": 0.48401011726833754, + "grad_norm": 0.875, + "learning_rate": 2.6351677955695732e-05, + "loss": 0.8453, + "step": 6578 + }, + { + "epoch": 0.48408369740170154, + "grad_norm": 0.9765625, + "learning_rate": 2.6345894796075904e-05, + "loss": 0.7784, + "step": 6579 + }, + { + "epoch": 0.48415727753506554, + "grad_norm": 0.8203125, + "learning_rate": 2.6340111564224447e-05, + "loss": 1.159, + "step": 6580 + }, + { + "epoch": 0.48423085766842955, + "grad_norm": 0.8515625, + "learning_rate": 2.6334328260451728e-05, + "loss": 0.9067, + "step": 6581 + }, + { + "epoch": 0.4843044378017935, + "grad_norm": 0.8046875, + "learning_rate": 2.6328544885068136e-05, + "loss": 0.8354, + "step": 6582 + }, + { + "epoch": 0.4843780179351575, + "grad_norm": 0.71875, + "learning_rate": 2.6322761438384048e-05, + "loss": 0.6787, + "step": 6583 + }, + { + "epoch": 0.4844515980685215, + "grad_norm": 0.8359375, + "learning_rate": 2.631697792070985e-05, + "loss": 1.0695, + "step": 6584 + }, + { + "epoch": 0.4845251782018855, + "grad_norm": 0.8359375, + "learning_rate": 2.631119433235593e-05, + "loss": 0.9824, + "step": 6585 + }, + { + "epoch": 0.48459875833524946, + "grad_norm": 1.0, + "learning_rate": 2.630541067363269e-05, + "loss": 1.0727, + "step": 6586 + }, + { + "epoch": 0.48467233846861346, + "grad_norm": 0.859375, + "learning_rate": 2.6299626944850537e-05, + "loss": 0.7621, + "step": 6587 + }, + { + "epoch": 0.48474591860197747, + "grad_norm": 0.9140625, + "learning_rate": 2.6293843146319846e-05, + "loss": 0.7323, + "step": 6588 + }, + { + "epoch": 0.48481949873534147, + "grad_norm": 0.9609375, + "learning_rate": 2.6288059278351047e-05, + "loss": 0.9427, + "step": 6589 + }, + { + "epoch": 0.4848930788687055, + "grad_norm": 0.8671875, + "learning_rate": 2.628227534125453e-05, + "loss": 1.0747, + "step": 6590 + }, + { + "epoch": 0.4849666590020694, + "grad_norm": 0.64453125, + "learning_rate": 2.627649133534072e-05, + "loss": 0.6269, + "step": 6591 + }, + { + "epoch": 0.4850402391354334, + "grad_norm": 0.9609375, + "learning_rate": 2.627070726092003e-05, + "loss": 1.057, + "step": 6592 + }, + { + "epoch": 0.48511381926879743, + "grad_norm": 0.88671875, + "learning_rate": 2.6264923118302882e-05, + "loss": 0.838, + "step": 6593 + }, + { + "epoch": 0.48518739940216143, + "grad_norm": 0.80078125, + "learning_rate": 2.625913890779971e-05, + "loss": 0.9063, + "step": 6594 + }, + { + "epoch": 0.4852609795355254, + "grad_norm": 0.90234375, + "learning_rate": 2.6253354629720918e-05, + "loss": 0.9571, + "step": 6595 + }, + { + "epoch": 0.4853345596688894, + "grad_norm": 0.78515625, + "learning_rate": 2.6247570284376954e-05, + "loss": 0.859, + "step": 6596 + }, + { + "epoch": 0.4854081398022534, + "grad_norm": 0.99609375, + "learning_rate": 2.624178587207825e-05, + "loss": 1.0043, + "step": 6597 + }, + { + "epoch": 0.4854817199356174, + "grad_norm": 0.640625, + "learning_rate": 2.623600139313524e-05, + "loss": 0.5801, + "step": 6598 + }, + { + "epoch": 0.4855553000689814, + "grad_norm": 0.828125, + "learning_rate": 2.623021684785837e-05, + "loss": 0.8346, + "step": 6599 + }, + { + "epoch": 0.48562888020234535, + "grad_norm": 1.5234375, + "learning_rate": 2.6224432236558088e-05, + "loss": 0.663, + "step": 6600 + }, + { + "epoch": 0.48570246033570935, + "grad_norm": 0.7734375, + "learning_rate": 2.6218647559544844e-05, + "loss": 1.0242, + "step": 6601 + }, + { + "epoch": 0.48577604046907336, + "grad_norm": 0.93359375, + "learning_rate": 2.621286281712908e-05, + "loss": 0.8787, + "step": 6602 + }, + { + "epoch": 0.48584962060243736, + "grad_norm": 0.83203125, + "learning_rate": 2.6207078009621266e-05, + "loss": 0.6553, + "step": 6603 + }, + { + "epoch": 0.4859232007358013, + "grad_norm": 0.6484375, + "learning_rate": 2.620129313733185e-05, + "loss": 0.7066, + "step": 6604 + }, + { + "epoch": 0.4859967808691653, + "grad_norm": 0.7890625, + "learning_rate": 2.6195508200571305e-05, + "loss": 0.9585, + "step": 6605 + }, + { + "epoch": 0.4860703610025293, + "grad_norm": 0.9375, + "learning_rate": 2.6189723199650084e-05, + "loss": 1.4974, + "step": 6606 + }, + { + "epoch": 0.4861439411358933, + "grad_norm": 1.171875, + "learning_rate": 2.6183938134878673e-05, + "loss": 1.0687, + "step": 6607 + }, + { + "epoch": 0.4862175212692573, + "grad_norm": 0.66015625, + "learning_rate": 2.6178153006567542e-05, + "loss": 0.5521, + "step": 6608 + }, + { + "epoch": 0.4862911014026213, + "grad_norm": 0.8515625, + "learning_rate": 2.617236781502716e-05, + "loss": 1.0084, + "step": 6609 + }, + { + "epoch": 0.4863646815359853, + "grad_norm": 0.7578125, + "learning_rate": 2.6166582560568016e-05, + "loss": 0.8275, + "step": 6610 + }, + { + "epoch": 0.4864382616693493, + "grad_norm": 0.7265625, + "learning_rate": 2.616079724350058e-05, + "loss": 0.702, + "step": 6611 + }, + { + "epoch": 0.4865118418027133, + "grad_norm": 0.7421875, + "learning_rate": 2.6155011864135355e-05, + "loss": 0.8587, + "step": 6612 + }, + { + "epoch": 0.48658542193607723, + "grad_norm": 0.63671875, + "learning_rate": 2.614922642278282e-05, + "loss": 0.635, + "step": 6613 + }, + { + "epoch": 0.48665900206944124, + "grad_norm": 0.80078125, + "learning_rate": 2.614344091975347e-05, + "loss": 1.03, + "step": 6614 + }, + { + "epoch": 0.48673258220280524, + "grad_norm": 1.0625, + "learning_rate": 2.6137655355357822e-05, + "loss": 1.0933, + "step": 6615 + }, + { + "epoch": 0.48680616233616925, + "grad_norm": 0.85546875, + "learning_rate": 2.6131869729906344e-05, + "loss": 0.9497, + "step": 6616 + }, + { + "epoch": 0.48687974246953325, + "grad_norm": 0.828125, + "learning_rate": 2.6126084043709563e-05, + "loss": 0.6989, + "step": 6617 + }, + { + "epoch": 0.4869533226028972, + "grad_norm": 0.92578125, + "learning_rate": 2.6120298297077978e-05, + "loss": 0.7913, + "step": 6618 + }, + { + "epoch": 0.4870269027362612, + "grad_norm": 0.73828125, + "learning_rate": 2.6114512490322096e-05, + "loss": 0.6741, + "step": 6619 + }, + { + "epoch": 0.4871004828696252, + "grad_norm": 0.9140625, + "learning_rate": 2.6108726623752434e-05, + "loss": 0.7712, + "step": 6620 + }, + { + "epoch": 0.4871740630029892, + "grad_norm": 0.7890625, + "learning_rate": 2.6102940697679514e-05, + "loss": 0.8339, + "step": 6621 + }, + { + "epoch": 0.48724764313635316, + "grad_norm": 0.703125, + "learning_rate": 2.609715471241384e-05, + "loss": 0.8066, + "step": 6622 + }, + { + "epoch": 0.48732122326971716, + "grad_norm": 1.0703125, + "learning_rate": 2.6091368668265954e-05, + "loss": 1.4414, + "step": 6623 + }, + { + "epoch": 0.48739480340308117, + "grad_norm": 0.71875, + "learning_rate": 2.608558256554637e-05, + "loss": 1.1509, + "step": 6624 + }, + { + "epoch": 0.48746838353644517, + "grad_norm": 0.92578125, + "learning_rate": 2.6079796404565622e-05, + "loss": 1.0638, + "step": 6625 + }, + { + "epoch": 0.4875419636698092, + "grad_norm": 0.94140625, + "learning_rate": 2.6074010185634235e-05, + "loss": 1.0043, + "step": 6626 + }, + { + "epoch": 0.4876155438031731, + "grad_norm": 0.8984375, + "learning_rate": 2.606822390906275e-05, + "loss": 1.0775, + "step": 6627 + }, + { + "epoch": 0.48768912393653713, + "grad_norm": 0.99609375, + "learning_rate": 2.6062437575161708e-05, + "loss": 1.2867, + "step": 6628 + }, + { + "epoch": 0.48776270406990113, + "grad_norm": 1.078125, + "learning_rate": 2.605665118424165e-05, + "loss": 1.2731, + "step": 6629 + }, + { + "epoch": 0.48783628420326514, + "grad_norm": 0.72265625, + "learning_rate": 2.6050864736613112e-05, + "loss": 0.72, + "step": 6630 + }, + { + "epoch": 0.4879098643366291, + "grad_norm": 0.87109375, + "learning_rate": 2.6045078232586657e-05, + "loss": 0.9182, + "step": 6631 + }, + { + "epoch": 0.4879834444699931, + "grad_norm": 0.97265625, + "learning_rate": 2.603929167247282e-05, + "loss": 1.1477, + "step": 6632 + }, + { + "epoch": 0.4880570246033571, + "grad_norm": 0.74609375, + "learning_rate": 2.6033505056582164e-05, + "loss": 1.0072, + "step": 6633 + }, + { + "epoch": 0.4881306047367211, + "grad_norm": 0.828125, + "learning_rate": 2.6027718385225245e-05, + "loss": 0.8941, + "step": 6634 + }, + { + "epoch": 0.4882041848700851, + "grad_norm": 0.671875, + "learning_rate": 2.6021931658712624e-05, + "loss": 0.7858, + "step": 6635 + }, + { + "epoch": 0.48827776500344905, + "grad_norm": 0.609375, + "learning_rate": 2.601614487735486e-05, + "loss": 0.5726, + "step": 6636 + }, + { + "epoch": 0.48835134513681305, + "grad_norm": 0.90234375, + "learning_rate": 2.601035804146252e-05, + "loss": 0.9406, + "step": 6637 + }, + { + "epoch": 0.48842492527017706, + "grad_norm": 0.94921875, + "learning_rate": 2.600457115134617e-05, + "loss": 1.0871, + "step": 6638 + }, + { + "epoch": 0.48849850540354106, + "grad_norm": 0.99609375, + "learning_rate": 2.5998784207316378e-05, + "loss": 0.9156, + "step": 6639 + }, + { + "epoch": 0.488572085536905, + "grad_norm": 0.9140625, + "learning_rate": 2.599299720968373e-05, + "loss": 0.835, + "step": 6640 + }, + { + "epoch": 0.488645665670269, + "grad_norm": 0.87890625, + "learning_rate": 2.5987210158758797e-05, + "loss": 0.8283, + "step": 6641 + }, + { + "epoch": 0.488719245803633, + "grad_norm": 0.73828125, + "learning_rate": 2.598142305485216e-05, + "loss": 0.8809, + "step": 6642 + }, + { + "epoch": 0.488792825936997, + "grad_norm": 0.78515625, + "learning_rate": 2.5975635898274397e-05, + "loss": 0.689, + "step": 6643 + }, + { + "epoch": 0.488866406070361, + "grad_norm": 0.796875, + "learning_rate": 2.596984868933611e-05, + "loss": 0.8381, + "step": 6644 + }, + { + "epoch": 0.488939986203725, + "grad_norm": 0.78125, + "learning_rate": 2.5964061428347862e-05, + "loss": 0.8655, + "step": 6645 + }, + { + "epoch": 0.489013566337089, + "grad_norm": 1.0390625, + "learning_rate": 2.5958274115620262e-05, + "loss": 1.0838, + "step": 6646 + }, + { + "epoch": 0.489087146470453, + "grad_norm": 0.9453125, + "learning_rate": 2.59524867514639e-05, + "loss": 1.3738, + "step": 6647 + }, + { + "epoch": 0.489160726603817, + "grad_norm": 0.796875, + "learning_rate": 2.5946699336189373e-05, + "loss": 1.0237, + "step": 6648 + }, + { + "epoch": 0.48923430673718094, + "grad_norm": 1.078125, + "learning_rate": 2.5940911870107288e-05, + "loss": 0.9828, + "step": 6649 + }, + { + "epoch": 0.48930788687054494, + "grad_norm": 0.921875, + "learning_rate": 2.593512435352823e-05, + "loss": 1.157, + "step": 6650 + }, + { + "epoch": 0.48938146700390894, + "grad_norm": 0.8671875, + "learning_rate": 2.592933678676282e-05, + "loss": 0.7981, + "step": 6651 + }, + { + "epoch": 0.48945504713727295, + "grad_norm": 0.8515625, + "learning_rate": 2.592354917012166e-05, + "loss": 1.1765, + "step": 6652 + }, + { + "epoch": 0.48952862727063695, + "grad_norm": 0.8125, + "learning_rate": 2.5917761503915355e-05, + "loss": 1.0321, + "step": 6653 + }, + { + "epoch": 0.4896022074040009, + "grad_norm": 0.76171875, + "learning_rate": 2.5911973788454525e-05, + "loss": 0.7883, + "step": 6654 + }, + { + "epoch": 0.4896757875373649, + "grad_norm": 0.91015625, + "learning_rate": 2.5906186024049784e-05, + "loss": 0.9707, + "step": 6655 + }, + { + "epoch": 0.4897493676707289, + "grad_norm": 0.7109375, + "learning_rate": 2.5900398211011755e-05, + "loss": 0.7421, + "step": 6656 + }, + { + "epoch": 0.4898229478040929, + "grad_norm": 1.09375, + "learning_rate": 2.5894610349651055e-05, + "loss": 0.833, + "step": 6657 + }, + { + "epoch": 0.48989652793745686, + "grad_norm": 0.65234375, + "learning_rate": 2.588882244027831e-05, + "loss": 0.6903, + "step": 6658 + }, + { + "epoch": 0.48997010807082086, + "grad_norm": 1.09375, + "learning_rate": 2.5883034483204133e-05, + "loss": 0.9731, + "step": 6659 + }, + { + "epoch": 0.49004368820418487, + "grad_norm": 0.91015625, + "learning_rate": 2.5877246478739165e-05, + "loss": 1.0695, + "step": 6660 + }, + { + "epoch": 0.4901172683375489, + "grad_norm": 0.78515625, + "learning_rate": 2.587145842719404e-05, + "loss": 0.5633, + "step": 6661 + }, + { + "epoch": 0.4901908484709129, + "grad_norm": 0.734375, + "learning_rate": 2.5865670328879387e-05, + "loss": 0.8886, + "step": 6662 + }, + { + "epoch": 0.4902644286042768, + "grad_norm": 0.9296875, + "learning_rate": 2.5859882184105844e-05, + "loss": 1.2564, + "step": 6663 + }, + { + "epoch": 0.49033800873764083, + "grad_norm": 0.89453125, + "learning_rate": 2.585409399318404e-05, + "loss": 0.7831, + "step": 6664 + }, + { + "epoch": 0.49041158887100483, + "grad_norm": 1.0390625, + "learning_rate": 2.5848305756424635e-05, + "loss": 1.1321, + "step": 6665 + }, + { + "epoch": 0.49048516900436884, + "grad_norm": 1.0859375, + "learning_rate": 2.5842517474138254e-05, + "loss": 1.2362, + "step": 6666 + }, + { + "epoch": 0.4905587491377328, + "grad_norm": 0.77734375, + "learning_rate": 2.583672914663555e-05, + "loss": 0.9461, + "step": 6667 + }, + { + "epoch": 0.4906323292710968, + "grad_norm": 1.078125, + "learning_rate": 2.583094077422718e-05, + "loss": 1.2423, + "step": 6668 + }, + { + "epoch": 0.4907059094044608, + "grad_norm": 0.86328125, + "learning_rate": 2.5825152357223792e-05, + "loss": 0.6962, + "step": 6669 + }, + { + "epoch": 0.4907794895378248, + "grad_norm": 0.72265625, + "learning_rate": 2.5819363895936026e-05, + "loss": 0.946, + "step": 6670 + }, + { + "epoch": 0.4908530696711888, + "grad_norm": 0.83203125, + "learning_rate": 2.5813575390674548e-05, + "loss": 0.9372, + "step": 6671 + }, + { + "epoch": 0.49092664980455275, + "grad_norm": 0.875, + "learning_rate": 2.5807786841750025e-05, + "loss": 0.9871, + "step": 6672 + }, + { + "epoch": 0.49100022993791675, + "grad_norm": 0.78125, + "learning_rate": 2.5801998249473096e-05, + "loss": 0.7371, + "step": 6673 + }, + { + "epoch": 0.49107381007128076, + "grad_norm": 1.0390625, + "learning_rate": 2.5796209614154442e-05, + "loss": 1.0865, + "step": 6674 + }, + { + "epoch": 0.49114739020464476, + "grad_norm": 0.8984375, + "learning_rate": 2.5790420936104714e-05, + "loss": 0.8629, + "step": 6675 + }, + { + "epoch": 0.4912209703380087, + "grad_norm": 0.70703125, + "learning_rate": 2.5784632215634603e-05, + "loss": 0.7611, + "step": 6676 + }, + { + "epoch": 0.4912945504713727, + "grad_norm": 0.921875, + "learning_rate": 2.577884345305475e-05, + "loss": 1.1233, + "step": 6677 + }, + { + "epoch": 0.4913681306047367, + "grad_norm": 0.80859375, + "learning_rate": 2.577305464867585e-05, + "loss": 0.9199, + "step": 6678 + }, + { + "epoch": 0.4914417107381007, + "grad_norm": 0.81640625, + "learning_rate": 2.5767265802808555e-05, + "loss": 0.9293, + "step": 6679 + }, + { + "epoch": 0.4915152908714647, + "grad_norm": 0.95703125, + "learning_rate": 2.576147691576356e-05, + "loss": 1.1749, + "step": 6680 + }, + { + "epoch": 0.4915888710048287, + "grad_norm": 1.09375, + "learning_rate": 2.5755687987851534e-05, + "loss": 0.9712, + "step": 6681 + }, + { + "epoch": 0.4916624511381927, + "grad_norm": 1.1171875, + "learning_rate": 2.5749899019383163e-05, + "loss": 1.1325, + "step": 6682 + }, + { + "epoch": 0.4917360312715567, + "grad_norm": 0.8515625, + "learning_rate": 2.5744110010669133e-05, + "loss": 1.0317, + "step": 6683 + }, + { + "epoch": 0.4918096114049207, + "grad_norm": 0.94140625, + "learning_rate": 2.573832096202011e-05, + "loss": 0.7013, + "step": 6684 + }, + { + "epoch": 0.49188319153828464, + "grad_norm": 0.796875, + "learning_rate": 2.573253187374681e-05, + "loss": 0.9334, + "step": 6685 + }, + { + "epoch": 0.49195677167164864, + "grad_norm": 1.03125, + "learning_rate": 2.5726742746159906e-05, + "loss": 1.036, + "step": 6686 + }, + { + "epoch": 0.49203035180501264, + "grad_norm": 0.84375, + "learning_rate": 2.5720953579570078e-05, + "loss": 0.9906, + "step": 6687 + }, + { + "epoch": 0.49210393193837665, + "grad_norm": 0.73828125, + "learning_rate": 2.571516437428805e-05, + "loss": 0.7805, + "step": 6688 + }, + { + "epoch": 0.49217751207174065, + "grad_norm": 0.859375, + "learning_rate": 2.570937513062449e-05, + "loss": 0.7876, + "step": 6689 + }, + { + "epoch": 0.4922510922051046, + "grad_norm": 0.62890625, + "learning_rate": 2.5703585848890116e-05, + "loss": 0.5749, + "step": 6690 + }, + { + "epoch": 0.4923246723384686, + "grad_norm": 0.93359375, + "learning_rate": 2.569779652939561e-05, + "loss": 0.7909, + "step": 6691 + }, + { + "epoch": 0.4923982524718326, + "grad_norm": 1.0078125, + "learning_rate": 2.5692007172451693e-05, + "loss": 0.8743, + "step": 6692 + }, + { + "epoch": 0.4924718326051966, + "grad_norm": 0.83984375, + "learning_rate": 2.568621777836905e-05, + "loss": 0.7506, + "step": 6693 + }, + { + "epoch": 0.49254541273856056, + "grad_norm": 0.8515625, + "learning_rate": 2.568042834745839e-05, + "loss": 0.89, + "step": 6694 + }, + { + "epoch": 0.49261899287192457, + "grad_norm": 0.953125, + "learning_rate": 2.5674638880030427e-05, + "loss": 0.841, + "step": 6695 + }, + { + "epoch": 0.49269257300528857, + "grad_norm": 1.03125, + "learning_rate": 2.5668849376395876e-05, + "loss": 1.1024, + "step": 6696 + }, + { + "epoch": 0.4927661531386526, + "grad_norm": 1.015625, + "learning_rate": 2.5663059836865444e-05, + "loss": 0.8662, + "step": 6697 + }, + { + "epoch": 0.4928397332720166, + "grad_norm": 0.875, + "learning_rate": 2.5657270261749834e-05, + "loss": 0.7646, + "step": 6698 + }, + { + "epoch": 0.4929133134053805, + "grad_norm": 0.8828125, + "learning_rate": 2.5651480651359777e-05, + "loss": 1.05, + "step": 6699 + }, + { + "epoch": 0.49298689353874453, + "grad_norm": 0.828125, + "learning_rate": 2.5645691006005985e-05, + "loss": 1.0519, + "step": 6700 + }, + { + "epoch": 0.49306047367210853, + "grad_norm": 1.015625, + "learning_rate": 2.563990132599917e-05, + "loss": 1.1284, + "step": 6701 + }, + { + "epoch": 0.49313405380547254, + "grad_norm": 0.765625, + "learning_rate": 2.5634111611650063e-05, + "loss": 0.6174, + "step": 6702 + }, + { + "epoch": 0.4932076339388365, + "grad_norm": 1.1953125, + "learning_rate": 2.5628321863269377e-05, + "loss": 1.2426, + "step": 6703 + }, + { + "epoch": 0.4932812140722005, + "grad_norm": 0.890625, + "learning_rate": 2.5622532081167855e-05, + "loss": 1.1306, + "step": 6704 + }, + { + "epoch": 0.4933547942055645, + "grad_norm": 0.78125, + "learning_rate": 2.5616742265656207e-05, + "loss": 0.9979, + "step": 6705 + }, + { + "epoch": 0.4934283743389285, + "grad_norm": 0.796875, + "learning_rate": 2.561095241704517e-05, + "loss": 0.8668, + "step": 6706 + }, + { + "epoch": 0.4935019544722925, + "grad_norm": 0.99609375, + "learning_rate": 2.560516253564546e-05, + "loss": 0.6973, + "step": 6707 + }, + { + "epoch": 0.49357553460565645, + "grad_norm": 1.1796875, + "learning_rate": 2.5599372621767833e-05, + "loss": 1.0704, + "step": 6708 + }, + { + "epoch": 0.49364911473902046, + "grad_norm": 0.94140625, + "learning_rate": 2.5593582675723e-05, + "loss": 0.9875, + "step": 6709 + }, + { + "epoch": 0.49372269487238446, + "grad_norm": 0.8984375, + "learning_rate": 2.558779269782171e-05, + "loss": 0.9234, + "step": 6710 + }, + { + "epoch": 0.49379627500574846, + "grad_norm": 0.93359375, + "learning_rate": 2.558200268837469e-05, + "loss": 0.888, + "step": 6711 + }, + { + "epoch": 0.4938698551391124, + "grad_norm": 0.63671875, + "learning_rate": 2.5576212647692688e-05, + "loss": 0.6125, + "step": 6712 + }, + { + "epoch": 0.4939434352724764, + "grad_norm": 0.70703125, + "learning_rate": 2.557042257608645e-05, + "loss": 0.7742, + "step": 6713 + }, + { + "epoch": 0.4940170154058404, + "grad_norm": 0.94921875, + "learning_rate": 2.5564632473866694e-05, + "loss": 1.027, + "step": 6714 + }, + { + "epoch": 0.4940905955392044, + "grad_norm": 0.875, + "learning_rate": 2.5558842341344193e-05, + "loss": 1.3594, + "step": 6715 + }, + { + "epoch": 0.49416417567256843, + "grad_norm": 1.171875, + "learning_rate": 2.555305217882967e-05, + "loss": 0.7544, + "step": 6716 + }, + { + "epoch": 0.4942377558059324, + "grad_norm": 0.81640625, + "learning_rate": 2.5547261986633892e-05, + "loss": 0.7817, + "step": 6717 + }, + { + "epoch": 0.4943113359392964, + "grad_norm": 0.94921875, + "learning_rate": 2.554147176506759e-05, + "loss": 1.3649, + "step": 6718 + }, + { + "epoch": 0.4943849160726604, + "grad_norm": 0.734375, + "learning_rate": 2.553568151444152e-05, + "loss": 0.8064, + "step": 6719 + }, + { + "epoch": 0.4944584962060244, + "grad_norm": 0.890625, + "learning_rate": 2.552989123506644e-05, + "loss": 1.2102, + "step": 6720 + }, + { + "epoch": 0.49453207633938834, + "grad_norm": 0.9921875, + "learning_rate": 2.5524100927253085e-05, + "loss": 1.3482, + "step": 6721 + }, + { + "epoch": 0.49460565647275234, + "grad_norm": 0.75, + "learning_rate": 2.551831059131224e-05, + "loss": 0.8116, + "step": 6722 + }, + { + "epoch": 0.49467923660611635, + "grad_norm": 0.6953125, + "learning_rate": 2.5512520227554637e-05, + "loss": 0.573, + "step": 6723 + }, + { + "epoch": 0.49475281673948035, + "grad_norm": 0.88671875, + "learning_rate": 2.5506729836291045e-05, + "loss": 0.8246, + "step": 6724 + }, + { + "epoch": 0.49482639687284435, + "grad_norm": 0.82421875, + "learning_rate": 2.5500939417832213e-05, + "loss": 0.8664, + "step": 6725 + }, + { + "epoch": 0.4948999770062083, + "grad_norm": 0.86328125, + "learning_rate": 2.549514897248892e-05, + "loss": 0.8978, + "step": 6726 + }, + { + "epoch": 0.4949735571395723, + "grad_norm": 0.8671875, + "learning_rate": 2.5489358500571896e-05, + "loss": 0.9654, + "step": 6727 + }, + { + "epoch": 0.4950471372729363, + "grad_norm": 0.9140625, + "learning_rate": 2.548356800239194e-05, + "loss": 1.2929, + "step": 6728 + }, + { + "epoch": 0.4951207174063003, + "grad_norm": 0.9296875, + "learning_rate": 2.5477777478259805e-05, + "loss": 0.8073, + "step": 6729 + }, + { + "epoch": 0.49519429753966426, + "grad_norm": 0.88671875, + "learning_rate": 2.5471986928486245e-05, + "loss": 0.9441, + "step": 6730 + }, + { + "epoch": 0.49526787767302827, + "grad_norm": 0.82421875, + "learning_rate": 2.5466196353382053e-05, + "loss": 1.1509, + "step": 6731 + }, + { + "epoch": 0.49534145780639227, + "grad_norm": 0.98046875, + "learning_rate": 2.5460405753257972e-05, + "loss": 1.2072, + "step": 6732 + }, + { + "epoch": 0.4954150379397563, + "grad_norm": 0.87109375, + "learning_rate": 2.545461512842479e-05, + "loss": 0.8782, + "step": 6733 + }, + { + "epoch": 0.4954886180731203, + "grad_norm": 0.95703125, + "learning_rate": 2.5448824479193262e-05, + "loss": 1.0232, + "step": 6734 + }, + { + "epoch": 0.49556219820648423, + "grad_norm": 0.84765625, + "learning_rate": 2.5443033805874177e-05, + "loss": 1.0602, + "step": 6735 + }, + { + "epoch": 0.49563577833984823, + "grad_norm": 0.74609375, + "learning_rate": 2.5437243108778307e-05, + "loss": 0.7239, + "step": 6736 + }, + { + "epoch": 0.49570935847321224, + "grad_norm": 0.65625, + "learning_rate": 2.5431452388216427e-05, + "loss": 0.6611, + "step": 6737 + }, + { + "epoch": 0.49578293860657624, + "grad_norm": 1.0078125, + "learning_rate": 2.5425661644499315e-05, + "loss": 0.9489, + "step": 6738 + }, + { + "epoch": 0.4958565187399402, + "grad_norm": 0.75, + "learning_rate": 2.541987087793774e-05, + "loss": 0.5325, + "step": 6739 + }, + { + "epoch": 0.4959300988733042, + "grad_norm": 1.0390625, + "learning_rate": 2.5414080088842484e-05, + "loss": 1.1207, + "step": 6740 + }, + { + "epoch": 0.4960036790066682, + "grad_norm": 0.98828125, + "learning_rate": 2.540828927752434e-05, + "loss": 1.1418, + "step": 6741 + }, + { + "epoch": 0.4960772591400322, + "grad_norm": 0.92578125, + "learning_rate": 2.540249844429408e-05, + "loss": 0.8199, + "step": 6742 + }, + { + "epoch": 0.4961508392733962, + "grad_norm": 0.71875, + "learning_rate": 2.5396707589462486e-05, + "loss": 0.7246, + "step": 6743 + }, + { + "epoch": 0.49622441940676015, + "grad_norm": 0.921875, + "learning_rate": 2.5390916713340345e-05, + "loss": 0.943, + "step": 6744 + }, + { + "epoch": 0.49629799954012416, + "grad_norm": 0.78125, + "learning_rate": 2.5385125816238448e-05, + "loss": 1.1701, + "step": 6745 + }, + { + "epoch": 0.49637157967348816, + "grad_norm": 0.9140625, + "learning_rate": 2.5379334898467565e-05, + "loss": 0.8514, + "step": 6746 + }, + { + "epoch": 0.49644515980685217, + "grad_norm": 1.1875, + "learning_rate": 2.5373543960338503e-05, + "loss": 1.1307, + "step": 6747 + }, + { + "epoch": 0.4965187399402161, + "grad_norm": 0.796875, + "learning_rate": 2.5367753002162037e-05, + "loss": 0.6164, + "step": 6748 + }, + { + "epoch": 0.4965923200735801, + "grad_norm": 0.8046875, + "learning_rate": 2.536196202424897e-05, + "loss": 0.7547, + "step": 6749 + }, + { + "epoch": 0.4966659002069441, + "grad_norm": 0.80078125, + "learning_rate": 2.5356171026910074e-05, + "loss": 0.7357, + "step": 6750 + }, + { + "epoch": 0.4967394803403081, + "grad_norm": 0.98046875, + "learning_rate": 2.535038001045616e-05, + "loss": 1.8553, + "step": 6751 + }, + { + "epoch": 0.49681306047367213, + "grad_norm": 0.80859375, + "learning_rate": 2.5344588975198012e-05, + "loss": 0.8724, + "step": 6752 + }, + { + "epoch": 0.4968866406070361, + "grad_norm": 0.9296875, + "learning_rate": 2.5338797921446422e-05, + "loss": 1.2423, + "step": 6753 + }, + { + "epoch": 0.4969602207404001, + "grad_norm": 0.98046875, + "learning_rate": 2.5333006849512185e-05, + "loss": 1.4631, + "step": 6754 + }, + { + "epoch": 0.4970338008737641, + "grad_norm": 1.046875, + "learning_rate": 2.53272157597061e-05, + "loss": 1.0206, + "step": 6755 + }, + { + "epoch": 0.4971073810071281, + "grad_norm": 0.79296875, + "learning_rate": 2.5321424652338972e-05, + "loss": 0.6425, + "step": 6756 + }, + { + "epoch": 0.4971809611404921, + "grad_norm": 0.9296875, + "learning_rate": 2.5315633527721583e-05, + "loss": 0.9942, + "step": 6757 + }, + { + "epoch": 0.49725454127385604, + "grad_norm": 0.7265625, + "learning_rate": 2.5309842386164744e-05, + "loss": 0.7313, + "step": 6758 + }, + { + "epoch": 0.49732812140722005, + "grad_norm": 0.99609375, + "learning_rate": 2.5304051227979243e-05, + "loss": 1.2216, + "step": 6759 + }, + { + "epoch": 0.49740170154058405, + "grad_norm": 0.96484375, + "learning_rate": 2.5298260053475885e-05, + "loss": 0.9072, + "step": 6760 + }, + { + "epoch": 0.49747528167394806, + "grad_norm": 0.9140625, + "learning_rate": 2.5292468862965472e-05, + "loss": 1.1626, + "step": 6761 + }, + { + "epoch": 0.497548861807312, + "grad_norm": 0.7421875, + "learning_rate": 2.5286677656758817e-05, + "loss": 0.6964, + "step": 6762 + }, + { + "epoch": 0.497622441940676, + "grad_norm": 0.86328125, + "learning_rate": 2.5280886435166713e-05, + "loss": 0.8919, + "step": 6763 + }, + { + "epoch": 0.49769602207404, + "grad_norm": 0.9453125, + "learning_rate": 2.5275095198499953e-05, + "loss": 1.1505, + "step": 6764 + }, + { + "epoch": 0.497769602207404, + "grad_norm": 1.0859375, + "learning_rate": 2.5269303947069368e-05, + "loss": 1.2343, + "step": 6765 + }, + { + "epoch": 0.497843182340768, + "grad_norm": 0.99609375, + "learning_rate": 2.5263512681185742e-05, + "loss": 1.6502, + "step": 6766 + }, + { + "epoch": 0.49791676247413197, + "grad_norm": 0.90234375, + "learning_rate": 2.5257721401159884e-05, + "loss": 1.0583, + "step": 6767 + }, + { + "epoch": 0.497990342607496, + "grad_norm": 0.77734375, + "learning_rate": 2.5251930107302608e-05, + "loss": 0.9987, + "step": 6768 + }, + { + "epoch": 0.49806392274086, + "grad_norm": 0.828125, + "learning_rate": 2.5246138799924718e-05, + "loss": 0.9529, + "step": 6769 + }, + { + "epoch": 0.498137502874224, + "grad_norm": 0.82421875, + "learning_rate": 2.5240347479337027e-05, + "loss": 0.8805, + "step": 6770 + }, + { + "epoch": 0.49821108300758793, + "grad_norm": 0.7890625, + "learning_rate": 2.5234556145850343e-05, + "loss": 0.9518, + "step": 6771 + }, + { + "epoch": 0.49828466314095193, + "grad_norm": 0.7734375, + "learning_rate": 2.5228764799775468e-05, + "loss": 1.0451, + "step": 6772 + }, + { + "epoch": 0.49835824327431594, + "grad_norm": 0.82421875, + "learning_rate": 2.522297344142322e-05, + "loss": 0.9713, + "step": 6773 + }, + { + "epoch": 0.49843182340767994, + "grad_norm": 0.76171875, + "learning_rate": 2.5217182071104405e-05, + "loss": 0.8702, + "step": 6774 + }, + { + "epoch": 0.49850540354104395, + "grad_norm": 0.95703125, + "learning_rate": 2.5211390689129844e-05, + "loss": 1.041, + "step": 6775 + }, + { + "epoch": 0.4985789836744079, + "grad_norm": 0.71875, + "learning_rate": 2.5205599295810338e-05, + "loss": 0.898, + "step": 6776 + }, + { + "epoch": 0.4986525638077719, + "grad_norm": 0.8046875, + "learning_rate": 2.5199807891456716e-05, + "loss": 0.6358, + "step": 6777 + }, + { + "epoch": 0.4987261439411359, + "grad_norm": 0.82421875, + "learning_rate": 2.5194016476379773e-05, + "loss": 0.7588, + "step": 6778 + }, + { + "epoch": 0.4987997240744999, + "grad_norm": 0.703125, + "learning_rate": 2.518822505089034e-05, + "loss": 0.5863, + "step": 6779 + }, + { + "epoch": 0.49887330420786385, + "grad_norm": 0.92578125, + "learning_rate": 2.5182433615299215e-05, + "loss": 1.0457, + "step": 6780 + }, + { + "epoch": 0.49894688434122786, + "grad_norm": 0.921875, + "learning_rate": 2.5176642169917225e-05, + "loss": 0.8337, + "step": 6781 + }, + { + "epoch": 0.49902046447459186, + "grad_norm": 0.8125, + "learning_rate": 2.517085071505518e-05, + "loss": 1.1624, + "step": 6782 + }, + { + "epoch": 0.49909404460795587, + "grad_norm": 0.70703125, + "learning_rate": 2.5165059251023915e-05, + "loss": 0.7193, + "step": 6783 + }, + { + "epoch": 0.49916762474131987, + "grad_norm": 1.0390625, + "learning_rate": 2.515926777813422e-05, + "loss": 1.4729, + "step": 6784 + }, + { + "epoch": 0.4992412048746838, + "grad_norm": 0.8125, + "learning_rate": 2.5153476296696922e-05, + "loss": 0.6558, + "step": 6785 + }, + { + "epoch": 0.4993147850080478, + "grad_norm": 0.70703125, + "learning_rate": 2.5147684807022847e-05, + "loss": 0.6489, + "step": 6786 + }, + { + "epoch": 0.4993883651414118, + "grad_norm": 0.86328125, + "learning_rate": 2.5141893309422804e-05, + "loss": 1.0101, + "step": 6787 + }, + { + "epoch": 0.49946194527477583, + "grad_norm": 0.859375, + "learning_rate": 2.513610180420762e-05, + "loss": 1.3912, + "step": 6788 + }, + { + "epoch": 0.4995355254081398, + "grad_norm": 1.015625, + "learning_rate": 2.5130310291688097e-05, + "loss": 0.9714, + "step": 6789 + }, + { + "epoch": 0.4996091055415038, + "grad_norm": 0.8203125, + "learning_rate": 2.512451877217508e-05, + "loss": 0.7537, + "step": 6790 + }, + { + "epoch": 0.4996826856748678, + "grad_norm": 1.1484375, + "learning_rate": 2.5118727245979372e-05, + "loss": 0.8053, + "step": 6791 + }, + { + "epoch": 0.4997562658082318, + "grad_norm": 0.89453125, + "learning_rate": 2.5112935713411796e-05, + "loss": 1.1749, + "step": 6792 + }, + { + "epoch": 0.4998298459415958, + "grad_norm": 0.83203125, + "learning_rate": 2.5107144174783174e-05, + "loss": 0.5942, + "step": 6793 + }, + { + "epoch": 0.49990342607495974, + "grad_norm": 0.89453125, + "learning_rate": 2.5101352630404324e-05, + "loss": 0.7524, + "step": 6794 + }, + { + "epoch": 0.49997700620832375, + "grad_norm": 0.671875, + "learning_rate": 2.509556108058607e-05, + "loss": 0.8138, + "step": 6795 + }, + { + "epoch": 0.5000505863416878, + "grad_norm": 0.73828125, + "learning_rate": 2.5089769525639235e-05, + "loss": 0.6325, + "step": 6796 + }, + { + "epoch": 0.5001241664750518, + "grad_norm": 0.78515625, + "learning_rate": 2.5083977965874637e-05, + "loss": 0.8426, + "step": 6797 + }, + { + "epoch": 0.5001977466084158, + "grad_norm": 0.65625, + "learning_rate": 2.5078186401603103e-05, + "loss": 0.6433, + "step": 6798 + }, + { + "epoch": 0.5002713267417798, + "grad_norm": 0.74609375, + "learning_rate": 2.507239483313546e-05, + "loss": 0.7694, + "step": 6799 + }, + { + "epoch": 0.5003449068751437, + "grad_norm": 0.609375, + "learning_rate": 2.506660326078251e-05, + "loss": 0.6423, + "step": 6800 + }, + { + "epoch": 0.5004184870085077, + "grad_norm": 0.75, + "learning_rate": 2.506081168485509e-05, + "loss": 0.7475, + "step": 6801 + }, + { + "epoch": 0.5004920671418717, + "grad_norm": 0.84765625, + "learning_rate": 2.5055020105664024e-05, + "loss": 0.7871, + "step": 6802 + }, + { + "epoch": 0.5005656472752357, + "grad_norm": 0.85546875, + "learning_rate": 2.5049228523520135e-05, + "loss": 1.1883, + "step": 6803 + }, + { + "epoch": 0.5006392274085997, + "grad_norm": 0.875, + "learning_rate": 2.5043436938734245e-05, + "loss": 1.0403, + "step": 6804 + }, + { + "epoch": 0.5007128075419637, + "grad_norm": 0.8515625, + "learning_rate": 2.503764535161717e-05, + "loss": 1.0646, + "step": 6805 + }, + { + "epoch": 0.5007863876753277, + "grad_norm": 1.03125, + "learning_rate": 2.503185376247975e-05, + "loss": 0.8878, + "step": 6806 + }, + { + "epoch": 0.5008599678086917, + "grad_norm": 0.7578125, + "learning_rate": 2.5026062171632796e-05, + "loss": 0.9242, + "step": 6807 + }, + { + "epoch": 0.5009335479420557, + "grad_norm": 0.76953125, + "learning_rate": 2.5020270579387127e-05, + "loss": 0.9098, + "step": 6808 + }, + { + "epoch": 0.5010071280754196, + "grad_norm": 0.9453125, + "learning_rate": 2.501447898605358e-05, + "loss": 0.9524, + "step": 6809 + }, + { + "epoch": 0.5010807082087836, + "grad_norm": 0.81640625, + "learning_rate": 2.500868739194297e-05, + "loss": 0.8059, + "step": 6810 + }, + { + "epoch": 0.5011542883421476, + "grad_norm": 1.046875, + "learning_rate": 2.5002895797366134e-05, + "loss": 0.758, + "step": 6811 + }, + { + "epoch": 0.5012278684755116, + "grad_norm": 0.734375, + "learning_rate": 2.4997104202633875e-05, + "loss": 0.7122, + "step": 6812 + }, + { + "epoch": 0.5013014486088756, + "grad_norm": 0.80859375, + "learning_rate": 2.499131260805703e-05, + "loss": 0.7517, + "step": 6813 + }, + { + "epoch": 0.5013750287422396, + "grad_norm": 1.0625, + "learning_rate": 2.4985521013946425e-05, + "loss": 1.0842, + "step": 6814 + }, + { + "epoch": 0.5014486088756036, + "grad_norm": 0.984375, + "learning_rate": 2.4979729420612875e-05, + "loss": 1.4459, + "step": 6815 + }, + { + "epoch": 0.5015221890089676, + "grad_norm": 0.87109375, + "learning_rate": 2.497393782836721e-05, + "loss": 1.1544, + "step": 6816 + }, + { + "epoch": 0.5015957691423316, + "grad_norm": 0.9375, + "learning_rate": 2.496814623752025e-05, + "loss": 1.3312, + "step": 6817 + }, + { + "epoch": 0.5016693492756955, + "grad_norm": 1.078125, + "learning_rate": 2.4962354648382827e-05, + "loss": 1.2026, + "step": 6818 + }, + { + "epoch": 0.5017429294090595, + "grad_norm": 0.8359375, + "learning_rate": 2.4956563061265764e-05, + "loss": 0.5985, + "step": 6819 + }, + { + "epoch": 0.5018165095424235, + "grad_norm": 0.7421875, + "learning_rate": 2.495077147647987e-05, + "loss": 0.8615, + "step": 6820 + }, + { + "epoch": 0.5018900896757875, + "grad_norm": 1.140625, + "learning_rate": 2.494497989433598e-05, + "loss": 1.1456, + "step": 6821 + }, + { + "epoch": 0.5019636698091515, + "grad_norm": 0.8515625, + "learning_rate": 2.4939188315144914e-05, + "loss": 0.897, + "step": 6822 + }, + { + "epoch": 0.5020372499425155, + "grad_norm": 0.92578125, + "learning_rate": 2.49333967392175e-05, + "loss": 1.0049, + "step": 6823 + }, + { + "epoch": 0.5021108300758795, + "grad_norm": 0.88671875, + "learning_rate": 2.4927605166864554e-05, + "loss": 0.7346, + "step": 6824 + }, + { + "epoch": 0.5021844102092435, + "grad_norm": 1.015625, + "learning_rate": 2.49218135983969e-05, + "loss": 1.0402, + "step": 6825 + }, + { + "epoch": 0.5022579903426075, + "grad_norm": 0.890625, + "learning_rate": 2.4916022034125365e-05, + "loss": 1.0197, + "step": 6826 + }, + { + "epoch": 0.5023315704759714, + "grad_norm": 0.75390625, + "learning_rate": 2.4910230474360764e-05, + "loss": 0.7358, + "step": 6827 + }, + { + "epoch": 0.5024051506093354, + "grad_norm": 0.95703125, + "learning_rate": 2.4904438919413938e-05, + "loss": 1.1908, + "step": 6828 + }, + { + "epoch": 0.5024787307426994, + "grad_norm": 0.828125, + "learning_rate": 2.4898647369595682e-05, + "loss": 0.6124, + "step": 6829 + }, + { + "epoch": 0.5025523108760634, + "grad_norm": 0.76953125, + "learning_rate": 2.489285582521683e-05, + "loss": 0.9041, + "step": 6830 + }, + { + "epoch": 0.5026258910094275, + "grad_norm": 0.8125, + "learning_rate": 2.4887064286588206e-05, + "loss": 0.8017, + "step": 6831 + }, + { + "epoch": 0.5026994711427915, + "grad_norm": 0.87109375, + "learning_rate": 2.488127275402063e-05, + "loss": 0.9575, + "step": 6832 + }, + { + "epoch": 0.5027730512761555, + "grad_norm": 0.94921875, + "learning_rate": 2.4875481227824928e-05, + "loss": 0.889, + "step": 6833 + }, + { + "epoch": 0.5028466314095195, + "grad_norm": 1.0390625, + "learning_rate": 2.4869689708311905e-05, + "loss": 0.9494, + "step": 6834 + }, + { + "epoch": 0.5029202115428835, + "grad_norm": 0.765625, + "learning_rate": 2.486389819579239e-05, + "loss": 0.8909, + "step": 6835 + }, + { + "epoch": 0.5029937916762474, + "grad_norm": 1.0625, + "learning_rate": 2.4858106690577198e-05, + "loss": 1.0806, + "step": 6836 + }, + { + "epoch": 0.5030673718096114, + "grad_norm": 0.83984375, + "learning_rate": 2.4852315192977162e-05, + "loss": 1.1335, + "step": 6837 + }, + { + "epoch": 0.5031409519429754, + "grad_norm": 0.81640625, + "learning_rate": 2.484652370330308e-05, + "loss": 1.1262, + "step": 6838 + }, + { + "epoch": 0.5032145320763394, + "grad_norm": 0.83203125, + "learning_rate": 2.4840732221865788e-05, + "loss": 0.7607, + "step": 6839 + }, + { + "epoch": 0.5032881122097034, + "grad_norm": 0.9375, + "learning_rate": 2.4834940748976095e-05, + "loss": 0.8848, + "step": 6840 + }, + { + "epoch": 0.5033616923430674, + "grad_norm": 1.0703125, + "learning_rate": 2.4829149284944814e-05, + "loss": 1.009, + "step": 6841 + }, + { + "epoch": 0.5034352724764314, + "grad_norm": 0.92578125, + "learning_rate": 2.482335783008278e-05, + "loss": 0.9775, + "step": 6842 + }, + { + "epoch": 0.5035088526097954, + "grad_norm": 0.90625, + "learning_rate": 2.4817566384700787e-05, + "loss": 0.6439, + "step": 6843 + }, + { + "epoch": 0.5035824327431594, + "grad_norm": 1.0078125, + "learning_rate": 2.4811774949109668e-05, + "loss": 1.2899, + "step": 6844 + }, + { + "epoch": 0.5036560128765233, + "grad_norm": 0.98046875, + "learning_rate": 2.4805983523620226e-05, + "loss": 1.7501, + "step": 6845 + }, + { + "epoch": 0.5037295930098873, + "grad_norm": 0.92578125, + "learning_rate": 2.4800192108543296e-05, + "loss": 0.9806, + "step": 6846 + }, + { + "epoch": 0.5038031731432513, + "grad_norm": 1.0546875, + "learning_rate": 2.4794400704189664e-05, + "loss": 0.9335, + "step": 6847 + }, + { + "epoch": 0.5038767532766153, + "grad_norm": 0.8046875, + "learning_rate": 2.478860931087016e-05, + "loss": 0.9179, + "step": 6848 + }, + { + "epoch": 0.5039503334099793, + "grad_norm": 0.79296875, + "learning_rate": 2.4782817928895598e-05, + "loss": 0.6442, + "step": 6849 + }, + { + "epoch": 0.5040239135433433, + "grad_norm": 0.9296875, + "learning_rate": 2.477702655857678e-05, + "loss": 0.9782, + "step": 6850 + }, + { + "epoch": 0.5040974936767073, + "grad_norm": 0.81640625, + "learning_rate": 2.477123520022454e-05, + "loss": 0.677, + "step": 6851 + }, + { + "epoch": 0.5041710738100713, + "grad_norm": 0.86328125, + "learning_rate": 2.4765443854149667e-05, + "loss": 1.0652, + "step": 6852 + }, + { + "epoch": 0.5042446539434353, + "grad_norm": 0.75, + "learning_rate": 2.4759652520662975e-05, + "loss": 0.7857, + "step": 6853 + }, + { + "epoch": 0.5043182340767992, + "grad_norm": 0.7734375, + "learning_rate": 2.475386120007528e-05, + "loss": 0.6471, + "step": 6854 + }, + { + "epoch": 0.5043918142101632, + "grad_norm": 0.84765625, + "learning_rate": 2.4748069892697398e-05, + "loss": 0.799, + "step": 6855 + }, + { + "epoch": 0.5044653943435272, + "grad_norm": 0.90234375, + "learning_rate": 2.4742278598840122e-05, + "loss": 1.0061, + "step": 6856 + }, + { + "epoch": 0.5045389744768912, + "grad_norm": 0.94140625, + "learning_rate": 2.4736487318814267e-05, + "loss": 0.7325, + "step": 6857 + }, + { + "epoch": 0.5046125546102552, + "grad_norm": 0.83203125, + "learning_rate": 2.4730696052930638e-05, + "loss": 0.8355, + "step": 6858 + }, + { + "epoch": 0.5046861347436192, + "grad_norm": 0.93359375, + "learning_rate": 2.4724904801500043e-05, + "loss": 0.8914, + "step": 6859 + }, + { + "epoch": 0.5047597148769832, + "grad_norm": 0.984375, + "learning_rate": 2.4719113564833303e-05, + "loss": 1.6786, + "step": 6860 + }, + { + "epoch": 0.5048332950103472, + "grad_norm": 1.0625, + "learning_rate": 2.4713322343241192e-05, + "loss": 1.0828, + "step": 6861 + }, + { + "epoch": 0.5049068751437112, + "grad_norm": 0.796875, + "learning_rate": 2.470753113703453e-05, + "loss": 0.7747, + "step": 6862 + }, + { + "epoch": 0.5049804552770751, + "grad_norm": 1.0234375, + "learning_rate": 2.470173994652412e-05, + "loss": 1.0917, + "step": 6863 + }, + { + "epoch": 0.5050540354104391, + "grad_norm": 0.94140625, + "learning_rate": 2.4695948772020756e-05, + "loss": 1.1608, + "step": 6864 + }, + { + "epoch": 0.5051276155438031, + "grad_norm": 0.6640625, + "learning_rate": 2.469015761383527e-05, + "loss": 0.7306, + "step": 6865 + }, + { + "epoch": 0.5052011956771671, + "grad_norm": 0.76953125, + "learning_rate": 2.4684366472278423e-05, + "loss": 0.706, + "step": 6866 + }, + { + "epoch": 0.5052747758105312, + "grad_norm": 0.80859375, + "learning_rate": 2.4678575347661034e-05, + "loss": 0.7526, + "step": 6867 + }, + { + "epoch": 0.5053483559438952, + "grad_norm": 0.9921875, + "learning_rate": 2.4672784240293897e-05, + "loss": 0.9766, + "step": 6868 + }, + { + "epoch": 0.5054219360772592, + "grad_norm": 0.90234375, + "learning_rate": 2.4666993150487818e-05, + "loss": 1.0136, + "step": 6869 + }, + { + "epoch": 0.5054955162106232, + "grad_norm": 1.0234375, + "learning_rate": 2.4661202078553584e-05, + "loss": 1.1905, + "step": 6870 + }, + { + "epoch": 0.5055690963439872, + "grad_norm": 0.69921875, + "learning_rate": 2.465541102480199e-05, + "loss": 0.7406, + "step": 6871 + }, + { + "epoch": 0.5056426764773511, + "grad_norm": 0.80859375, + "learning_rate": 2.464961998954384e-05, + "loss": 0.6446, + "step": 6872 + }, + { + "epoch": 0.5057162566107151, + "grad_norm": 1.0703125, + "learning_rate": 2.464382897308992e-05, + "loss": 0.8695, + "step": 6873 + }, + { + "epoch": 0.5057898367440791, + "grad_norm": 0.84765625, + "learning_rate": 2.463803797575104e-05, + "loss": 1.2651, + "step": 6874 + }, + { + "epoch": 0.5058634168774431, + "grad_norm": 0.94921875, + "learning_rate": 2.463224699783797e-05, + "loss": 0.7668, + "step": 6875 + }, + { + "epoch": 0.5059369970108071, + "grad_norm": 0.90234375, + "learning_rate": 2.4626456039661503e-05, + "loss": 0.8345, + "step": 6876 + }, + { + "epoch": 0.5060105771441711, + "grad_norm": 0.88671875, + "learning_rate": 2.462066510153243e-05, + "loss": 1.0072, + "step": 6877 + }, + { + "epoch": 0.5060841572775351, + "grad_norm": 0.921875, + "learning_rate": 2.4614874183761564e-05, + "loss": 1.4951, + "step": 6878 + }, + { + "epoch": 0.5061577374108991, + "grad_norm": 0.9375, + "learning_rate": 2.460908328665966e-05, + "loss": 1.0984, + "step": 6879 + }, + { + "epoch": 0.5062313175442631, + "grad_norm": 1.0078125, + "learning_rate": 2.460329241053752e-05, + "loss": 1.1923, + "step": 6880 + }, + { + "epoch": 0.506304897677627, + "grad_norm": 0.828125, + "learning_rate": 2.4597501555705925e-05, + "loss": 0.8413, + "step": 6881 + }, + { + "epoch": 0.506378477810991, + "grad_norm": 1.0546875, + "learning_rate": 2.4591710722475662e-05, + "loss": 1.3315, + "step": 6882 + }, + { + "epoch": 0.506452057944355, + "grad_norm": 0.7109375, + "learning_rate": 2.4585919911157522e-05, + "loss": 0.7327, + "step": 6883 + }, + { + "epoch": 0.506525638077719, + "grad_norm": 0.8515625, + "learning_rate": 2.4580129122062267e-05, + "loss": 0.7946, + "step": 6884 + }, + { + "epoch": 0.506599218211083, + "grad_norm": 0.859375, + "learning_rate": 2.4574338355500694e-05, + "loss": 0.6864, + "step": 6885 + }, + { + "epoch": 0.506672798344447, + "grad_norm": 0.84765625, + "learning_rate": 2.4568547611783575e-05, + "loss": 0.7982, + "step": 6886 + }, + { + "epoch": 0.506746378477811, + "grad_norm": 1.1015625, + "learning_rate": 2.4562756891221698e-05, + "loss": 1.0393, + "step": 6887 + }, + { + "epoch": 0.506819958611175, + "grad_norm": 0.859375, + "learning_rate": 2.4556966194125826e-05, + "loss": 1.0597, + "step": 6888 + }, + { + "epoch": 0.506893538744539, + "grad_norm": 1.046875, + "learning_rate": 2.4551175520806744e-05, + "loss": 1.2202, + "step": 6889 + }, + { + "epoch": 0.5069671188779029, + "grad_norm": 0.63671875, + "learning_rate": 2.4545384871575215e-05, + "loss": 0.5673, + "step": 6890 + }, + { + "epoch": 0.5070406990112669, + "grad_norm": 0.67578125, + "learning_rate": 2.4539594246742027e-05, + "loss": 0.6118, + "step": 6891 + }, + { + "epoch": 0.5071142791446309, + "grad_norm": 0.72265625, + "learning_rate": 2.453380364661796e-05, + "loss": 0.9154, + "step": 6892 + }, + { + "epoch": 0.5071878592779949, + "grad_norm": 0.76171875, + "learning_rate": 2.4528013071513757e-05, + "loss": 0.7346, + "step": 6893 + }, + { + "epoch": 0.5072614394113589, + "grad_norm": 0.765625, + "learning_rate": 2.45222225217402e-05, + "loss": 0.7266, + "step": 6894 + }, + { + "epoch": 0.5073350195447229, + "grad_norm": 0.73046875, + "learning_rate": 2.4516431997608062e-05, + "loss": 0.6065, + "step": 6895 + }, + { + "epoch": 0.5074085996780869, + "grad_norm": 0.6328125, + "learning_rate": 2.451064149942811e-05, + "loss": 0.6672, + "step": 6896 + }, + { + "epoch": 0.5074821798114509, + "grad_norm": 0.90625, + "learning_rate": 2.4504851027511094e-05, + "loss": 0.7494, + "step": 6897 + }, + { + "epoch": 0.5075557599448149, + "grad_norm": 0.828125, + "learning_rate": 2.449906058216779e-05, + "loss": 1.0767, + "step": 6898 + }, + { + "epoch": 0.5076293400781788, + "grad_norm": 1.125, + "learning_rate": 2.449327016370896e-05, + "loss": 1.2438, + "step": 6899 + }, + { + "epoch": 0.5077029202115428, + "grad_norm": 0.7890625, + "learning_rate": 2.4487479772445365e-05, + "loss": 0.752, + "step": 6900 + }, + { + "epoch": 0.5077765003449068, + "grad_norm": 0.89453125, + "learning_rate": 2.4481689408687772e-05, + "loss": 0.9818, + "step": 6901 + }, + { + "epoch": 0.5078500804782708, + "grad_norm": 0.90625, + "learning_rate": 2.4475899072746918e-05, + "loss": 0.7712, + "step": 6902 + }, + { + "epoch": 0.5079236606116349, + "grad_norm": 0.74609375, + "learning_rate": 2.4470108764933564e-05, + "loss": 0.8489, + "step": 6903 + }, + { + "epoch": 0.5079972407449989, + "grad_norm": 1.109375, + "learning_rate": 2.446431848555848e-05, + "loss": 0.9714, + "step": 6904 + }, + { + "epoch": 0.5080708208783629, + "grad_norm": 0.8984375, + "learning_rate": 2.445852823493241e-05, + "loss": 0.9575, + "step": 6905 + }, + { + "epoch": 0.5081444010117269, + "grad_norm": 1.125, + "learning_rate": 2.4452738013366117e-05, + "loss": 1.476, + "step": 6906 + }, + { + "epoch": 0.5082179811450909, + "grad_norm": 1.0625, + "learning_rate": 2.444694782117033e-05, + "loss": 0.7326, + "step": 6907 + }, + { + "epoch": 0.5082915612784548, + "grad_norm": 1.0, + "learning_rate": 2.444115765865581e-05, + "loss": 0.8245, + "step": 6908 + }, + { + "epoch": 0.5083651414118188, + "grad_norm": 0.734375, + "learning_rate": 2.4435367526133305e-05, + "loss": 0.7986, + "step": 6909 + }, + { + "epoch": 0.5084387215451828, + "grad_norm": 0.7734375, + "learning_rate": 2.442957742391356e-05, + "loss": 0.6466, + "step": 6910 + }, + { + "epoch": 0.5085123016785468, + "grad_norm": 0.91796875, + "learning_rate": 2.4423787352307314e-05, + "loss": 1.061, + "step": 6911 + }, + { + "epoch": 0.5085858818119108, + "grad_norm": 0.84765625, + "learning_rate": 2.4417997311625313e-05, + "loss": 0.6697, + "step": 6912 + }, + { + "epoch": 0.5086594619452748, + "grad_norm": 0.87109375, + "learning_rate": 2.4412207302178298e-05, + "loss": 0.9173, + "step": 6913 + }, + { + "epoch": 0.5087330420786388, + "grad_norm": 0.8671875, + "learning_rate": 2.4406417324277003e-05, + "loss": 1.1273, + "step": 6914 + }, + { + "epoch": 0.5088066222120028, + "grad_norm": 1.0390625, + "learning_rate": 2.4400627378232183e-05, + "loss": 1.1021, + "step": 6915 + }, + { + "epoch": 0.5088802023453668, + "grad_norm": 0.85546875, + "learning_rate": 2.4394837464354548e-05, + "loss": 0.8225, + "step": 6916 + }, + { + "epoch": 0.5089537824787307, + "grad_norm": 1.015625, + "learning_rate": 2.438904758295484e-05, + "loss": 1.2826, + "step": 6917 + }, + { + "epoch": 0.5090273626120947, + "grad_norm": 0.6796875, + "learning_rate": 2.4383257734343796e-05, + "loss": 0.6253, + "step": 6918 + }, + { + "epoch": 0.5091009427454587, + "grad_norm": 1.25, + "learning_rate": 2.4377467918832157e-05, + "loss": 1.2076, + "step": 6919 + }, + { + "epoch": 0.5091745228788227, + "grad_norm": 1.0, + "learning_rate": 2.437167813673063e-05, + "loss": 1.6609, + "step": 6920 + }, + { + "epoch": 0.5092481030121867, + "grad_norm": 0.83984375, + "learning_rate": 2.4365888388349943e-05, + "loss": 1.054, + "step": 6921 + }, + { + "epoch": 0.5093216831455507, + "grad_norm": 0.89453125, + "learning_rate": 2.4360098674000835e-05, + "loss": 1.1305, + "step": 6922 + }, + { + "epoch": 0.5093952632789147, + "grad_norm": 0.81640625, + "learning_rate": 2.435430899399402e-05, + "loss": 0.6593, + "step": 6923 + }, + { + "epoch": 0.5094688434122787, + "grad_norm": 0.7421875, + "learning_rate": 2.434851934864023e-05, + "loss": 0.8338, + "step": 6924 + }, + { + "epoch": 0.5095424235456427, + "grad_norm": 0.984375, + "learning_rate": 2.434272973825017e-05, + "loss": 1.2494, + "step": 6925 + }, + { + "epoch": 0.5096160036790066, + "grad_norm": 0.8828125, + "learning_rate": 2.4336940163134565e-05, + "loss": 1.015, + "step": 6926 + }, + { + "epoch": 0.5096895838123706, + "grad_norm": 0.85546875, + "learning_rate": 2.4331150623604126e-05, + "loss": 1.0062, + "step": 6927 + }, + { + "epoch": 0.5097631639457346, + "grad_norm": 0.9609375, + "learning_rate": 2.4325361119969582e-05, + "loss": 0.9374, + "step": 6928 + }, + { + "epoch": 0.5098367440790986, + "grad_norm": 0.87890625, + "learning_rate": 2.431957165254162e-05, + "loss": 0.6762, + "step": 6929 + }, + { + "epoch": 0.5099103242124626, + "grad_norm": 0.984375, + "learning_rate": 2.431378222163096e-05, + "loss": 0.8244, + "step": 6930 + }, + { + "epoch": 0.5099839043458266, + "grad_norm": 0.87890625, + "learning_rate": 2.4307992827548316e-05, + "loss": 0.8764, + "step": 6931 + }, + { + "epoch": 0.5100574844791906, + "grad_norm": 1.1953125, + "learning_rate": 2.4302203470604388e-05, + "loss": 1.1631, + "step": 6932 + }, + { + "epoch": 0.5101310646125546, + "grad_norm": 0.6484375, + "learning_rate": 2.4296414151109893e-05, + "loss": 0.732, + "step": 6933 + }, + { + "epoch": 0.5102046447459186, + "grad_norm": 0.98828125, + "learning_rate": 2.4290624869375515e-05, + "loss": 0.9829, + "step": 6934 + }, + { + "epoch": 0.5102782248792826, + "grad_norm": 0.84375, + "learning_rate": 2.4284835625711957e-05, + "loss": 0.95, + "step": 6935 + }, + { + "epoch": 0.5103518050126465, + "grad_norm": 1.109375, + "learning_rate": 2.427904642042992e-05, + "loss": 1.4681, + "step": 6936 + }, + { + "epoch": 0.5104253851460105, + "grad_norm": 0.828125, + "learning_rate": 2.42732572538401e-05, + "loss": 0.8922, + "step": 6937 + }, + { + "epoch": 0.5104989652793746, + "grad_norm": 0.765625, + "learning_rate": 2.42674681262532e-05, + "loss": 0.966, + "step": 6938 + }, + { + "epoch": 0.5105725454127386, + "grad_norm": 0.99609375, + "learning_rate": 2.426167903797989e-05, + "loss": 1.0419, + "step": 6939 + }, + { + "epoch": 0.5106461255461026, + "grad_norm": 0.8828125, + "learning_rate": 2.4255889989330877e-05, + "loss": 0.7437, + "step": 6940 + }, + { + "epoch": 0.5107197056794666, + "grad_norm": 0.7421875, + "learning_rate": 2.425010098061684e-05, + "loss": 0.7891, + "step": 6941 + }, + { + "epoch": 0.5107932858128306, + "grad_norm": 0.84375, + "learning_rate": 2.424431201214848e-05, + "loss": 0.8571, + "step": 6942 + }, + { + "epoch": 0.5108668659461946, + "grad_norm": 0.69140625, + "learning_rate": 2.423852308423645e-05, + "loss": 0.398, + "step": 6943 + }, + { + "epoch": 0.5109404460795586, + "grad_norm": 0.89453125, + "learning_rate": 2.4232734197191448e-05, + "loss": 0.8547, + "step": 6944 + }, + { + "epoch": 0.5110140262129225, + "grad_norm": 1.1171875, + "learning_rate": 2.422694535132416e-05, + "loss": 1.027, + "step": 6945 + }, + { + "epoch": 0.5110876063462865, + "grad_norm": 0.81640625, + "learning_rate": 2.422115654694525e-05, + "loss": 0.8691, + "step": 6946 + }, + { + "epoch": 0.5111611864796505, + "grad_norm": 0.83984375, + "learning_rate": 2.4215367784365413e-05, + "loss": 0.9444, + "step": 6947 + }, + { + "epoch": 0.5112347666130145, + "grad_norm": 0.7421875, + "learning_rate": 2.420957906389529e-05, + "loss": 0.8475, + "step": 6948 + }, + { + "epoch": 0.5113083467463785, + "grad_norm": 0.8046875, + "learning_rate": 2.4203790385845564e-05, + "loss": 0.78, + "step": 6949 + }, + { + "epoch": 0.5113819268797425, + "grad_norm": 1.2734375, + "learning_rate": 2.419800175052691e-05, + "loss": 1.2925, + "step": 6950 + }, + { + "epoch": 0.5114555070131065, + "grad_norm": 0.8203125, + "learning_rate": 2.4192213158249987e-05, + "loss": 0.9363, + "step": 6951 + }, + { + "epoch": 0.5115290871464705, + "grad_norm": 0.75390625, + "learning_rate": 2.4186424609325455e-05, + "loss": 0.8718, + "step": 6952 + }, + { + "epoch": 0.5116026672798345, + "grad_norm": 0.69140625, + "learning_rate": 2.418063610406398e-05, + "loss": 0.7089, + "step": 6953 + }, + { + "epoch": 0.5116762474131984, + "grad_norm": 0.70703125, + "learning_rate": 2.4174847642776217e-05, + "loss": 0.8138, + "step": 6954 + }, + { + "epoch": 0.5117498275465624, + "grad_norm": 0.91015625, + "learning_rate": 2.416905922577282e-05, + "loss": 0.8168, + "step": 6955 + }, + { + "epoch": 0.5118234076799264, + "grad_norm": 1.0546875, + "learning_rate": 2.416327085336445e-05, + "loss": 0.9951, + "step": 6956 + }, + { + "epoch": 0.5118969878132904, + "grad_norm": 0.79296875, + "learning_rate": 2.4157482525861748e-05, + "loss": 0.9895, + "step": 6957 + }, + { + "epoch": 0.5119705679466544, + "grad_norm": 1.0078125, + "learning_rate": 2.415169424357537e-05, + "loss": 0.7844, + "step": 6958 + }, + { + "epoch": 0.5120441480800184, + "grad_norm": 0.79296875, + "learning_rate": 2.414590600681596e-05, + "loss": 0.6938, + "step": 6959 + }, + { + "epoch": 0.5121177282133824, + "grad_norm": 1.03125, + "learning_rate": 2.414011781589417e-05, + "loss": 1.1322, + "step": 6960 + }, + { + "epoch": 0.5121913083467464, + "grad_norm": 0.94140625, + "learning_rate": 2.4134329671120623e-05, + "loss": 1.162, + "step": 6961 + }, + { + "epoch": 0.5122648884801104, + "grad_norm": 0.73828125, + "learning_rate": 2.4128541572805966e-05, + "loss": 0.6704, + "step": 6962 + }, + { + "epoch": 0.5123384686134743, + "grad_norm": 0.72265625, + "learning_rate": 2.4122753521260837e-05, + "loss": 0.6643, + "step": 6963 + }, + { + "epoch": 0.5124120487468383, + "grad_norm": 0.90234375, + "learning_rate": 2.4116965516795873e-05, + "loss": 1.1199, + "step": 6964 + }, + { + "epoch": 0.5124856288802023, + "grad_norm": 0.71484375, + "learning_rate": 2.4111177559721703e-05, + "loss": 0.664, + "step": 6965 + }, + { + "epoch": 0.5125592090135663, + "grad_norm": 0.88671875, + "learning_rate": 2.4105389650348954e-05, + "loss": 0.8739, + "step": 6966 + }, + { + "epoch": 0.5126327891469303, + "grad_norm": 1.1484375, + "learning_rate": 2.4099601788988247e-05, + "loss": 1.1971, + "step": 6967 + }, + { + "epoch": 0.5127063692802943, + "grad_norm": 1.0546875, + "learning_rate": 2.4093813975950215e-05, + "loss": 0.9507, + "step": 6968 + }, + { + "epoch": 0.5127799494136583, + "grad_norm": 0.75390625, + "learning_rate": 2.4088026211545474e-05, + "loss": 0.8032, + "step": 6969 + }, + { + "epoch": 0.5128535295470223, + "grad_norm": 0.75, + "learning_rate": 2.408223849608465e-05, + "loss": 1.1861, + "step": 6970 + }, + { + "epoch": 0.5129271096803864, + "grad_norm": 0.94921875, + "learning_rate": 2.4076450829878347e-05, + "loss": 1.0445, + "step": 6971 + }, + { + "epoch": 0.5130006898137502, + "grad_norm": 1.171875, + "learning_rate": 2.4070663213237185e-05, + "loss": 1.5042, + "step": 6972 + }, + { + "epoch": 0.5130742699471142, + "grad_norm": 0.734375, + "learning_rate": 2.406487564647177e-05, + "loss": 0.7042, + "step": 6973 + }, + { + "epoch": 0.5131478500804783, + "grad_norm": 1.078125, + "learning_rate": 2.4059088129892725e-05, + "loss": 1.1904, + "step": 6974 + }, + { + "epoch": 0.5132214302138423, + "grad_norm": 0.87890625, + "learning_rate": 2.4053300663810633e-05, + "loss": 0.8076, + "step": 6975 + }, + { + "epoch": 0.5132950103472063, + "grad_norm": 0.9453125, + "learning_rate": 2.4047513248536104e-05, + "loss": 0.7586, + "step": 6976 + }, + { + "epoch": 0.5133685904805703, + "grad_norm": 0.75390625, + "learning_rate": 2.4041725884379744e-05, + "loss": 0.5449, + "step": 6977 + }, + { + "epoch": 0.5134421706139343, + "grad_norm": 0.75, + "learning_rate": 2.4035938571652143e-05, + "loss": 0.7126, + "step": 6978 + }, + { + "epoch": 0.5135157507472983, + "grad_norm": 0.8515625, + "learning_rate": 2.4030151310663904e-05, + "loss": 0.8298, + "step": 6979 + }, + { + "epoch": 0.5135893308806623, + "grad_norm": 0.69921875, + "learning_rate": 2.402436410172561e-05, + "loss": 0.5451, + "step": 6980 + }, + { + "epoch": 0.5136629110140262, + "grad_norm": 0.94921875, + "learning_rate": 2.401857694514785e-05, + "loss": 0.7817, + "step": 6981 + }, + { + "epoch": 0.5137364911473902, + "grad_norm": 0.66015625, + "learning_rate": 2.4012789841241205e-05, + "loss": 0.6687, + "step": 6982 + }, + { + "epoch": 0.5138100712807542, + "grad_norm": 1.1015625, + "learning_rate": 2.4007002790316276e-05, + "loss": 1.3465, + "step": 6983 + }, + { + "epoch": 0.5138836514141182, + "grad_norm": 0.91796875, + "learning_rate": 2.4001215792683625e-05, + "loss": 1.1978, + "step": 6984 + }, + { + "epoch": 0.5139572315474822, + "grad_norm": 1.015625, + "learning_rate": 2.399542884865384e-05, + "loss": 1.0106, + "step": 6985 + }, + { + "epoch": 0.5140308116808462, + "grad_norm": 0.83984375, + "learning_rate": 2.3989641958537486e-05, + "loss": 0.8311, + "step": 6986 + }, + { + "epoch": 0.5141043918142102, + "grad_norm": 0.7265625, + "learning_rate": 2.3983855122645142e-05, + "loss": 0.6216, + "step": 6987 + }, + { + "epoch": 0.5141779719475742, + "grad_norm": 0.875, + "learning_rate": 2.3978068341287385e-05, + "loss": 1.3475, + "step": 6988 + }, + { + "epoch": 0.5142515520809382, + "grad_norm": 1.078125, + "learning_rate": 2.397228161477476e-05, + "loss": 0.9304, + "step": 6989 + }, + { + "epoch": 0.5143251322143021, + "grad_norm": 0.86328125, + "learning_rate": 2.3966494943417838e-05, + "loss": 0.944, + "step": 6990 + }, + { + "epoch": 0.5143987123476661, + "grad_norm": 0.79296875, + "learning_rate": 2.3960708327527184e-05, + "loss": 0.7263, + "step": 6991 + }, + { + "epoch": 0.5144722924810301, + "grad_norm": 0.91015625, + "learning_rate": 2.3954921767413352e-05, + "loss": 0.9915, + "step": 6992 + }, + { + "epoch": 0.5145458726143941, + "grad_norm": 0.98828125, + "learning_rate": 2.3949135263386894e-05, + "loss": 0.864, + "step": 6993 + }, + { + "epoch": 0.5146194527477581, + "grad_norm": 0.97265625, + "learning_rate": 2.3943348815758356e-05, + "loss": 1.1575, + "step": 6994 + }, + { + "epoch": 0.5146930328811221, + "grad_norm": 0.9296875, + "learning_rate": 2.3937562424838294e-05, + "loss": 0.9029, + "step": 6995 + }, + { + "epoch": 0.5147666130144861, + "grad_norm": 0.83203125, + "learning_rate": 2.3931776090937252e-05, + "loss": 0.859, + "step": 6996 + }, + { + "epoch": 0.5148401931478501, + "grad_norm": 0.90625, + "learning_rate": 2.3925989814365774e-05, + "loss": 0.8189, + "step": 6997 + }, + { + "epoch": 0.5149137732812141, + "grad_norm": 0.81640625, + "learning_rate": 2.3920203595434387e-05, + "loss": 0.7732, + "step": 6998 + }, + { + "epoch": 0.514987353414578, + "grad_norm": 0.81640625, + "learning_rate": 2.3914417434453633e-05, + "loss": 0.9874, + "step": 6999 + }, + { + "epoch": 0.515060933547942, + "grad_norm": 0.921875, + "learning_rate": 2.390863133173405e-05, + "loss": 1.3188, + "step": 7000 + }, + { + "epoch": 0.515134513681306, + "grad_norm": 0.88671875, + "learning_rate": 2.390284528758617e-05, + "loss": 1.1229, + "step": 7001 + }, + { + "epoch": 0.51520809381467, + "grad_norm": 0.83984375, + "learning_rate": 2.38970593023205e-05, + "loss": 1.0909, + "step": 7002 + }, + { + "epoch": 0.515281673948034, + "grad_norm": 1.015625, + "learning_rate": 2.3891273376247572e-05, + "loss": 1.0579, + "step": 7003 + }, + { + "epoch": 0.515355254081398, + "grad_norm": 1.0078125, + "learning_rate": 2.388548750967791e-05, + "loss": 0.8992, + "step": 7004 + }, + { + "epoch": 0.515428834214762, + "grad_norm": 0.8046875, + "learning_rate": 2.3879701702922028e-05, + "loss": 0.8122, + "step": 7005 + }, + { + "epoch": 0.515502414348126, + "grad_norm": 0.71875, + "learning_rate": 2.3873915956290446e-05, + "loss": 0.649, + "step": 7006 + }, + { + "epoch": 0.51557599448149, + "grad_norm": 0.90625, + "learning_rate": 2.386813027009366e-05, + "loss": 0.831, + "step": 7007 + }, + { + "epoch": 0.515649574614854, + "grad_norm": 0.69921875, + "learning_rate": 2.3862344644642187e-05, + "loss": 0.6527, + "step": 7008 + }, + { + "epoch": 0.515723154748218, + "grad_norm": 0.8125, + "learning_rate": 2.385655908024653e-05, + "loss": 0.7749, + "step": 7009 + }, + { + "epoch": 0.515796734881582, + "grad_norm": 0.94140625, + "learning_rate": 2.385077357721718e-05, + "loss": 0.7606, + "step": 7010 + }, + { + "epoch": 0.515870315014946, + "grad_norm": 0.734375, + "learning_rate": 2.3844988135864654e-05, + "loss": 0.8744, + "step": 7011 + }, + { + "epoch": 0.51594389514831, + "grad_norm": 0.71875, + "learning_rate": 2.3839202756499426e-05, + "loss": 0.6514, + "step": 7012 + }, + { + "epoch": 0.516017475281674, + "grad_norm": 1.125, + "learning_rate": 2.3833417439431993e-05, + "loss": 1.3973, + "step": 7013 + }, + { + "epoch": 0.516091055415038, + "grad_norm": 1.015625, + "learning_rate": 2.3827632184972842e-05, + "loss": 0.8511, + "step": 7014 + }, + { + "epoch": 0.516164635548402, + "grad_norm": 0.9375, + "learning_rate": 2.382184699343247e-05, + "loss": 0.8448, + "step": 7015 + }, + { + "epoch": 0.516238215681766, + "grad_norm": 0.88671875, + "learning_rate": 2.3816061865121333e-05, + "loss": 0.9146, + "step": 7016 + }, + { + "epoch": 0.5163117958151299, + "grad_norm": 0.703125, + "learning_rate": 2.3810276800349918e-05, + "loss": 0.6113, + "step": 7017 + }, + { + "epoch": 0.5163853759484939, + "grad_norm": 0.8203125, + "learning_rate": 2.38044917994287e-05, + "loss": 0.7426, + "step": 7018 + }, + { + "epoch": 0.5164589560818579, + "grad_norm": 0.734375, + "learning_rate": 2.379870686266815e-05, + "loss": 0.8616, + "step": 7019 + }, + { + "epoch": 0.5165325362152219, + "grad_norm": 1.0703125, + "learning_rate": 2.379292199037874e-05, + "loss": 1.7812, + "step": 7020 + }, + { + "epoch": 0.5166061163485859, + "grad_norm": 0.84765625, + "learning_rate": 2.3787137182870926e-05, + "loss": 0.8274, + "step": 7021 + }, + { + "epoch": 0.5166796964819499, + "grad_norm": 0.9765625, + "learning_rate": 2.3781352440455162e-05, + "loss": 1.1996, + "step": 7022 + }, + { + "epoch": 0.5167532766153139, + "grad_norm": 2.90625, + "learning_rate": 2.3775567763441915e-05, + "loss": 0.9538, + "step": 7023 + }, + { + "epoch": 0.5168268567486779, + "grad_norm": 0.75390625, + "learning_rate": 2.3769783152141633e-05, + "loss": 0.7558, + "step": 7024 + }, + { + "epoch": 0.5169004368820419, + "grad_norm": 1.078125, + "learning_rate": 2.3763998606864764e-05, + "loss": 1.3663, + "step": 7025 + }, + { + "epoch": 0.5169740170154058, + "grad_norm": 1.09375, + "learning_rate": 2.375821412792176e-05, + "loss": 1.1262, + "step": 7026 + }, + { + "epoch": 0.5170475971487698, + "grad_norm": 0.99609375, + "learning_rate": 2.3752429715623052e-05, + "loss": 1.0163, + "step": 7027 + }, + { + "epoch": 0.5171211772821338, + "grad_norm": 1.09375, + "learning_rate": 2.3746645370279084e-05, + "loss": 1.0332, + "step": 7028 + }, + { + "epoch": 0.5171947574154978, + "grad_norm": 0.859375, + "learning_rate": 2.3740861092200307e-05, + "loss": 1.1006, + "step": 7029 + }, + { + "epoch": 0.5172683375488618, + "grad_norm": 0.80859375, + "learning_rate": 2.3735076881697124e-05, + "loss": 1.0731, + "step": 7030 + }, + { + "epoch": 0.5173419176822258, + "grad_norm": 0.8125, + "learning_rate": 2.3729292739079975e-05, + "loss": 0.7434, + "step": 7031 + }, + { + "epoch": 0.5174154978155898, + "grad_norm": 0.609375, + "learning_rate": 2.3723508664659286e-05, + "loss": 0.6844, + "step": 7032 + }, + { + "epoch": 0.5174890779489538, + "grad_norm": 1.125, + "learning_rate": 2.371772465874548e-05, + "loss": 1.2202, + "step": 7033 + }, + { + "epoch": 0.5175626580823178, + "grad_norm": 0.7421875, + "learning_rate": 2.3711940721648966e-05, + "loss": 0.7353, + "step": 7034 + }, + { + "epoch": 0.5176362382156817, + "grad_norm": 0.8046875, + "learning_rate": 2.370615685368016e-05, + "loss": 0.8891, + "step": 7035 + }, + { + "epoch": 0.5177098183490457, + "grad_norm": 0.890625, + "learning_rate": 2.3700373055149472e-05, + "loss": 1.232, + "step": 7036 + }, + { + "epoch": 0.5177833984824097, + "grad_norm": 0.984375, + "learning_rate": 2.3694589326367305e-05, + "loss": 0.8903, + "step": 7037 + }, + { + "epoch": 0.5178569786157737, + "grad_norm": 0.89453125, + "learning_rate": 2.3688805667644074e-05, + "loss": 1.089, + "step": 7038 + }, + { + "epoch": 0.5179305587491377, + "grad_norm": 0.9921875, + "learning_rate": 2.3683022079290158e-05, + "loss": 0.9619, + "step": 7039 + }, + { + "epoch": 0.5180041388825017, + "grad_norm": 0.76171875, + "learning_rate": 2.3677238561615958e-05, + "loss": 1.0581, + "step": 7040 + }, + { + "epoch": 0.5180777190158657, + "grad_norm": 1.0, + "learning_rate": 2.3671455114931867e-05, + "loss": 1.1741, + "step": 7041 + }, + { + "epoch": 0.5181512991492297, + "grad_norm": 0.95703125, + "learning_rate": 2.366567173954827e-05, + "loss": 0.871, + "step": 7042 + }, + { + "epoch": 0.5182248792825938, + "grad_norm": 0.99609375, + "learning_rate": 2.3659888435775565e-05, + "loss": 1.5283, + "step": 7043 + }, + { + "epoch": 0.5182984594159576, + "grad_norm": 0.7109375, + "learning_rate": 2.3654105203924105e-05, + "loss": 0.7814, + "step": 7044 + }, + { + "epoch": 0.5183720395493217, + "grad_norm": 0.80078125, + "learning_rate": 2.364832204430427e-05, + "loss": 0.8017, + "step": 7045 + }, + { + "epoch": 0.5184456196826857, + "grad_norm": 0.9765625, + "learning_rate": 2.364253895722644e-05, + "loss": 1.2968, + "step": 7046 + }, + { + "epoch": 0.5185191998160497, + "grad_norm": 2.703125, + "learning_rate": 2.3636755943000995e-05, + "loss": 0.6544, + "step": 7047 + }, + { + "epoch": 0.5185927799494137, + "grad_norm": 1.0234375, + "learning_rate": 2.3630973001938273e-05, + "loss": 1.4115, + "step": 7048 + }, + { + "epoch": 0.5186663600827777, + "grad_norm": 0.60546875, + "learning_rate": 2.362519013434864e-05, + "loss": 0.6817, + "step": 7049 + }, + { + "epoch": 0.5187399402161417, + "grad_norm": 0.921875, + "learning_rate": 2.361940734054246e-05, + "loss": 1.2427, + "step": 7050 + }, + { + "epoch": 0.5188135203495057, + "grad_norm": 1.0234375, + "learning_rate": 2.361362462083008e-05, + "loss": 0.8087, + "step": 7051 + }, + { + "epoch": 0.5188871004828697, + "grad_norm": 0.8515625, + "learning_rate": 2.3607841975521852e-05, + "loss": 1.005, + "step": 7052 + }, + { + "epoch": 0.5189606806162336, + "grad_norm": 0.9609375, + "learning_rate": 2.3602059404928112e-05, + "loss": 0.801, + "step": 7053 + }, + { + "epoch": 0.5190342607495976, + "grad_norm": 0.94140625, + "learning_rate": 2.35962769093592e-05, + "loss": 1.3282, + "step": 7054 + }, + { + "epoch": 0.5191078408829616, + "grad_norm": 0.69921875, + "learning_rate": 2.359049448912546e-05, + "loss": 0.7246, + "step": 7055 + }, + { + "epoch": 0.5191814210163256, + "grad_norm": 1.0625, + "learning_rate": 2.358471214453723e-05, + "loss": 1.1222, + "step": 7056 + }, + { + "epoch": 0.5192550011496896, + "grad_norm": 0.76953125, + "learning_rate": 2.3578929875904812e-05, + "loss": 0.9401, + "step": 7057 + }, + { + "epoch": 0.5193285812830536, + "grad_norm": 0.77734375, + "learning_rate": 2.3573147683538544e-05, + "loss": 0.7846, + "step": 7058 + }, + { + "epoch": 0.5194021614164176, + "grad_norm": 0.84765625, + "learning_rate": 2.3567365567748744e-05, + "loss": 0.6962, + "step": 7059 + }, + { + "epoch": 0.5194757415497816, + "grad_norm": 0.8828125, + "learning_rate": 2.3561583528845724e-05, + "loss": 0.8994, + "step": 7060 + }, + { + "epoch": 0.5195493216831456, + "grad_norm": 0.86328125, + "learning_rate": 2.3555801567139816e-05, + "loss": 0.9372, + "step": 7061 + }, + { + "epoch": 0.5196229018165095, + "grad_norm": 0.8125, + "learning_rate": 2.3550019682941295e-05, + "loss": 1.2208, + "step": 7062 + }, + { + "epoch": 0.5196964819498735, + "grad_norm": 0.83203125, + "learning_rate": 2.3544237876560484e-05, + "loss": 0.938, + "step": 7063 + }, + { + "epoch": 0.5197700620832375, + "grad_norm": 1.34375, + "learning_rate": 2.3538456148307674e-05, + "loss": 1.3258, + "step": 7064 + }, + { + "epoch": 0.5198436422166015, + "grad_norm": 1.109375, + "learning_rate": 2.3532674498493172e-05, + "loss": 1.2045, + "step": 7065 + }, + { + "epoch": 0.5199172223499655, + "grad_norm": 0.97265625, + "learning_rate": 2.3526892927427248e-05, + "loss": 0.8768, + "step": 7066 + }, + { + "epoch": 0.5199908024833295, + "grad_norm": 0.890625, + "learning_rate": 2.3521111435420198e-05, + "loss": 0.959, + "step": 7067 + }, + { + "epoch": 0.5200643826166935, + "grad_norm": 0.84375, + "learning_rate": 2.3515330022782307e-05, + "loss": 0.958, + "step": 7068 + }, + { + "epoch": 0.5201379627500575, + "grad_norm": 0.83984375, + "learning_rate": 2.350954868982385e-05, + "loss": 0.7206, + "step": 7069 + }, + { + "epoch": 0.5202115428834215, + "grad_norm": 0.890625, + "learning_rate": 2.3503767436855108e-05, + "loss": 0.8993, + "step": 7070 + }, + { + "epoch": 0.5202851230167854, + "grad_norm": 1.0078125, + "learning_rate": 2.349798626418633e-05, + "loss": 1.1485, + "step": 7071 + }, + { + "epoch": 0.5203587031501494, + "grad_norm": 0.91796875, + "learning_rate": 2.3492205172127797e-05, + "loss": 0.8782, + "step": 7072 + }, + { + "epoch": 0.5204322832835134, + "grad_norm": 0.99609375, + "learning_rate": 2.3486424160989753e-05, + "loss": 0.9922, + "step": 7073 + }, + { + "epoch": 0.5205058634168774, + "grad_norm": 0.734375, + "learning_rate": 2.3480643231082475e-05, + "loss": 0.6067, + "step": 7074 + }, + { + "epoch": 0.5205794435502414, + "grad_norm": 0.8203125, + "learning_rate": 2.347486238271622e-05, + "loss": 0.9247, + "step": 7075 + }, + { + "epoch": 0.5206530236836054, + "grad_norm": 0.7421875, + "learning_rate": 2.34690816162012e-05, + "loss": 0.7634, + "step": 7076 + }, + { + "epoch": 0.5207266038169694, + "grad_norm": 0.73828125, + "learning_rate": 2.3463300931847684e-05, + "loss": 0.9113, + "step": 7077 + }, + { + "epoch": 0.5208001839503335, + "grad_norm": 0.94921875, + "learning_rate": 2.34575203299659e-05, + "loss": 0.9231, + "step": 7078 + }, + { + "epoch": 0.5208737640836975, + "grad_norm": 0.77734375, + "learning_rate": 2.34517398108661e-05, + "loss": 0.4933, + "step": 7079 + }, + { + "epoch": 0.5209473442170613, + "grad_norm": 0.84765625, + "learning_rate": 2.344595937485849e-05, + "loss": 0.8535, + "step": 7080 + }, + { + "epoch": 0.5210209243504254, + "grad_norm": 0.96484375, + "learning_rate": 2.3440179022253303e-05, + "loss": 0.8081, + "step": 7081 + }, + { + "epoch": 0.5210945044837894, + "grad_norm": 0.796875, + "learning_rate": 2.3434398753360765e-05, + "loss": 1.1482, + "step": 7082 + }, + { + "epoch": 0.5211680846171534, + "grad_norm": 1.078125, + "learning_rate": 2.3428618568491086e-05, + "loss": 1.1784, + "step": 7083 + }, + { + "epoch": 0.5212416647505174, + "grad_norm": 0.72265625, + "learning_rate": 2.3422838467954495e-05, + "loss": 0.8263, + "step": 7084 + }, + { + "epoch": 0.5213152448838814, + "grad_norm": 0.87109375, + "learning_rate": 2.341705845206117e-05, + "loss": 0.8077, + "step": 7085 + }, + { + "epoch": 0.5213888250172454, + "grad_norm": 1.1796875, + "learning_rate": 2.3411278521121327e-05, + "loss": 1.7572, + "step": 7086 + }, + { + "epoch": 0.5214624051506094, + "grad_norm": 0.8203125, + "learning_rate": 2.3405498675445166e-05, + "loss": 0.8529, + "step": 7087 + }, + { + "epoch": 0.5215359852839734, + "grad_norm": 0.8984375, + "learning_rate": 2.3399718915342893e-05, + "loss": 1.0166, + "step": 7088 + }, + { + "epoch": 0.5216095654173373, + "grad_norm": 0.75390625, + "learning_rate": 2.3393939241124672e-05, + "loss": 0.741, + "step": 7089 + }, + { + "epoch": 0.5216831455507013, + "grad_norm": 1.0703125, + "learning_rate": 2.33881596531007e-05, + "loss": 0.9169, + "step": 7090 + }, + { + "epoch": 0.5217567256840653, + "grad_norm": 0.87109375, + "learning_rate": 2.3382380151581154e-05, + "loss": 0.9867, + "step": 7091 + }, + { + "epoch": 0.5218303058174293, + "grad_norm": 0.8515625, + "learning_rate": 2.337660073687621e-05, + "loss": 1.0336, + "step": 7092 + }, + { + "epoch": 0.5219038859507933, + "grad_norm": 0.734375, + "learning_rate": 2.337082140929604e-05, + "loss": 1.063, + "step": 7093 + }, + { + "epoch": 0.5219774660841573, + "grad_norm": 1.125, + "learning_rate": 2.3365042169150808e-05, + "loss": 1.2778, + "step": 7094 + }, + { + "epoch": 0.5220510462175213, + "grad_norm": 0.9453125, + "learning_rate": 2.3359263016750673e-05, + "loss": 0.8718, + "step": 7095 + }, + { + "epoch": 0.5221246263508853, + "grad_norm": 0.78125, + "learning_rate": 2.335348395240579e-05, + "loss": 0.4805, + "step": 7096 + }, + { + "epoch": 0.5221982064842493, + "grad_norm": 0.8203125, + "learning_rate": 2.3347704976426328e-05, + "loss": 0.8076, + "step": 7097 + }, + { + "epoch": 0.5222717866176132, + "grad_norm": 0.83984375, + "learning_rate": 2.334192608912241e-05, + "loss": 0.7714, + "step": 7098 + }, + { + "epoch": 0.5223453667509772, + "grad_norm": 0.90234375, + "learning_rate": 2.333614729080418e-05, + "loss": 0.9829, + "step": 7099 + }, + { + "epoch": 0.5224189468843412, + "grad_norm": 0.8515625, + "learning_rate": 2.3330368581781783e-05, + "loss": 1.0025, + "step": 7100 + }, + { + "epoch": 0.5224925270177052, + "grad_norm": 0.89453125, + "learning_rate": 2.3324589962365357e-05, + "loss": 0.8973, + "step": 7101 + }, + { + "epoch": 0.5225661071510692, + "grad_norm": 0.984375, + "learning_rate": 2.3318811432865032e-05, + "loss": 1.0828, + "step": 7102 + }, + { + "epoch": 0.5226396872844332, + "grad_norm": 0.9140625, + "learning_rate": 2.3313032993590907e-05, + "loss": 0.9739, + "step": 7103 + }, + { + "epoch": 0.5227132674177972, + "grad_norm": 0.90234375, + "learning_rate": 2.3307254644853122e-05, + "loss": 0.9812, + "step": 7104 + }, + { + "epoch": 0.5227868475511612, + "grad_norm": 0.859375, + "learning_rate": 2.330147638696178e-05, + "loss": 0.8089, + "step": 7105 + }, + { + "epoch": 0.5228604276845252, + "grad_norm": 0.91015625, + "learning_rate": 2.329569822022699e-05, + "loss": 0.848, + "step": 7106 + }, + { + "epoch": 0.5229340078178891, + "grad_norm": 1.1171875, + "learning_rate": 2.3289920144958864e-05, + "loss": 1.4171, + "step": 7107 + }, + { + "epoch": 0.5230075879512531, + "grad_norm": 0.9765625, + "learning_rate": 2.3284142161467493e-05, + "loss": 0.9272, + "step": 7108 + }, + { + "epoch": 0.5230811680846171, + "grad_norm": 0.8359375, + "learning_rate": 2.3278364270062966e-05, + "loss": 1.0096, + "step": 7109 + }, + { + "epoch": 0.5231547482179811, + "grad_norm": 0.84765625, + "learning_rate": 2.327258647105538e-05, + "loss": 0.9844, + "step": 7110 + }, + { + "epoch": 0.5232283283513451, + "grad_norm": 0.93359375, + "learning_rate": 2.3266808764754824e-05, + "loss": 1.5004, + "step": 7111 + }, + { + "epoch": 0.5233019084847091, + "grad_norm": 0.9453125, + "learning_rate": 2.326103115147136e-05, + "loss": 0.9452, + "step": 7112 + }, + { + "epoch": 0.5233754886180731, + "grad_norm": 0.8046875, + "learning_rate": 2.3255253631515062e-05, + "loss": 1.2341, + "step": 7113 + }, + { + "epoch": 0.5234490687514372, + "grad_norm": 0.8359375, + "learning_rate": 2.3249476205196014e-05, + "loss": 0.9148, + "step": 7114 + }, + { + "epoch": 0.5235226488848012, + "grad_norm": 0.90625, + "learning_rate": 2.3243698872824267e-05, + "loss": 1.3095, + "step": 7115 + }, + { + "epoch": 0.523596229018165, + "grad_norm": 0.6875, + "learning_rate": 2.32379216347099e-05, + "loss": 0.7405, + "step": 7116 + }, + { + "epoch": 0.523669809151529, + "grad_norm": 0.921875, + "learning_rate": 2.3232144491162938e-05, + "loss": 1.0501, + "step": 7117 + }, + { + "epoch": 0.5237433892848931, + "grad_norm": 1.03125, + "learning_rate": 2.3226367442493442e-05, + "loss": 0.9641, + "step": 7118 + }, + { + "epoch": 0.5238169694182571, + "grad_norm": 0.75, + "learning_rate": 2.3220590489011455e-05, + "loss": 0.7268, + "step": 7119 + }, + { + "epoch": 0.5238905495516211, + "grad_norm": 0.90625, + "learning_rate": 2.3214813631027025e-05, + "loss": 1.2105, + "step": 7120 + }, + { + "epoch": 0.5239641296849851, + "grad_norm": 0.80078125, + "learning_rate": 2.3209036868850164e-05, + "loss": 0.9473, + "step": 7121 + }, + { + "epoch": 0.5240377098183491, + "grad_norm": 1.0625, + "learning_rate": 2.3203260202790915e-05, + "loss": 1.2614, + "step": 7122 + }, + { + "epoch": 0.5241112899517131, + "grad_norm": 0.90234375, + "learning_rate": 2.31974836331593e-05, + "loss": 1.2449, + "step": 7123 + }, + { + "epoch": 0.5241848700850771, + "grad_norm": 0.984375, + "learning_rate": 2.319170716026533e-05, + "loss": 1.1326, + "step": 7124 + }, + { + "epoch": 0.524258450218441, + "grad_norm": 1.0625, + "learning_rate": 2.318593078441903e-05, + "loss": 1.1696, + "step": 7125 + }, + { + "epoch": 0.524332030351805, + "grad_norm": 0.9765625, + "learning_rate": 2.318015450593039e-05, + "loss": 1.0247, + "step": 7126 + }, + { + "epoch": 0.524405610485169, + "grad_norm": 0.8984375, + "learning_rate": 2.317437832510942e-05, + "loss": 0.9742, + "step": 7127 + }, + { + "epoch": 0.524479190618533, + "grad_norm": 1.046875, + "learning_rate": 2.316860224226612e-05, + "loss": 0.8823, + "step": 7128 + }, + { + "epoch": 0.524552770751897, + "grad_norm": 0.83203125, + "learning_rate": 2.3162826257710488e-05, + "loss": 0.9043, + "step": 7129 + }, + { + "epoch": 0.524626350885261, + "grad_norm": 0.76953125, + "learning_rate": 2.315705037175249e-05, + "loss": 0.7274, + "step": 7130 + }, + { + "epoch": 0.524699931018625, + "grad_norm": 0.6796875, + "learning_rate": 2.315127458470212e-05, + "loss": 0.5809, + "step": 7131 + }, + { + "epoch": 0.524773511151989, + "grad_norm": 0.73046875, + "learning_rate": 2.3145498896869345e-05, + "loss": 1.0719, + "step": 7132 + }, + { + "epoch": 0.524847091285353, + "grad_norm": 1.078125, + "learning_rate": 2.3139723308564146e-05, + "loss": 1.2744, + "step": 7133 + }, + { + "epoch": 0.5249206714187169, + "grad_norm": 0.69921875, + "learning_rate": 2.3133947820096487e-05, + "loss": 0.7277, + "step": 7134 + }, + { + "epoch": 0.5249942515520809, + "grad_norm": 0.80078125, + "learning_rate": 2.3128172431776322e-05, + "loss": 0.8872, + "step": 7135 + }, + { + "epoch": 0.5250678316854449, + "grad_norm": 0.99609375, + "learning_rate": 2.3122397143913604e-05, + "loss": 1.0752, + "step": 7136 + }, + { + "epoch": 0.5251414118188089, + "grad_norm": 0.8984375, + "learning_rate": 2.311662195681829e-05, + "loss": 0.7656, + "step": 7137 + }, + { + "epoch": 0.5252149919521729, + "grad_norm": 0.97265625, + "learning_rate": 2.3110846870800324e-05, + "loss": 1.03, + "step": 7138 + }, + { + "epoch": 0.5252885720855369, + "grad_norm": 0.84375, + "learning_rate": 2.310507188616962e-05, + "loss": 0.8522, + "step": 7139 + }, + { + "epoch": 0.5253621522189009, + "grad_norm": 0.97265625, + "learning_rate": 2.309929700323614e-05, + "loss": 0.817, + "step": 7140 + }, + { + "epoch": 0.5254357323522649, + "grad_norm": 0.9296875, + "learning_rate": 2.30935222223098e-05, + "loss": 1.248, + "step": 7141 + }, + { + "epoch": 0.5255093124856289, + "grad_norm": 0.8359375, + "learning_rate": 2.3087747543700516e-05, + "loss": 0.8816, + "step": 7142 + }, + { + "epoch": 0.5255828926189928, + "grad_norm": 0.8671875, + "learning_rate": 2.3081972967718226e-05, + "loss": 0.814, + "step": 7143 + }, + { + "epoch": 0.5256564727523568, + "grad_norm": 0.93359375, + "learning_rate": 2.307619849467281e-05, + "loss": 0.8109, + "step": 7144 + }, + { + "epoch": 0.5257300528857208, + "grad_norm": 0.85546875, + "learning_rate": 2.307042412487419e-05, + "loss": 0.9709, + "step": 7145 + }, + { + "epoch": 0.5258036330190848, + "grad_norm": 0.73828125, + "learning_rate": 2.306464985863226e-05, + "loss": 0.5595, + "step": 7146 + }, + { + "epoch": 0.5258772131524488, + "grad_norm": 0.75, + "learning_rate": 2.3058875696256917e-05, + "loss": 0.7491, + "step": 7147 + }, + { + "epoch": 0.5259507932858128, + "grad_norm": 0.80078125, + "learning_rate": 2.3053101638058055e-05, + "loss": 1.3381, + "step": 7148 + }, + { + "epoch": 0.5260243734191768, + "grad_norm": 0.765625, + "learning_rate": 2.3047327684345548e-05, + "loss": 0.8883, + "step": 7149 + }, + { + "epoch": 0.5260979535525409, + "grad_norm": 0.8984375, + "learning_rate": 2.304155383542927e-05, + "loss": 1.0357, + "step": 7150 + }, + { + "epoch": 0.5261715336859049, + "grad_norm": 1.0546875, + "learning_rate": 2.3035780091619104e-05, + "loss": 1.2783, + "step": 7151 + }, + { + "epoch": 0.5262451138192688, + "grad_norm": 0.71875, + "learning_rate": 2.303000645322492e-05, + "loss": 0.7235, + "step": 7152 + }, + { + "epoch": 0.5263186939526328, + "grad_norm": 0.7734375, + "learning_rate": 2.3024232920556555e-05, + "loss": 1.0723, + "step": 7153 + }, + { + "epoch": 0.5263922740859968, + "grad_norm": 0.9140625, + "learning_rate": 2.301845949392388e-05, + "loss": 0.7592, + "step": 7154 + }, + { + "epoch": 0.5264658542193608, + "grad_norm": 0.984375, + "learning_rate": 2.3012686173636742e-05, + "loss": 0.8355, + "step": 7155 + }, + { + "epoch": 0.5265394343527248, + "grad_norm": 1.0859375, + "learning_rate": 2.3006912960004985e-05, + "loss": 1.0347, + "step": 7156 + }, + { + "epoch": 0.5266130144860888, + "grad_norm": 0.8984375, + "learning_rate": 2.3001139853338453e-05, + "loss": 1.2346, + "step": 7157 + }, + { + "epoch": 0.5266865946194528, + "grad_norm": 0.7109375, + "learning_rate": 2.299536685394696e-05, + "loss": 0.8342, + "step": 7158 + }, + { + "epoch": 0.5267601747528168, + "grad_norm": 0.94921875, + "learning_rate": 2.298959396214034e-05, + "loss": 1.0139, + "step": 7159 + }, + { + "epoch": 0.5268337548861808, + "grad_norm": 0.8984375, + "learning_rate": 2.2983821178228416e-05, + "loss": 0.7412, + "step": 7160 + }, + { + "epoch": 0.5269073350195447, + "grad_norm": 0.9296875, + "learning_rate": 2.297804850252101e-05, + "loss": 1.248, + "step": 7161 + }, + { + "epoch": 0.5269809151529087, + "grad_norm": 0.8203125, + "learning_rate": 2.2972275935327914e-05, + "loss": 0.7006, + "step": 7162 + }, + { + "epoch": 0.5270544952862727, + "grad_norm": 0.85546875, + "learning_rate": 2.296650347695894e-05, + "loss": 0.9661, + "step": 7163 + }, + { + "epoch": 0.5271280754196367, + "grad_norm": 0.890625, + "learning_rate": 2.2960731127723885e-05, + "loss": 0.8884, + "step": 7164 + }, + { + "epoch": 0.5272016555530007, + "grad_norm": 0.890625, + "learning_rate": 2.2954958887932534e-05, + "loss": 1.1262, + "step": 7165 + }, + { + "epoch": 0.5272752356863647, + "grad_norm": 0.859375, + "learning_rate": 2.2949186757894685e-05, + "loss": 0.7143, + "step": 7166 + }, + { + "epoch": 0.5273488158197287, + "grad_norm": 0.7734375, + "learning_rate": 2.29434147379201e-05, + "loss": 0.6592, + "step": 7167 + }, + { + "epoch": 0.5274223959530927, + "grad_norm": 0.96875, + "learning_rate": 2.2937642828318568e-05, + "loss": 1.0319, + "step": 7168 + }, + { + "epoch": 0.5274959760864567, + "grad_norm": 0.8671875, + "learning_rate": 2.293187102939985e-05, + "loss": 1.0607, + "step": 7169 + }, + { + "epoch": 0.5275695562198206, + "grad_norm": 0.87890625, + "learning_rate": 2.2926099341473714e-05, + "loss": 1.3403, + "step": 7170 + }, + { + "epoch": 0.5276431363531846, + "grad_norm": 0.79296875, + "learning_rate": 2.29203277648499e-05, + "loss": 0.7048, + "step": 7171 + }, + { + "epoch": 0.5277167164865486, + "grad_norm": 0.9765625, + "learning_rate": 2.2914556299838166e-05, + "loss": 1.1061, + "step": 7172 + }, + { + "epoch": 0.5277902966199126, + "grad_norm": 0.68359375, + "learning_rate": 2.290878494674826e-05, + "loss": 0.6776, + "step": 7173 + }, + { + "epoch": 0.5278638767532766, + "grad_norm": 0.80859375, + "learning_rate": 2.2903013705889916e-05, + "loss": 0.7855, + "step": 7174 + }, + { + "epoch": 0.5279374568866406, + "grad_norm": 0.7734375, + "learning_rate": 2.2897242577572868e-05, + "loss": 0.9804, + "step": 7175 + }, + { + "epoch": 0.5280110370200046, + "grad_norm": 0.9296875, + "learning_rate": 2.2891471562106832e-05, + "loss": 1.1, + "step": 7176 + }, + { + "epoch": 0.5280846171533686, + "grad_norm": 0.95703125, + "learning_rate": 2.288570065980154e-05, + "loss": 1.1266, + "step": 7177 + }, + { + "epoch": 0.5281581972867326, + "grad_norm": 0.796875, + "learning_rate": 2.28799298709667e-05, + "loss": 0.9623, + "step": 7178 + }, + { + "epoch": 0.5282317774200965, + "grad_norm": 0.75390625, + "learning_rate": 2.2874159195912015e-05, + "loss": 0.8287, + "step": 7179 + }, + { + "epoch": 0.5283053575534605, + "grad_norm": 0.8359375, + "learning_rate": 2.28683886349472e-05, + "loss": 0.9611, + "step": 7180 + }, + { + "epoch": 0.5283789376868245, + "grad_norm": 0.7265625, + "learning_rate": 2.2862618188381936e-05, + "loss": 0.9517, + "step": 7181 + }, + { + "epoch": 0.5284525178201885, + "grad_norm": 0.86328125, + "learning_rate": 2.2856847856525916e-05, + "loss": 0.797, + "step": 7182 + }, + { + "epoch": 0.5285260979535525, + "grad_norm": 0.84375, + "learning_rate": 2.285107763968882e-05, + "loss": 1.0026, + "step": 7183 + }, + { + "epoch": 0.5285996780869165, + "grad_norm": 0.9921875, + "learning_rate": 2.2845307538180342e-05, + "loss": 0.999, + "step": 7184 + }, + { + "epoch": 0.5286732582202806, + "grad_norm": 0.8125, + "learning_rate": 2.283953755231013e-05, + "loss": 1.0142, + "step": 7185 + }, + { + "epoch": 0.5287468383536446, + "grad_norm": 1.0078125, + "learning_rate": 2.2833767682387856e-05, + "loss": 0.991, + "step": 7186 + }, + { + "epoch": 0.5288204184870086, + "grad_norm": 0.76171875, + "learning_rate": 2.2827997928723177e-05, + "loss": 0.8853, + "step": 7187 + }, + { + "epoch": 0.5288939986203725, + "grad_norm": 1.015625, + "learning_rate": 2.2822228291625746e-05, + "loss": 1.3182, + "step": 7188 + }, + { + "epoch": 0.5289675787537365, + "grad_norm": 0.6640625, + "learning_rate": 2.281645877140522e-05, + "loss": 0.722, + "step": 7189 + }, + { + "epoch": 0.5290411588871005, + "grad_norm": 0.828125, + "learning_rate": 2.281068936837122e-05, + "loss": 0.8684, + "step": 7190 + }, + { + "epoch": 0.5291147390204645, + "grad_norm": 0.8828125, + "learning_rate": 2.2804920082833385e-05, + "loss": 0.7389, + "step": 7191 + }, + { + "epoch": 0.5291883191538285, + "grad_norm": 0.87109375, + "learning_rate": 2.279915091510134e-05, + "loss": 1.354, + "step": 7192 + }, + { + "epoch": 0.5292618992871925, + "grad_norm": 0.796875, + "learning_rate": 2.279338186548472e-05, + "loss": 0.796, + "step": 7193 + }, + { + "epoch": 0.5293354794205565, + "grad_norm": 0.7578125, + "learning_rate": 2.278761293429312e-05, + "loss": 0.7321, + "step": 7194 + }, + { + "epoch": 0.5294090595539205, + "grad_norm": 0.828125, + "learning_rate": 2.2781844121836155e-05, + "loss": 0.8823, + "step": 7195 + }, + { + "epoch": 0.5294826396872845, + "grad_norm": 0.8046875, + "learning_rate": 2.2776075428423426e-05, + "loss": 0.7476, + "step": 7196 + }, + { + "epoch": 0.5295562198206484, + "grad_norm": 0.9453125, + "learning_rate": 2.277030685436453e-05, + "loss": 0.8428, + "step": 7197 + }, + { + "epoch": 0.5296297999540124, + "grad_norm": 0.79296875, + "learning_rate": 2.2764538399969065e-05, + "loss": 0.8999, + "step": 7198 + }, + { + "epoch": 0.5297033800873764, + "grad_norm": 0.78515625, + "learning_rate": 2.275877006554659e-05, + "loss": 0.8448, + "step": 7199 + }, + { + "epoch": 0.5297769602207404, + "grad_norm": 0.84765625, + "learning_rate": 2.275300185140669e-05, + "loss": 0.9239, + "step": 7200 + }, + { + "epoch": 0.5298505403541044, + "grad_norm": 0.91796875, + "learning_rate": 2.274723375785894e-05, + "loss": 0.9951, + "step": 7201 + }, + { + "epoch": 0.5299241204874684, + "grad_norm": 0.9609375, + "learning_rate": 2.2741465785212905e-05, + "loss": 0.832, + "step": 7202 + }, + { + "epoch": 0.5299977006208324, + "grad_norm": 0.82421875, + "learning_rate": 2.273569793377813e-05, + "loss": 0.6314, + "step": 7203 + }, + { + "epoch": 0.5300712807541964, + "grad_norm": 0.78515625, + "learning_rate": 2.2729930203864167e-05, + "loss": 0.9671, + "step": 7204 + }, + { + "epoch": 0.5301448608875604, + "grad_norm": 0.8046875, + "learning_rate": 2.2724162595780564e-05, + "loss": 0.9649, + "step": 7205 + }, + { + "epoch": 0.5302184410209243, + "grad_norm": 0.8125, + "learning_rate": 2.271839510983686e-05, + "loss": 0.7653, + "step": 7206 + }, + { + "epoch": 0.5302920211542883, + "grad_norm": 0.7890625, + "learning_rate": 2.271262774634258e-05, + "loss": 0.6205, + "step": 7207 + }, + { + "epoch": 0.5303656012876523, + "grad_norm": 0.93359375, + "learning_rate": 2.2706860505607246e-05, + "loss": 1.0973, + "step": 7208 + }, + { + "epoch": 0.5304391814210163, + "grad_norm": 0.73046875, + "learning_rate": 2.2701093387940378e-05, + "loss": 0.7639, + "step": 7209 + }, + { + "epoch": 0.5305127615543803, + "grad_norm": 0.859375, + "learning_rate": 2.2695326393651485e-05, + "loss": 0.7974, + "step": 7210 + }, + { + "epoch": 0.5305863416877443, + "grad_norm": 0.9296875, + "learning_rate": 2.2689559523050073e-05, + "loss": 1.028, + "step": 7211 + }, + { + "epoch": 0.5306599218211083, + "grad_norm": 0.79296875, + "learning_rate": 2.268379277644565e-05, + "loss": 0.7399, + "step": 7212 + }, + { + "epoch": 0.5307335019544723, + "grad_norm": 1.015625, + "learning_rate": 2.267802615414768e-05, + "loss": 1.0821, + "step": 7213 + }, + { + "epoch": 0.5308070820878363, + "grad_norm": 1.1953125, + "learning_rate": 2.267225965646566e-05, + "loss": 1.3199, + "step": 7214 + }, + { + "epoch": 0.5308806622212002, + "grad_norm": 1.296875, + "learning_rate": 2.266649328370907e-05, + "loss": 1.1325, + "step": 7215 + }, + { + "epoch": 0.5309542423545642, + "grad_norm": 0.9453125, + "learning_rate": 2.2660727036187384e-05, + "loss": 1.1914, + "step": 7216 + }, + { + "epoch": 0.5310278224879282, + "grad_norm": 0.94140625, + "learning_rate": 2.2654960914210053e-05, + "loss": 1.0181, + "step": 7217 + }, + { + "epoch": 0.5311014026212922, + "grad_norm": 0.69140625, + "learning_rate": 2.264919491808654e-05, + "loss": 0.9841, + "step": 7218 + }, + { + "epoch": 0.5311749827546562, + "grad_norm": 0.6171875, + "learning_rate": 2.2643429048126298e-05, + "loss": 0.6514, + "step": 7219 + }, + { + "epoch": 0.5312485628880202, + "grad_norm": 1.2578125, + "learning_rate": 2.2637663304638764e-05, + "loss": 1.4098, + "step": 7220 + }, + { + "epoch": 0.5313221430213843, + "grad_norm": 0.77734375, + "learning_rate": 2.2631897687933388e-05, + "loss": 1.1333, + "step": 7221 + }, + { + "epoch": 0.5313957231547483, + "grad_norm": 0.890625, + "learning_rate": 2.2626132198319582e-05, + "loss": 1.2393, + "step": 7222 + }, + { + "epoch": 0.5314693032881123, + "grad_norm": 0.7578125, + "learning_rate": 2.262036683610678e-05, + "loss": 0.8314, + "step": 7223 + }, + { + "epoch": 0.5315428834214762, + "grad_norm": 0.7578125, + "learning_rate": 2.2614601601604393e-05, + "loss": 0.611, + "step": 7224 + }, + { + "epoch": 0.5316164635548402, + "grad_norm": 0.75390625, + "learning_rate": 2.2608836495121845e-05, + "loss": 0.7074, + "step": 7225 + }, + { + "epoch": 0.5316900436882042, + "grad_norm": 0.7578125, + "learning_rate": 2.2603071516968515e-05, + "loss": 1.0695, + "step": 7226 + }, + { + "epoch": 0.5317636238215682, + "grad_norm": 0.84375, + "learning_rate": 2.259730666745381e-05, + "loss": 0.7008, + "step": 7227 + }, + { + "epoch": 0.5318372039549322, + "grad_norm": 0.84765625, + "learning_rate": 2.2591541946887118e-05, + "loss": 0.6894, + "step": 7228 + }, + { + "epoch": 0.5319107840882962, + "grad_norm": 0.7890625, + "learning_rate": 2.2585777355577814e-05, + "loss": 0.7755, + "step": 7229 + }, + { + "epoch": 0.5319843642216602, + "grad_norm": 0.8125, + "learning_rate": 2.25800128938353e-05, + "loss": 0.986, + "step": 7230 + }, + { + "epoch": 0.5320579443550242, + "grad_norm": 0.76953125, + "learning_rate": 2.2574248561968904e-05, + "loss": 0.8011, + "step": 7231 + }, + { + "epoch": 0.5321315244883882, + "grad_norm": 0.875, + "learning_rate": 2.2568484360288014e-05, + "loss": 1.0675, + "step": 7232 + }, + { + "epoch": 0.5322051046217521, + "grad_norm": 0.765625, + "learning_rate": 2.2562720289101975e-05, + "loss": 0.7311, + "step": 7233 + }, + { + "epoch": 0.5322786847551161, + "grad_norm": 0.62890625, + "learning_rate": 2.2556956348720138e-05, + "loss": 0.6406, + "step": 7234 + }, + { + "epoch": 0.5323522648884801, + "grad_norm": 0.796875, + "learning_rate": 2.255119253945183e-05, + "loss": 0.6657, + "step": 7235 + }, + { + "epoch": 0.5324258450218441, + "grad_norm": 0.7734375, + "learning_rate": 2.25454288616064e-05, + "loss": 0.8719, + "step": 7236 + }, + { + "epoch": 0.5324994251552081, + "grad_norm": 0.76953125, + "learning_rate": 2.2539665315493164e-05, + "loss": 0.777, + "step": 7237 + }, + { + "epoch": 0.5325730052885721, + "grad_norm": 0.87890625, + "learning_rate": 2.2533901901421445e-05, + "loss": 1.1499, + "step": 7238 + }, + { + "epoch": 0.5326465854219361, + "grad_norm": 0.98828125, + "learning_rate": 2.2528138619700562e-05, + "loss": 0.8541, + "step": 7239 + }, + { + "epoch": 0.5327201655553001, + "grad_norm": 1.125, + "learning_rate": 2.25223754706398e-05, + "loss": 0.9575, + "step": 7240 + }, + { + "epoch": 0.5327937456886641, + "grad_norm": 0.76171875, + "learning_rate": 2.2516612454548468e-05, + "loss": 0.8939, + "step": 7241 + }, + { + "epoch": 0.532867325822028, + "grad_norm": 0.71484375, + "learning_rate": 2.2510849571735857e-05, + "loss": 0.7688, + "step": 7242 + }, + { + "epoch": 0.532940905955392, + "grad_norm": 0.92578125, + "learning_rate": 2.250508682251125e-05, + "loss": 0.7608, + "step": 7243 + }, + { + "epoch": 0.533014486088756, + "grad_norm": 0.96484375, + "learning_rate": 2.249932420718392e-05, + "loss": 1.3086, + "step": 7244 + }, + { + "epoch": 0.53308806622212, + "grad_norm": 0.76171875, + "learning_rate": 2.2493561726063135e-05, + "loss": 0.7568, + "step": 7245 + }, + { + "epoch": 0.533161646355484, + "grad_norm": 0.8828125, + "learning_rate": 2.2487799379458156e-05, + "loss": 0.693, + "step": 7246 + }, + { + "epoch": 0.533235226488848, + "grad_norm": 0.80078125, + "learning_rate": 2.248203716767824e-05, + "loss": 0.8562, + "step": 7247 + }, + { + "epoch": 0.533308806622212, + "grad_norm": 0.86328125, + "learning_rate": 2.247627509103264e-05, + "loss": 0.9862, + "step": 7248 + }, + { + "epoch": 0.533382386755576, + "grad_norm": 0.77734375, + "learning_rate": 2.247051314983058e-05, + "loss": 0.9593, + "step": 7249 + }, + { + "epoch": 0.53345596688894, + "grad_norm": 0.73046875, + "learning_rate": 2.2464751344381307e-05, + "loss": 1.0049, + "step": 7250 + }, + { + "epoch": 0.5335295470223039, + "grad_norm": 0.859375, + "learning_rate": 2.245898967499404e-05, + "loss": 1.0317, + "step": 7251 + }, + { + "epoch": 0.5336031271556679, + "grad_norm": 0.73046875, + "learning_rate": 2.2453228141977996e-05, + "loss": 0.8445, + "step": 7252 + }, + { + "epoch": 0.5336767072890319, + "grad_norm": 0.92578125, + "learning_rate": 2.2447466745642397e-05, + "loss": 1.3378, + "step": 7253 + }, + { + "epoch": 0.5337502874223959, + "grad_norm": 0.76171875, + "learning_rate": 2.2441705486296427e-05, + "loss": 0.7162, + "step": 7254 + }, + { + "epoch": 0.53382386755576, + "grad_norm": 0.9140625, + "learning_rate": 2.2435944364249294e-05, + "loss": 1.0402, + "step": 7255 + }, + { + "epoch": 0.533897447689124, + "grad_norm": 0.8046875, + "learning_rate": 2.2430183379810178e-05, + "loss": 1.0057, + "step": 7256 + }, + { + "epoch": 0.533971027822488, + "grad_norm": 0.875, + "learning_rate": 2.242442253328828e-05, + "loss": 1.0033, + "step": 7257 + }, + { + "epoch": 0.534044607955852, + "grad_norm": 0.78515625, + "learning_rate": 2.241866182499275e-05, + "loss": 0.6219, + "step": 7258 + }, + { + "epoch": 0.534118188089216, + "grad_norm": 1.0078125, + "learning_rate": 2.2412901255232765e-05, + "loss": 1.0824, + "step": 7259 + }, + { + "epoch": 0.5341917682225799, + "grad_norm": 0.69921875, + "learning_rate": 2.2407140824317486e-05, + "loss": 0.7379, + "step": 7260 + }, + { + "epoch": 0.5342653483559439, + "grad_norm": 0.7421875, + "learning_rate": 2.240138053255606e-05, + "loss": 0.7896, + "step": 7261 + }, + { + "epoch": 0.5343389284893079, + "grad_norm": 0.92578125, + "learning_rate": 2.2395620380257638e-05, + "loss": 1.1455, + "step": 7262 + }, + { + "epoch": 0.5344125086226719, + "grad_norm": 1.0234375, + "learning_rate": 2.2389860367731346e-05, + "loss": 1.0093, + "step": 7263 + }, + { + "epoch": 0.5344860887560359, + "grad_norm": 0.984375, + "learning_rate": 2.2384100495286315e-05, + "loss": 1.1271, + "step": 7264 + }, + { + "epoch": 0.5345596688893999, + "grad_norm": 0.97265625, + "learning_rate": 2.2378340763231673e-05, + "loss": 1.311, + "step": 7265 + }, + { + "epoch": 0.5346332490227639, + "grad_norm": 0.828125, + "learning_rate": 2.237258117187654e-05, + "loss": 1.0288, + "step": 7266 + }, + { + "epoch": 0.5347068291561279, + "grad_norm": 1.0859375, + "learning_rate": 2.2366821721530005e-05, + "loss": 1.013, + "step": 7267 + }, + { + "epoch": 0.5347804092894919, + "grad_norm": 1.015625, + "learning_rate": 2.236106241250117e-05, + "loss": 1.0508, + "step": 7268 + }, + { + "epoch": 0.5348539894228558, + "grad_norm": 1.015625, + "learning_rate": 2.2355303245099128e-05, + "loss": 1.031, + "step": 7269 + }, + { + "epoch": 0.5349275695562198, + "grad_norm": 1.1796875, + "learning_rate": 2.2349544219632974e-05, + "loss": 1.4136, + "step": 7270 + }, + { + "epoch": 0.5350011496895838, + "grad_norm": 0.83203125, + "learning_rate": 2.2343785336411783e-05, + "loss": 0.6587, + "step": 7271 + }, + { + "epoch": 0.5350747298229478, + "grad_norm": 0.79296875, + "learning_rate": 2.2338026595744604e-05, + "loss": 0.6274, + "step": 7272 + }, + { + "epoch": 0.5351483099563118, + "grad_norm": 0.90625, + "learning_rate": 2.2332267997940516e-05, + "loss": 1.0917, + "step": 7273 + }, + { + "epoch": 0.5352218900896758, + "grad_norm": 0.91796875, + "learning_rate": 2.232650954330856e-05, + "loss": 1.4517, + "step": 7274 + }, + { + "epoch": 0.5352954702230398, + "grad_norm": 0.76953125, + "learning_rate": 2.2320751232157793e-05, + "loss": 0.8355, + "step": 7275 + }, + { + "epoch": 0.5353690503564038, + "grad_norm": 0.9453125, + "learning_rate": 2.2314993064797244e-05, + "loss": 1.3266, + "step": 7276 + }, + { + "epoch": 0.5354426304897678, + "grad_norm": 1.171875, + "learning_rate": 2.2309235041535947e-05, + "loss": 1.5163, + "step": 7277 + }, + { + "epoch": 0.5355162106231317, + "grad_norm": 0.8984375, + "learning_rate": 2.230347716268292e-05, + "loss": 1.0089, + "step": 7278 + }, + { + "epoch": 0.5355897907564957, + "grad_norm": 0.98046875, + "learning_rate": 2.2297719428547186e-05, + "loss": 1.3617, + "step": 7279 + }, + { + "epoch": 0.5356633708898597, + "grad_norm": 0.828125, + "learning_rate": 2.229196183943775e-05, + "loss": 1.0673, + "step": 7280 + }, + { + "epoch": 0.5357369510232237, + "grad_norm": 0.73828125, + "learning_rate": 2.2286204395663602e-05, + "loss": 0.8204, + "step": 7281 + }, + { + "epoch": 0.5358105311565877, + "grad_norm": 0.91015625, + "learning_rate": 2.228044709753373e-05, + "loss": 1.0077, + "step": 7282 + }, + { + "epoch": 0.5358841112899517, + "grad_norm": 1.0546875, + "learning_rate": 2.2274689945357133e-05, + "loss": 0.9992, + "step": 7283 + }, + { + "epoch": 0.5359576914233157, + "grad_norm": 1.1171875, + "learning_rate": 2.226893293944278e-05, + "loss": 1.1013, + "step": 7284 + }, + { + "epoch": 0.5360312715566797, + "grad_norm": 0.85546875, + "learning_rate": 2.2263176080099645e-05, + "loss": 1.0745, + "step": 7285 + }, + { + "epoch": 0.5361048516900437, + "grad_norm": 0.79296875, + "learning_rate": 2.2257419367636673e-05, + "loss": 0.9617, + "step": 7286 + }, + { + "epoch": 0.5361784318234076, + "grad_norm": 0.92578125, + "learning_rate": 2.225166280236282e-05, + "loss": 0.7743, + "step": 7287 + }, + { + "epoch": 0.5362520119567716, + "grad_norm": 0.8828125, + "learning_rate": 2.2245906384587036e-05, + "loss": 1.2785, + "step": 7288 + }, + { + "epoch": 0.5363255920901356, + "grad_norm": 0.78125, + "learning_rate": 2.224015011461826e-05, + "loss": 0.8162, + "step": 7289 + }, + { + "epoch": 0.5363991722234996, + "grad_norm": 0.78515625, + "learning_rate": 2.223439399276541e-05, + "loss": 0.7667, + "step": 7290 + }, + { + "epoch": 0.5364727523568636, + "grad_norm": 1.15625, + "learning_rate": 2.222863801933741e-05, + "loss": 1.3415, + "step": 7291 + }, + { + "epoch": 0.5365463324902277, + "grad_norm": 0.96875, + "learning_rate": 2.2222882194643175e-05, + "loss": 0.9342, + "step": 7292 + }, + { + "epoch": 0.5366199126235917, + "grad_norm": 0.80078125, + "learning_rate": 2.2217126518991604e-05, + "loss": 0.8758, + "step": 7293 + }, + { + "epoch": 0.5366934927569557, + "grad_norm": 0.98046875, + "learning_rate": 2.221137099269161e-05, + "loss": 0.9747, + "step": 7294 + }, + { + "epoch": 0.5367670728903197, + "grad_norm": 0.91015625, + "learning_rate": 2.2205615616052057e-05, + "loss": 0.8359, + "step": 7295 + }, + { + "epoch": 0.5368406530236836, + "grad_norm": 0.94921875, + "learning_rate": 2.219986038938183e-05, + "loss": 1.2206, + "step": 7296 + }, + { + "epoch": 0.5369142331570476, + "grad_norm": 0.61328125, + "learning_rate": 2.2194105312989813e-05, + "loss": 0.6821, + "step": 7297 + }, + { + "epoch": 0.5369878132904116, + "grad_norm": 0.921875, + "learning_rate": 2.218835038718487e-05, + "loss": 0.8949, + "step": 7298 + }, + { + "epoch": 0.5370613934237756, + "grad_norm": 0.87890625, + "learning_rate": 2.2182595612275848e-05, + "loss": 0.7176, + "step": 7299 + }, + { + "epoch": 0.5371349735571396, + "grad_norm": 0.86328125, + "learning_rate": 2.2176840988571594e-05, + "loss": 1.0669, + "step": 7300 + }, + { + "epoch": 0.5372085536905036, + "grad_norm": 0.9921875, + "learning_rate": 2.2171086516380955e-05, + "loss": 0.9216, + "step": 7301 + }, + { + "epoch": 0.5372821338238676, + "grad_norm": 0.7578125, + "learning_rate": 2.216533219601276e-05, + "loss": 0.6807, + "step": 7302 + }, + { + "epoch": 0.5373557139572316, + "grad_norm": 0.8671875, + "learning_rate": 2.2159578027775833e-05, + "loss": 0.9525, + "step": 7303 + }, + { + "epoch": 0.5374292940905956, + "grad_norm": 0.83984375, + "learning_rate": 2.2153824011978984e-05, + "loss": 0.631, + "step": 7304 + }, + { + "epoch": 0.5375028742239595, + "grad_norm": 0.921875, + "learning_rate": 2.214807014893103e-05, + "loss": 1.2324, + "step": 7305 + }, + { + "epoch": 0.5375764543573235, + "grad_norm": 1.046875, + "learning_rate": 2.2142316438940757e-05, + "loss": 1.3342, + "step": 7306 + }, + { + "epoch": 0.5376500344906875, + "grad_norm": 0.8359375, + "learning_rate": 2.213656288231698e-05, + "loss": 1.0739, + "step": 7307 + }, + { + "epoch": 0.5377236146240515, + "grad_norm": 0.96484375, + "learning_rate": 2.213080947936845e-05, + "loss": 0.8572, + "step": 7308 + }, + { + "epoch": 0.5377971947574155, + "grad_norm": 0.921875, + "learning_rate": 2.212505623040395e-05, + "loss": 0.905, + "step": 7309 + }, + { + "epoch": 0.5378707748907795, + "grad_norm": 0.75, + "learning_rate": 2.211930313573226e-05, + "loss": 0.7273, + "step": 7310 + }, + { + "epoch": 0.5379443550241435, + "grad_norm": 0.83984375, + "learning_rate": 2.2113550195662132e-05, + "loss": 0.9139, + "step": 7311 + }, + { + "epoch": 0.5380179351575075, + "grad_norm": 0.72265625, + "learning_rate": 2.2107797410502324e-05, + "loss": 1.1933, + "step": 7312 + }, + { + "epoch": 0.5380915152908715, + "grad_norm": 0.92578125, + "learning_rate": 2.2102044780561552e-05, + "loss": 0.8918, + "step": 7313 + }, + { + "epoch": 0.5381650954242354, + "grad_norm": 0.83203125, + "learning_rate": 2.2096292306148565e-05, + "loss": 0.9323, + "step": 7314 + }, + { + "epoch": 0.5382386755575994, + "grad_norm": 0.73828125, + "learning_rate": 2.2090539987572083e-05, + "loss": 0.8173, + "step": 7315 + }, + { + "epoch": 0.5383122556909634, + "grad_norm": 0.8671875, + "learning_rate": 2.208478782514083e-05, + "loss": 1.0322, + "step": 7316 + }, + { + "epoch": 0.5383858358243274, + "grad_norm": 0.890625, + "learning_rate": 2.207903581916351e-05, + "loss": 0.5873, + "step": 7317 + }, + { + "epoch": 0.5384594159576914, + "grad_norm": 0.83203125, + "learning_rate": 2.2073283969948816e-05, + "loss": 1.1847, + "step": 7318 + }, + { + "epoch": 0.5385329960910554, + "grad_norm": 0.91015625, + "learning_rate": 2.206753227780544e-05, + "loss": 1.084, + "step": 7319 + }, + { + "epoch": 0.5386065762244194, + "grad_norm": 0.89453125, + "learning_rate": 2.2061780743042073e-05, + "loss": 1.1231, + "step": 7320 + }, + { + "epoch": 0.5386801563577834, + "grad_norm": 1.0546875, + "learning_rate": 2.205602936596739e-05, + "loss": 0.9485, + "step": 7321 + }, + { + "epoch": 0.5387537364911474, + "grad_norm": 0.76171875, + "learning_rate": 2.2050278146890042e-05, + "loss": 0.7016, + "step": 7322 + }, + { + "epoch": 0.5388273166245113, + "grad_norm": 0.859375, + "learning_rate": 2.2044527086118693e-05, + "loss": 1.0751, + "step": 7323 + }, + { + "epoch": 0.5389008967578753, + "grad_norm": 1.0390625, + "learning_rate": 2.2038776183961998e-05, + "loss": 0.8132, + "step": 7324 + }, + { + "epoch": 0.5389744768912393, + "grad_norm": 0.71484375, + "learning_rate": 2.203302544072859e-05, + "loss": 0.8207, + "step": 7325 + }, + { + "epoch": 0.5390480570246033, + "grad_norm": 0.87109375, + "learning_rate": 2.2027274856727115e-05, + "loss": 1.4968, + "step": 7326 + }, + { + "epoch": 0.5391216371579673, + "grad_norm": 1.171875, + "learning_rate": 2.2021524432266173e-05, + "loss": 1.3769, + "step": 7327 + }, + { + "epoch": 0.5391952172913314, + "grad_norm": 0.8359375, + "learning_rate": 2.2015774167654386e-05, + "loss": 1.0462, + "step": 7328 + }, + { + "epoch": 0.5392687974246954, + "grad_norm": 0.61328125, + "learning_rate": 2.2010024063200364e-05, + "loss": 0.5404, + "step": 7329 + }, + { + "epoch": 0.5393423775580594, + "grad_norm": 0.9453125, + "learning_rate": 2.200427411921271e-05, + "loss": 1.2398, + "step": 7330 + }, + { + "epoch": 0.5394159576914234, + "grad_norm": 1.1484375, + "learning_rate": 2.1998524335999998e-05, + "loss": 1.2768, + "step": 7331 + }, + { + "epoch": 0.5394895378247873, + "grad_norm": 0.8671875, + "learning_rate": 2.199277471387082e-05, + "loss": 0.8722, + "step": 7332 + }, + { + "epoch": 0.5395631179581513, + "grad_norm": 0.7890625, + "learning_rate": 2.1987025253133743e-05, + "loss": 1.1583, + "step": 7333 + }, + { + "epoch": 0.5396366980915153, + "grad_norm": 0.89453125, + "learning_rate": 2.1981275954097323e-05, + "loss": 0.8982, + "step": 7334 + }, + { + "epoch": 0.5397102782248793, + "grad_norm": 0.83203125, + "learning_rate": 2.1975526817070137e-05, + "loss": 1.0524, + "step": 7335 + }, + { + "epoch": 0.5397838583582433, + "grad_norm": 1.0625, + "learning_rate": 2.19697778423607e-05, + "loss": 0.9947, + "step": 7336 + }, + { + "epoch": 0.5398574384916073, + "grad_norm": 0.79296875, + "learning_rate": 2.1964029030277567e-05, + "loss": 1.2866, + "step": 7337 + }, + { + "epoch": 0.5399310186249713, + "grad_norm": 0.68359375, + "learning_rate": 2.1958280381129263e-05, + "loss": 0.707, + "step": 7338 + }, + { + "epoch": 0.5400045987583353, + "grad_norm": 0.671875, + "learning_rate": 2.1952531895224313e-05, + "loss": 0.5779, + "step": 7339 + }, + { + "epoch": 0.5400781788916993, + "grad_norm": 0.78515625, + "learning_rate": 2.194678357287121e-05, + "loss": 0.7908, + "step": 7340 + }, + { + "epoch": 0.5401517590250632, + "grad_norm": 0.86328125, + "learning_rate": 2.194103541437847e-05, + "loss": 1.3806, + "step": 7341 + }, + { + "epoch": 0.5402253391584272, + "grad_norm": 0.703125, + "learning_rate": 2.193528742005458e-05, + "loss": 0.8333, + "step": 7342 + }, + { + "epoch": 0.5402989192917912, + "grad_norm": 0.9609375, + "learning_rate": 2.192953959020802e-05, + "loss": 1.1764, + "step": 7343 + }, + { + "epoch": 0.5403724994251552, + "grad_norm": 0.8046875, + "learning_rate": 2.1923791925147285e-05, + "loss": 0.8591, + "step": 7344 + }, + { + "epoch": 0.5404460795585192, + "grad_norm": 0.79296875, + "learning_rate": 2.191804442518082e-05, + "loss": 0.6764, + "step": 7345 + }, + { + "epoch": 0.5405196596918832, + "grad_norm": 0.6484375, + "learning_rate": 2.1912297090617084e-05, + "loss": 0.5337, + "step": 7346 + }, + { + "epoch": 0.5405932398252472, + "grad_norm": 0.8671875, + "learning_rate": 2.1906549921764535e-05, + "loss": 0.9451, + "step": 7347 + }, + { + "epoch": 0.5406668199586112, + "grad_norm": 0.62109375, + "learning_rate": 2.190080291893161e-05, + "loss": 0.6899, + "step": 7348 + }, + { + "epoch": 0.5407404000919752, + "grad_norm": 0.84765625, + "learning_rate": 2.1895056082426744e-05, + "loss": 0.7317, + "step": 7349 + }, + { + "epoch": 0.5408139802253391, + "grad_norm": 0.93359375, + "learning_rate": 2.1889309412558346e-05, + "loss": 0.7372, + "step": 7350 + }, + { + "epoch": 0.5408875603587031, + "grad_norm": 0.7421875, + "learning_rate": 2.188356290963484e-05, + "loss": 1.036, + "step": 7351 + }, + { + "epoch": 0.5409611404920671, + "grad_norm": 0.91015625, + "learning_rate": 2.1877816573964626e-05, + "loss": 0.9117, + "step": 7352 + }, + { + "epoch": 0.5410347206254311, + "grad_norm": 1.0234375, + "learning_rate": 2.1872070405856106e-05, + "loss": 0.9015, + "step": 7353 + }, + { + "epoch": 0.5411083007587951, + "grad_norm": 0.953125, + "learning_rate": 2.186632440561765e-05, + "loss": 0.8015, + "step": 7354 + }, + { + "epoch": 0.5411818808921591, + "grad_norm": 1.125, + "learning_rate": 2.186057857355765e-05, + "loss": 1.2927, + "step": 7355 + }, + { + "epoch": 0.5412554610255231, + "grad_norm": 0.66796875, + "learning_rate": 2.1854832909984463e-05, + "loss": 0.5719, + "step": 7356 + }, + { + "epoch": 0.5413290411588871, + "grad_norm": 0.8359375, + "learning_rate": 2.1849087415206455e-05, + "loss": 1.0546, + "step": 7357 + }, + { + "epoch": 0.5414026212922511, + "grad_norm": 0.76171875, + "learning_rate": 2.184334208953198e-05, + "loss": 0.7692, + "step": 7358 + }, + { + "epoch": 0.541476201425615, + "grad_norm": 0.7890625, + "learning_rate": 2.1837596933269366e-05, + "loss": 1.1095, + "step": 7359 + }, + { + "epoch": 0.541549781558979, + "grad_norm": 0.63671875, + "learning_rate": 2.1831851946726953e-05, + "loss": 0.5641, + "step": 7360 + }, + { + "epoch": 0.541623361692343, + "grad_norm": 0.875, + "learning_rate": 2.182610713021306e-05, + "loss": 1.0079, + "step": 7361 + }, + { + "epoch": 0.541696941825707, + "grad_norm": 0.89453125, + "learning_rate": 2.182036248403601e-05, + "loss": 0.9787, + "step": 7362 + }, + { + "epoch": 0.541770521959071, + "grad_norm": 0.76171875, + "learning_rate": 2.1814618008504094e-05, + "loss": 0.9466, + "step": 7363 + }, + { + "epoch": 0.541844102092435, + "grad_norm": 0.8359375, + "learning_rate": 2.1808873703925616e-05, + "loss": 0.6536, + "step": 7364 + }, + { + "epoch": 0.5419176822257991, + "grad_norm": 0.73046875, + "learning_rate": 2.1803129570608858e-05, + "loss": 0.8832, + "step": 7365 + }, + { + "epoch": 0.5419912623591631, + "grad_norm": 0.7265625, + "learning_rate": 2.1797385608862093e-05, + "loss": 0.6238, + "step": 7366 + }, + { + "epoch": 0.5420648424925271, + "grad_norm": 0.98828125, + "learning_rate": 2.179164181899361e-05, + "loss": 1.0085, + "step": 7367 + }, + { + "epoch": 0.542138422625891, + "grad_norm": 0.73828125, + "learning_rate": 2.178589820131164e-05, + "loss": 0.8215, + "step": 7368 + }, + { + "epoch": 0.542212002759255, + "grad_norm": 0.78515625, + "learning_rate": 2.178015475612444e-05, + "loss": 0.6883, + "step": 7369 + }, + { + "epoch": 0.542285582892619, + "grad_norm": 0.95703125, + "learning_rate": 2.1774411483740255e-05, + "loss": 0.9847, + "step": 7370 + }, + { + "epoch": 0.542359163025983, + "grad_norm": 0.98828125, + "learning_rate": 2.1768668384467316e-05, + "loss": 1.1023, + "step": 7371 + }, + { + "epoch": 0.542432743159347, + "grad_norm": 0.81640625, + "learning_rate": 2.176292545861384e-05, + "loss": 0.6608, + "step": 7372 + }, + { + "epoch": 0.542506323292711, + "grad_norm": 1.1015625, + "learning_rate": 2.175718270648804e-05, + "loss": 0.9281, + "step": 7373 + }, + { + "epoch": 0.542579903426075, + "grad_norm": 0.8046875, + "learning_rate": 2.1751440128398115e-05, + "loss": 0.8566, + "step": 7374 + }, + { + "epoch": 0.542653483559439, + "grad_norm": 0.859375, + "learning_rate": 2.1745697724652268e-05, + "loss": 1.1914, + "step": 7375 + }, + { + "epoch": 0.542727063692803, + "grad_norm": 1.0546875, + "learning_rate": 2.1739955495558678e-05, + "loss": 0.9672, + "step": 7376 + }, + { + "epoch": 0.5428006438261669, + "grad_norm": 0.87109375, + "learning_rate": 2.1734213441425516e-05, + "loss": 0.8312, + "step": 7377 + }, + { + "epoch": 0.5428742239595309, + "grad_norm": 0.796875, + "learning_rate": 2.1728471562560955e-05, + "loss": 1.1319, + "step": 7378 + }, + { + "epoch": 0.5429478040928949, + "grad_norm": 0.703125, + "learning_rate": 2.172272985927314e-05, + "loss": 0.6857, + "step": 7379 + }, + { + "epoch": 0.5430213842262589, + "grad_norm": 0.96875, + "learning_rate": 2.1716988331870236e-05, + "loss": 0.8395, + "step": 7380 + }, + { + "epoch": 0.5430949643596229, + "grad_norm": 0.8359375, + "learning_rate": 2.171124698066036e-05, + "loss": 0.778, + "step": 7381 + }, + { + "epoch": 0.5431685444929869, + "grad_norm": 0.91796875, + "learning_rate": 2.1705505805951644e-05, + "loss": 0.9363, + "step": 7382 + }, + { + "epoch": 0.5432421246263509, + "grad_norm": 0.9765625, + "learning_rate": 2.1699764808052206e-05, + "loss": 1.06, + "step": 7383 + }, + { + "epoch": 0.5433157047597149, + "grad_norm": 1.0234375, + "learning_rate": 2.169402398727016e-05, + "loss": 1.117, + "step": 7384 + }, + { + "epoch": 0.5433892848930789, + "grad_norm": 0.7734375, + "learning_rate": 2.168828334391361e-05, + "loss": 0.7153, + "step": 7385 + }, + { + "epoch": 0.5434628650264428, + "grad_norm": 0.79296875, + "learning_rate": 2.168254287829063e-05, + "loss": 0.736, + "step": 7386 + }, + { + "epoch": 0.5435364451598068, + "grad_norm": 1.359375, + "learning_rate": 2.167680259070931e-05, + "loss": 0.6422, + "step": 7387 + }, + { + "epoch": 0.5436100252931708, + "grad_norm": 0.71484375, + "learning_rate": 2.1671062481477718e-05, + "loss": 0.7089, + "step": 7388 + }, + { + "epoch": 0.5436836054265348, + "grad_norm": 0.9765625, + "learning_rate": 2.1665322550903914e-05, + "loss": 0.9998, + "step": 7389 + }, + { + "epoch": 0.5437571855598988, + "grad_norm": 0.8203125, + "learning_rate": 2.1659582799295955e-05, + "loss": 0.9858, + "step": 7390 + }, + { + "epoch": 0.5438307656932628, + "grad_norm": 1.0078125, + "learning_rate": 2.1653843226961872e-05, + "loss": 1.0957, + "step": 7391 + }, + { + "epoch": 0.5439043458266268, + "grad_norm": 0.8828125, + "learning_rate": 2.1648103834209707e-05, + "loss": 0.9069, + "step": 7392 + }, + { + "epoch": 0.5439779259599908, + "grad_norm": 1.03125, + "learning_rate": 2.1642364621347476e-05, + "loss": 1.0202, + "step": 7393 + }, + { + "epoch": 0.5440515060933548, + "grad_norm": 0.67578125, + "learning_rate": 2.1636625588683206e-05, + "loss": 0.7816, + "step": 7394 + }, + { + "epoch": 0.5441250862267187, + "grad_norm": 0.8515625, + "learning_rate": 2.1630886736524873e-05, + "loss": 0.9869, + "step": 7395 + }, + { + "epoch": 0.5441986663600827, + "grad_norm": 0.96875, + "learning_rate": 2.1625148065180492e-05, + "loss": 1.2544, + "step": 7396 + }, + { + "epoch": 0.5442722464934467, + "grad_norm": 0.66796875, + "learning_rate": 2.161940957495804e-05, + "loss": 0.7264, + "step": 7397 + }, + { + "epoch": 0.5443458266268107, + "grad_norm": 0.78515625, + "learning_rate": 2.1613671266165487e-05, + "loss": 0.8085, + "step": 7398 + }, + { + "epoch": 0.5444194067601748, + "grad_norm": 1.015625, + "learning_rate": 2.1607933139110807e-05, + "loss": 1.1389, + "step": 7399 + }, + { + "epoch": 0.5444929868935388, + "grad_norm": 1.390625, + "learning_rate": 2.1602195194101944e-05, + "loss": 0.8613, + "step": 7400 + }, + { + "epoch": 0.5445665670269028, + "grad_norm": 0.8125, + "learning_rate": 2.1596457431446848e-05, + "loss": 0.7618, + "step": 7401 + }, + { + "epoch": 0.5446401471602668, + "grad_norm": 0.8984375, + "learning_rate": 2.1590719851453455e-05, + "loss": 0.6643, + "step": 7402 + }, + { + "epoch": 0.5447137272936308, + "grad_norm": 0.98828125, + "learning_rate": 2.1584982454429688e-05, + "loss": 0.8465, + "step": 7403 + }, + { + "epoch": 0.5447873074269947, + "grad_norm": 0.91015625, + "learning_rate": 2.157924524068346e-05, + "loss": 1.0482, + "step": 7404 + }, + { + "epoch": 0.5448608875603587, + "grad_norm": 0.765625, + "learning_rate": 2.157350821052268e-05, + "loss": 0.7691, + "step": 7405 + }, + { + "epoch": 0.5449344676937227, + "grad_norm": 0.6953125, + "learning_rate": 2.1567771364255244e-05, + "loss": 0.7051, + "step": 7406 + }, + { + "epoch": 0.5450080478270867, + "grad_norm": 0.75, + "learning_rate": 2.1562034702189033e-05, + "loss": 0.696, + "step": 7407 + }, + { + "epoch": 0.5450816279604507, + "grad_norm": 0.74609375, + "learning_rate": 2.1556298224631942e-05, + "loss": 0.6731, + "step": 7408 + }, + { + "epoch": 0.5451552080938147, + "grad_norm": 0.83984375, + "learning_rate": 2.1550561931891805e-05, + "loss": 0.7866, + "step": 7409 + }, + { + "epoch": 0.5452287882271787, + "grad_norm": 0.65234375, + "learning_rate": 2.1544825824276498e-05, + "loss": 0.5782, + "step": 7410 + }, + { + "epoch": 0.5453023683605427, + "grad_norm": 0.87109375, + "learning_rate": 2.153908990209386e-05, + "loss": 0.9274, + "step": 7411 + }, + { + "epoch": 0.5453759484939067, + "grad_norm": 0.6953125, + "learning_rate": 2.1533354165651737e-05, + "loss": 0.7583, + "step": 7412 + }, + { + "epoch": 0.5454495286272706, + "grad_norm": 1.015625, + "learning_rate": 2.1527618615257943e-05, + "loss": 1.0156, + "step": 7413 + }, + { + "epoch": 0.5455231087606346, + "grad_norm": 0.9453125, + "learning_rate": 2.1521883251220297e-05, + "loss": 1.203, + "step": 7414 + }, + { + "epoch": 0.5455966888939986, + "grad_norm": 0.69921875, + "learning_rate": 2.1516148073846613e-05, + "loss": 0.7928, + "step": 7415 + }, + { + "epoch": 0.5456702690273626, + "grad_norm": 0.91796875, + "learning_rate": 2.151041308344468e-05, + "loss": 0.9713, + "step": 7416 + }, + { + "epoch": 0.5457438491607266, + "grad_norm": 0.921875, + "learning_rate": 2.150467828032229e-05, + "loss": 0.876, + "step": 7417 + }, + { + "epoch": 0.5458174292940906, + "grad_norm": 0.85546875, + "learning_rate": 2.1498943664787208e-05, + "loss": 1.079, + "step": 7418 + }, + { + "epoch": 0.5458910094274546, + "grad_norm": 0.90625, + "learning_rate": 2.149320923714721e-05, + "loss": 1.0431, + "step": 7419 + }, + { + "epoch": 0.5459645895608186, + "grad_norm": 0.98828125, + "learning_rate": 2.148747499771005e-05, + "loss": 0.6573, + "step": 7420 + }, + { + "epoch": 0.5460381696941826, + "grad_norm": 1.140625, + "learning_rate": 2.1481740946783468e-05, + "loss": 1.3769, + "step": 7421 + }, + { + "epoch": 0.5461117498275465, + "grad_norm": 0.9140625, + "learning_rate": 2.147600708467522e-05, + "loss": 1.1254, + "step": 7422 + }, + { + "epoch": 0.5461853299609105, + "grad_norm": 0.78125, + "learning_rate": 2.1470273411692998e-05, + "loss": 0.5865, + "step": 7423 + }, + { + "epoch": 0.5462589100942745, + "grad_norm": 0.796875, + "learning_rate": 2.146453992814454e-05, + "loss": 1.0619, + "step": 7424 + }, + { + "epoch": 0.5463324902276385, + "grad_norm": 0.796875, + "learning_rate": 2.145880663433754e-05, + "loss": 0.8634, + "step": 7425 + }, + { + "epoch": 0.5464060703610025, + "grad_norm": 0.70703125, + "learning_rate": 2.1453073530579716e-05, + "loss": 0.5685, + "step": 7426 + }, + { + "epoch": 0.5464796504943665, + "grad_norm": 0.71875, + "learning_rate": 2.144734061717873e-05, + "loss": 0.991, + "step": 7427 + }, + { + "epoch": 0.5465532306277305, + "grad_norm": 0.8984375, + "learning_rate": 2.144160789444226e-05, + "loss": 0.9245, + "step": 7428 + }, + { + "epoch": 0.5466268107610945, + "grad_norm": 0.86328125, + "learning_rate": 2.143587536267797e-05, + "loss": 0.9547, + "step": 7429 + }, + { + "epoch": 0.5467003908944585, + "grad_norm": 0.76953125, + "learning_rate": 2.143014302219352e-05, + "loss": 1.04, + "step": 7430 + }, + { + "epoch": 0.5467739710278224, + "grad_norm": 0.91015625, + "learning_rate": 2.1424410873296557e-05, + "loss": 1.016, + "step": 7431 + }, + { + "epoch": 0.5468475511611864, + "grad_norm": 1.171875, + "learning_rate": 2.1418678916294705e-05, + "loss": 1.1255, + "step": 7432 + }, + { + "epoch": 0.5469211312945504, + "grad_norm": 1.046875, + "learning_rate": 2.141294715149559e-05, + "loss": 1.1142, + "step": 7433 + }, + { + "epoch": 0.5469947114279144, + "grad_norm": 0.86328125, + "learning_rate": 2.1407215579206826e-05, + "loss": 0.9451, + "step": 7434 + }, + { + "epoch": 0.5470682915612785, + "grad_norm": 0.75390625, + "learning_rate": 2.140148419973603e-05, + "loss": 0.8057, + "step": 7435 + }, + { + "epoch": 0.5471418716946425, + "grad_norm": 0.8359375, + "learning_rate": 2.1395753013390765e-05, + "loss": 0.7773, + "step": 7436 + }, + { + "epoch": 0.5472154518280065, + "grad_norm": 0.88671875, + "learning_rate": 2.139002202047863e-05, + "loss": 0.9999, + "step": 7437 + }, + { + "epoch": 0.5472890319613705, + "grad_norm": 0.8671875, + "learning_rate": 2.1384291221307195e-05, + "loss": 1.0387, + "step": 7438 + }, + { + "epoch": 0.5473626120947345, + "grad_norm": 0.77734375, + "learning_rate": 2.1378560616184017e-05, + "loss": 0.7955, + "step": 7439 + }, + { + "epoch": 0.5474361922280984, + "grad_norm": 1.078125, + "learning_rate": 2.1372830205416666e-05, + "loss": 1.2673, + "step": 7440 + }, + { + "epoch": 0.5475097723614624, + "grad_norm": 0.90234375, + "learning_rate": 2.1367099989312657e-05, + "loss": 0.7981, + "step": 7441 + }, + { + "epoch": 0.5475833524948264, + "grad_norm": 0.7890625, + "learning_rate": 2.136136996817953e-05, + "loss": 1.0126, + "step": 7442 + }, + { + "epoch": 0.5476569326281904, + "grad_norm": 0.765625, + "learning_rate": 2.1355640142324804e-05, + "loss": 0.8105, + "step": 7443 + }, + { + "epoch": 0.5477305127615544, + "grad_norm": 0.80078125, + "learning_rate": 2.1349910512055992e-05, + "loss": 0.7491, + "step": 7444 + }, + { + "epoch": 0.5478040928949184, + "grad_norm": 0.7734375, + "learning_rate": 2.1344181077680585e-05, + "loss": 0.8243, + "step": 7445 + }, + { + "epoch": 0.5478776730282824, + "grad_norm": 0.984375, + "learning_rate": 2.1338451839506075e-05, + "loss": 0.9097, + "step": 7446 + }, + { + "epoch": 0.5479512531616464, + "grad_norm": 0.828125, + "learning_rate": 2.1332722797839937e-05, + "loss": 0.8273, + "step": 7447 + }, + { + "epoch": 0.5480248332950104, + "grad_norm": 1.0625, + "learning_rate": 2.1326993952989642e-05, + "loss": 1.2072, + "step": 7448 + }, + { + "epoch": 0.5480984134283743, + "grad_norm": 0.84765625, + "learning_rate": 2.1321265305262654e-05, + "loss": 0.629, + "step": 7449 + }, + { + "epoch": 0.5481719935617383, + "grad_norm": 0.828125, + "learning_rate": 2.13155368549664e-05, + "loss": 1.0074, + "step": 7450 + }, + { + "epoch": 0.5482455736951023, + "grad_norm": 0.86328125, + "learning_rate": 2.1309808602408323e-05, + "loss": 0.7728, + "step": 7451 + }, + { + "epoch": 0.5483191538284663, + "grad_norm": 0.76953125, + "learning_rate": 2.1304080547895845e-05, + "loss": 1.2414, + "step": 7452 + }, + { + "epoch": 0.5483927339618303, + "grad_norm": 0.828125, + "learning_rate": 2.1298352691736386e-05, + "loss": 0.8012, + "step": 7453 + }, + { + "epoch": 0.5484663140951943, + "grad_norm": 0.66796875, + "learning_rate": 2.1292625034237358e-05, + "loss": 0.4832, + "step": 7454 + }, + { + "epoch": 0.5485398942285583, + "grad_norm": 1.2734375, + "learning_rate": 2.1286897575706132e-05, + "loss": 0.9493, + "step": 7455 + }, + { + "epoch": 0.5486134743619223, + "grad_norm": 0.76953125, + "learning_rate": 2.12811703164501e-05, + "loss": 1.1735, + "step": 7456 + }, + { + "epoch": 0.5486870544952863, + "grad_norm": 0.99609375, + "learning_rate": 2.1275443256776632e-05, + "loss": 1.3607, + "step": 7457 + }, + { + "epoch": 0.5487606346286502, + "grad_norm": 0.78515625, + "learning_rate": 2.1269716396993095e-05, + "loss": 0.9992, + "step": 7458 + }, + { + "epoch": 0.5488342147620142, + "grad_norm": 0.83984375, + "learning_rate": 2.126398973740683e-05, + "loss": 0.6699, + "step": 7459 + }, + { + "epoch": 0.5489077948953782, + "grad_norm": 0.93359375, + "learning_rate": 2.125826327832518e-05, + "loss": 0.9599, + "step": 7460 + }, + { + "epoch": 0.5489813750287422, + "grad_norm": 0.65234375, + "learning_rate": 2.1252537020055468e-05, + "loss": 0.617, + "step": 7461 + }, + { + "epoch": 0.5490549551621062, + "grad_norm": 0.7890625, + "learning_rate": 2.1246810962905024e-05, + "loss": 1.0797, + "step": 7462 + }, + { + "epoch": 0.5491285352954702, + "grad_norm": 0.8359375, + "learning_rate": 2.1241085107181148e-05, + "loss": 0.6466, + "step": 7463 + }, + { + "epoch": 0.5492021154288342, + "grad_norm": 1.0390625, + "learning_rate": 2.1235359453191132e-05, + "loss": 1.1459, + "step": 7464 + }, + { + "epoch": 0.5492756955621982, + "grad_norm": 0.984375, + "learning_rate": 2.1229634001242256e-05, + "loss": 1.2396, + "step": 7465 + }, + { + "epoch": 0.5493492756955622, + "grad_norm": 0.8359375, + "learning_rate": 2.1223908751641805e-05, + "loss": 0.8712, + "step": 7466 + }, + { + "epoch": 0.5494228558289261, + "grad_norm": 0.7109375, + "learning_rate": 2.1218183704697053e-05, + "loss": 0.7721, + "step": 7467 + }, + { + "epoch": 0.5494964359622901, + "grad_norm": 0.96484375, + "learning_rate": 2.1212458860715225e-05, + "loss": 1.315, + "step": 7468 + }, + { + "epoch": 0.5495700160956541, + "grad_norm": 0.7109375, + "learning_rate": 2.1206734220003575e-05, + "loss": 0.9055, + "step": 7469 + }, + { + "epoch": 0.5496435962290181, + "grad_norm": 0.9296875, + "learning_rate": 2.120100978286934e-05, + "loss": 1.1254, + "step": 7470 + }, + { + "epoch": 0.5497171763623822, + "grad_norm": 0.9765625, + "learning_rate": 2.1195285549619733e-05, + "loss": 1.2472, + "step": 7471 + }, + { + "epoch": 0.5497907564957462, + "grad_norm": 0.921875, + "learning_rate": 2.118956152056197e-05, + "loss": 0.7265, + "step": 7472 + }, + { + "epoch": 0.5498643366291102, + "grad_norm": 0.9140625, + "learning_rate": 2.118383769600324e-05, + "loss": 1.5356, + "step": 7473 + }, + { + "epoch": 0.5499379167624742, + "grad_norm": 0.7890625, + "learning_rate": 2.1178114076250727e-05, + "loss": 0.8452, + "step": 7474 + }, + { + "epoch": 0.5500114968958382, + "grad_norm": 0.88671875, + "learning_rate": 2.1172390661611618e-05, + "loss": 0.8157, + "step": 7475 + }, + { + "epoch": 0.5500850770292021, + "grad_norm": 0.95703125, + "learning_rate": 2.1166667452393082e-05, + "loss": 0.8626, + "step": 7476 + }, + { + "epoch": 0.5501586571625661, + "grad_norm": 0.94140625, + "learning_rate": 2.1160944448902254e-05, + "loss": 1.1232, + "step": 7477 + }, + { + "epoch": 0.5502322372959301, + "grad_norm": 1.0234375, + "learning_rate": 2.1155221651446278e-05, + "loss": 1.1201, + "step": 7478 + }, + { + "epoch": 0.5503058174292941, + "grad_norm": 0.66015625, + "learning_rate": 2.1149499060332302e-05, + "loss": 0.8872, + "step": 7479 + }, + { + "epoch": 0.5503793975626581, + "grad_norm": 0.87109375, + "learning_rate": 2.114377667586744e-05, + "loss": 1.3235, + "step": 7480 + }, + { + "epoch": 0.5504529776960221, + "grad_norm": 0.796875, + "learning_rate": 2.1138054498358808e-05, + "loss": 0.6922, + "step": 7481 + }, + { + "epoch": 0.5505265578293861, + "grad_norm": 0.703125, + "learning_rate": 2.113233252811349e-05, + "loss": 0.5839, + "step": 7482 + }, + { + "epoch": 0.5506001379627501, + "grad_norm": 0.6875, + "learning_rate": 2.1126610765438574e-05, + "loss": 0.7273, + "step": 7483 + }, + { + "epoch": 0.5506737180961141, + "grad_norm": 0.72265625, + "learning_rate": 2.1120889210641152e-05, + "loss": 0.626, + "step": 7484 + }, + { + "epoch": 0.550747298229478, + "grad_norm": 0.8203125, + "learning_rate": 2.1115167864028278e-05, + "loss": 0.7516, + "step": 7485 + }, + { + "epoch": 0.550820878362842, + "grad_norm": 0.76171875, + "learning_rate": 2.1109446725907003e-05, + "loss": 1.0033, + "step": 7486 + }, + { + "epoch": 0.550894458496206, + "grad_norm": 0.92578125, + "learning_rate": 2.1103725796584374e-05, + "loss": 1.243, + "step": 7487 + }, + { + "epoch": 0.55096803862957, + "grad_norm": 0.91015625, + "learning_rate": 2.109800507636742e-05, + "loss": 1.1234, + "step": 7488 + }, + { + "epoch": 0.551041618762934, + "grad_norm": 0.73046875, + "learning_rate": 2.1092284565563168e-05, + "loss": 0.795, + "step": 7489 + }, + { + "epoch": 0.551115198896298, + "grad_norm": 1.015625, + "learning_rate": 2.1086564264478635e-05, + "loss": 0.94, + "step": 7490 + }, + { + "epoch": 0.551188779029662, + "grad_norm": 1.0703125, + "learning_rate": 2.108084417342079e-05, + "loss": 1.2097, + "step": 7491 + }, + { + "epoch": 0.551262359163026, + "grad_norm": 0.88671875, + "learning_rate": 2.1075124292696636e-05, + "loss": 0.8736, + "step": 7492 + }, + { + "epoch": 0.55133593929639, + "grad_norm": 1.3125, + "learning_rate": 2.106940462261315e-05, + "loss": 1.0572, + "step": 7493 + }, + { + "epoch": 0.5514095194297539, + "grad_norm": 0.8203125, + "learning_rate": 2.1063685163477296e-05, + "loss": 0.6779, + "step": 7494 + }, + { + "epoch": 0.5514830995631179, + "grad_norm": 0.7421875, + "learning_rate": 2.1057965915596034e-05, + "loss": 0.703, + "step": 7495 + }, + { + "epoch": 0.5515566796964819, + "grad_norm": 0.69140625, + "learning_rate": 2.1052246879276287e-05, + "loss": 0.6112, + "step": 7496 + }, + { + "epoch": 0.5516302598298459, + "grad_norm": 0.69921875, + "learning_rate": 2.104652805482499e-05, + "loss": 0.9929, + "step": 7497 + }, + { + "epoch": 0.5517038399632099, + "grad_norm": 0.86328125, + "learning_rate": 2.1040809442549068e-05, + "loss": 0.7051, + "step": 7498 + }, + { + "epoch": 0.5517774200965739, + "grad_norm": 0.81640625, + "learning_rate": 2.103509104275543e-05, + "loss": 0.8016, + "step": 7499 + }, + { + "epoch": 0.5518510002299379, + "grad_norm": 0.8671875, + "learning_rate": 2.1029372855750962e-05, + "loss": 0.8743, + "step": 7500 + }, + { + "epoch": 0.5519245803633019, + "grad_norm": 0.89453125, + "learning_rate": 2.1023654881842553e-05, + "loss": 1.0809, + "step": 7501 + }, + { + "epoch": 0.551998160496666, + "grad_norm": 0.98046875, + "learning_rate": 2.101793712133708e-05, + "loss": 1.3755, + "step": 7502 + }, + { + "epoch": 0.5520717406300298, + "grad_norm": 0.76171875, + "learning_rate": 2.1012219574541395e-05, + "loss": 0.5926, + "step": 7503 + }, + { + "epoch": 0.5521453207633938, + "grad_norm": 0.7421875, + "learning_rate": 2.1006502241762365e-05, + "loss": 0.9977, + "step": 7504 + }, + { + "epoch": 0.5522189008967578, + "grad_norm": 0.7734375, + "learning_rate": 2.1000785123306804e-05, + "loss": 0.7903, + "step": 7505 + }, + { + "epoch": 0.5522924810301219, + "grad_norm": 0.84375, + "learning_rate": 2.0995068219481555e-05, + "loss": 0.812, + "step": 7506 + }, + { + "epoch": 0.5523660611634859, + "grad_norm": 0.8828125, + "learning_rate": 2.098935153059343e-05, + "loss": 0.8522, + "step": 7507 + }, + { + "epoch": 0.5524396412968499, + "grad_norm": 1.0078125, + "learning_rate": 2.0983635056949246e-05, + "loss": 0.7858, + "step": 7508 + }, + { + "epoch": 0.5525132214302139, + "grad_norm": 0.91015625, + "learning_rate": 2.0977918798855774e-05, + "loss": 0.7539, + "step": 7509 + }, + { + "epoch": 0.5525868015635779, + "grad_norm": 0.93359375, + "learning_rate": 2.0972202756619798e-05, + "loss": 1.0636, + "step": 7510 + }, + { + "epoch": 0.5526603816969419, + "grad_norm": 0.8203125, + "learning_rate": 2.0966486930548097e-05, + "loss": 0.6219, + "step": 7511 + }, + { + "epoch": 0.5527339618303058, + "grad_norm": 0.82421875, + "learning_rate": 2.0960771320947423e-05, + "loss": 0.9379, + "step": 7512 + }, + { + "epoch": 0.5528075419636698, + "grad_norm": 0.7421875, + "learning_rate": 2.095505592812453e-05, + "loss": 0.8834, + "step": 7513 + }, + { + "epoch": 0.5528811220970338, + "grad_norm": 0.8359375, + "learning_rate": 2.094934075238614e-05, + "loss": 0.8552, + "step": 7514 + }, + { + "epoch": 0.5529547022303978, + "grad_norm": 0.8359375, + "learning_rate": 2.0943625794038977e-05, + "loss": 0.9106, + "step": 7515 + }, + { + "epoch": 0.5530282823637618, + "grad_norm": 0.8828125, + "learning_rate": 2.093791105338976e-05, + "loss": 0.6977, + "step": 7516 + }, + { + "epoch": 0.5531018624971258, + "grad_norm": 1.109375, + "learning_rate": 2.093219653074519e-05, + "loss": 1.0213, + "step": 7517 + }, + { + "epoch": 0.5531754426304898, + "grad_norm": 0.88671875, + "learning_rate": 2.092648222641194e-05, + "loss": 1.0649, + "step": 7518 + }, + { + "epoch": 0.5532490227638538, + "grad_norm": 0.7421875, + "learning_rate": 2.0920768140696696e-05, + "loss": 0.9968, + "step": 7519 + }, + { + "epoch": 0.5533226028972178, + "grad_norm": 0.81640625, + "learning_rate": 2.0915054273906125e-05, + "loss": 0.6715, + "step": 7520 + }, + { + "epoch": 0.5533961830305817, + "grad_norm": 0.72265625, + "learning_rate": 2.0909340626346874e-05, + "loss": 0.9704, + "step": 7521 + }, + { + "epoch": 0.5534697631639457, + "grad_norm": 0.8125, + "learning_rate": 2.0903627198325593e-05, + "loss": 0.6492, + "step": 7522 + }, + { + "epoch": 0.5535433432973097, + "grad_norm": 0.7578125, + "learning_rate": 2.0897913990148895e-05, + "loss": 0.8551, + "step": 7523 + }, + { + "epoch": 0.5536169234306737, + "grad_norm": 0.95703125, + "learning_rate": 2.0892201002123406e-05, + "loss": 0.9271, + "step": 7524 + }, + { + "epoch": 0.5536905035640377, + "grad_norm": 0.890625, + "learning_rate": 2.088648823455573e-05, + "loss": 0.826, + "step": 7525 + }, + { + "epoch": 0.5537640836974017, + "grad_norm": 0.77734375, + "learning_rate": 2.0880775687752464e-05, + "loss": 0.797, + "step": 7526 + }, + { + "epoch": 0.5538376638307657, + "grad_norm": 0.84375, + "learning_rate": 2.087506336202019e-05, + "loss": 0.8945, + "step": 7527 + }, + { + "epoch": 0.5539112439641297, + "grad_norm": 0.625, + "learning_rate": 2.0869351257665467e-05, + "loss": 0.7587, + "step": 7528 + }, + { + "epoch": 0.5539848240974937, + "grad_norm": 0.87890625, + "learning_rate": 2.0863639374994863e-05, + "loss": 0.7524, + "step": 7529 + }, + { + "epoch": 0.5540584042308576, + "grad_norm": 0.7578125, + "learning_rate": 2.0857927714314922e-05, + "loss": 0.7819, + "step": 7530 + }, + { + "epoch": 0.5541319843642216, + "grad_norm": 1.09375, + "learning_rate": 2.085221627593218e-05, + "loss": 1.0946, + "step": 7531 + }, + { + "epoch": 0.5542055644975856, + "grad_norm": 0.80859375, + "learning_rate": 2.084650506015315e-05, + "loss": 0.83, + "step": 7532 + }, + { + "epoch": 0.5542791446309496, + "grad_norm": 0.8046875, + "learning_rate": 2.0840794067284353e-05, + "loss": 0.9074, + "step": 7533 + }, + { + "epoch": 0.5543527247643136, + "grad_norm": 0.78125, + "learning_rate": 2.083508329763228e-05, + "loss": 0.7968, + "step": 7534 + }, + { + "epoch": 0.5544263048976776, + "grad_norm": 0.84375, + "learning_rate": 2.082937275150342e-05, + "loss": 0.8298, + "step": 7535 + }, + { + "epoch": 0.5544998850310416, + "grad_norm": 0.84765625, + "learning_rate": 2.082366242920426e-05, + "loss": 1.2276, + "step": 7536 + }, + { + "epoch": 0.5545734651644056, + "grad_norm": 0.77734375, + "learning_rate": 2.0817952331041236e-05, + "loss": 0.7956, + "step": 7537 + }, + { + "epoch": 0.5546470452977696, + "grad_norm": 1.125, + "learning_rate": 2.0812242457320812e-05, + "loss": 1.2686, + "step": 7538 + }, + { + "epoch": 0.5547206254311335, + "grad_norm": 0.98828125, + "learning_rate": 2.0806532808349426e-05, + "loss": 0.809, + "step": 7539 + }, + { + "epoch": 0.5547942055644975, + "grad_norm": 0.89453125, + "learning_rate": 2.080082338443351e-05, + "loss": 0.732, + "step": 7540 + }, + { + "epoch": 0.5548677856978615, + "grad_norm": 0.70703125, + "learning_rate": 2.0795114185879468e-05, + "loss": 0.8356, + "step": 7541 + }, + { + "epoch": 0.5549413658312256, + "grad_norm": 0.84765625, + "learning_rate": 2.0789405212993704e-05, + "loss": 0.9492, + "step": 7542 + }, + { + "epoch": 0.5550149459645896, + "grad_norm": 0.76953125, + "learning_rate": 2.078369646608261e-05, + "loss": 0.8767, + "step": 7543 + }, + { + "epoch": 0.5550885260979536, + "grad_norm": 0.9296875, + "learning_rate": 2.0777987945452563e-05, + "loss": 0.8245, + "step": 7544 + }, + { + "epoch": 0.5551621062313176, + "grad_norm": 0.796875, + "learning_rate": 2.0772279651409933e-05, + "loss": 0.8415, + "step": 7545 + }, + { + "epoch": 0.5552356863646816, + "grad_norm": 0.80078125, + "learning_rate": 2.0766571584261066e-05, + "loss": 0.7504, + "step": 7546 + }, + { + "epoch": 0.5553092664980456, + "grad_norm": 0.91015625, + "learning_rate": 2.0760863744312305e-05, + "loss": 0.8587, + "step": 7547 + }, + { + "epoch": 0.5553828466314095, + "grad_norm": 0.75, + "learning_rate": 2.075515613186998e-05, + "loss": 0.5417, + "step": 7548 + }, + { + "epoch": 0.5554564267647735, + "grad_norm": 0.88671875, + "learning_rate": 2.0749448747240417e-05, + "loss": 0.8205, + "step": 7549 + }, + { + "epoch": 0.5555300068981375, + "grad_norm": 0.8984375, + "learning_rate": 2.0743741590729903e-05, + "loss": 0.7322, + "step": 7550 + }, + { + "epoch": 0.5556035870315015, + "grad_norm": 0.7421875, + "learning_rate": 2.073803466264474e-05, + "loss": 0.7334, + "step": 7551 + }, + { + "epoch": 0.5556771671648655, + "grad_norm": 0.875, + "learning_rate": 2.0732327963291203e-05, + "loss": 0.9157, + "step": 7552 + }, + { + "epoch": 0.5557507472982295, + "grad_norm": 0.81640625, + "learning_rate": 2.0726621492975567e-05, + "loss": 0.8135, + "step": 7553 + }, + { + "epoch": 0.5558243274315935, + "grad_norm": 0.8046875, + "learning_rate": 2.072091525200409e-05, + "loss": 0.6913, + "step": 7554 + }, + { + "epoch": 0.5558979075649575, + "grad_norm": 0.76171875, + "learning_rate": 2.0715209240683005e-05, + "loss": 0.5635, + "step": 7555 + }, + { + "epoch": 0.5559714876983215, + "grad_norm": 1.109375, + "learning_rate": 2.0709503459318544e-05, + "loss": 1.2376, + "step": 7556 + }, + { + "epoch": 0.5560450678316854, + "grad_norm": 1.015625, + "learning_rate": 2.070379790821693e-05, + "loss": 0.9284, + "step": 7557 + }, + { + "epoch": 0.5561186479650494, + "grad_norm": 0.6796875, + "learning_rate": 2.0698092587684367e-05, + "loss": 1.0626, + "step": 7558 + }, + { + "epoch": 0.5561922280984134, + "grad_norm": 0.84375, + "learning_rate": 2.0692387498027055e-05, + "loss": 0.8049, + "step": 7559 + }, + { + "epoch": 0.5562658082317774, + "grad_norm": 1.015625, + "learning_rate": 2.0686682639551167e-05, + "loss": 1.0139, + "step": 7560 + }, + { + "epoch": 0.5563393883651414, + "grad_norm": 0.89453125, + "learning_rate": 2.0680978012562875e-05, + "loss": 0.9984, + "step": 7561 + }, + { + "epoch": 0.5564129684985054, + "grad_norm": 0.83203125, + "learning_rate": 2.0675273617368334e-05, + "loss": 0.9501, + "step": 7562 + }, + { + "epoch": 0.5564865486318694, + "grad_norm": 0.65625, + "learning_rate": 2.0669569454273698e-05, + "loss": 0.7705, + "step": 7563 + }, + { + "epoch": 0.5565601287652334, + "grad_norm": 0.97265625, + "learning_rate": 2.0663865523585083e-05, + "loss": 0.9158, + "step": 7564 + }, + { + "epoch": 0.5566337088985974, + "grad_norm": 0.66796875, + "learning_rate": 2.0658161825608612e-05, + "loss": 0.8112, + "step": 7565 + }, + { + "epoch": 0.5567072890319613, + "grad_norm": 0.75, + "learning_rate": 2.06524583606504e-05, + "loss": 0.8144, + "step": 7566 + }, + { + "epoch": 0.5567808691653253, + "grad_norm": 0.9765625, + "learning_rate": 2.0646755129016535e-05, + "loss": 1.0407, + "step": 7567 + }, + { + "epoch": 0.5568544492986893, + "grad_norm": 0.9296875, + "learning_rate": 2.0641052131013107e-05, + "loss": 0.9335, + "step": 7568 + }, + { + "epoch": 0.5569280294320533, + "grad_norm": 0.7578125, + "learning_rate": 2.0635349366946168e-05, + "loss": 0.7903, + "step": 7569 + }, + { + "epoch": 0.5570016095654173, + "grad_norm": 0.921875, + "learning_rate": 2.0629646837121787e-05, + "loss": 1.2399, + "step": 7570 + }, + { + "epoch": 0.5570751896987813, + "grad_norm": 0.875, + "learning_rate": 2.062394454184601e-05, + "loss": 1.1816, + "step": 7571 + }, + { + "epoch": 0.5571487698321453, + "grad_norm": 0.9453125, + "learning_rate": 2.0618242481424864e-05, + "loss": 0.9437, + "step": 7572 + }, + { + "epoch": 0.5572223499655093, + "grad_norm": 0.83203125, + "learning_rate": 2.0612540656164365e-05, + "loss": 0.9093, + "step": 7573 + }, + { + "epoch": 0.5572959300988733, + "grad_norm": 0.9296875, + "learning_rate": 2.060683906637052e-05, + "loss": 0.9084, + "step": 7574 + }, + { + "epoch": 0.5573695102322372, + "grad_norm": 0.765625, + "learning_rate": 2.0601137712349328e-05, + "loss": 1.0847, + "step": 7575 + }, + { + "epoch": 0.5574430903656012, + "grad_norm": 0.83203125, + "learning_rate": 2.0595436594406764e-05, + "loss": 0.9812, + "step": 7576 + }, + { + "epoch": 0.5575166704989652, + "grad_norm": 0.921875, + "learning_rate": 2.0589735712848813e-05, + "loss": 0.9761, + "step": 7577 + }, + { + "epoch": 0.5575902506323293, + "grad_norm": 0.83984375, + "learning_rate": 2.05840350679814e-05, + "loss": 1.1286, + "step": 7578 + }, + { + "epoch": 0.5576638307656933, + "grad_norm": 0.98046875, + "learning_rate": 2.057833466011049e-05, + "loss": 0.7901, + "step": 7579 + }, + { + "epoch": 0.5577374108990573, + "grad_norm": 0.79296875, + "learning_rate": 2.0572634489542007e-05, + "loss": 0.9269, + "step": 7580 + }, + { + "epoch": 0.5578109910324213, + "grad_norm": 0.828125, + "learning_rate": 2.0566934556581874e-05, + "loss": 0.588, + "step": 7581 + }, + { + "epoch": 0.5578845711657853, + "grad_norm": 1.0546875, + "learning_rate": 2.0561234861535983e-05, + "loss": 1.3045, + "step": 7582 + }, + { + "epoch": 0.5579581512991493, + "grad_norm": 0.8515625, + "learning_rate": 2.0555535404710237e-05, + "loss": 0.7659, + "step": 7583 + }, + { + "epoch": 0.5580317314325132, + "grad_norm": 0.97265625, + "learning_rate": 2.0549836186410508e-05, + "loss": 0.8927, + "step": 7584 + }, + { + "epoch": 0.5581053115658772, + "grad_norm": 1.1015625, + "learning_rate": 2.0544137206942665e-05, + "loss": 1.1027, + "step": 7585 + }, + { + "epoch": 0.5581788916992412, + "grad_norm": 0.91015625, + "learning_rate": 2.053843846661257e-05, + "loss": 1.3398, + "step": 7586 + }, + { + "epoch": 0.5582524718326052, + "grad_norm": 0.890625, + "learning_rate": 2.053273996572605e-05, + "loss": 0.9436, + "step": 7587 + }, + { + "epoch": 0.5583260519659692, + "grad_norm": 0.94140625, + "learning_rate": 2.052704170458894e-05, + "loss": 1.2582, + "step": 7588 + }, + { + "epoch": 0.5583996320993332, + "grad_norm": 0.9453125, + "learning_rate": 2.0521343683507047e-05, + "loss": 0.6782, + "step": 7589 + }, + { + "epoch": 0.5584732122326972, + "grad_norm": 0.7578125, + "learning_rate": 2.0515645902786184e-05, + "loss": 0.8031, + "step": 7590 + }, + { + "epoch": 0.5585467923660612, + "grad_norm": 0.73828125, + "learning_rate": 2.050994836273215e-05, + "loss": 0.6921, + "step": 7591 + }, + { + "epoch": 0.5586203724994252, + "grad_norm": 0.9296875, + "learning_rate": 2.050425106365069e-05, + "loss": 0.9654, + "step": 7592 + }, + { + "epoch": 0.5586939526327891, + "grad_norm": 0.90625, + "learning_rate": 2.0498554005847588e-05, + "loss": 0.7598, + "step": 7593 + }, + { + "epoch": 0.5587675327661531, + "grad_norm": 1.015625, + "learning_rate": 2.049285718962859e-05, + "loss": 0.8186, + "step": 7594 + }, + { + "epoch": 0.5588411128995171, + "grad_norm": 1.0390625, + "learning_rate": 2.0487160615299437e-05, + "loss": 0.9807, + "step": 7595 + }, + { + "epoch": 0.5589146930328811, + "grad_norm": 1.0390625, + "learning_rate": 2.0481464283165847e-05, + "loss": 1.0518, + "step": 7596 + }, + { + "epoch": 0.5589882731662451, + "grad_norm": 1.0703125, + "learning_rate": 2.047576819353353e-05, + "loss": 1.0099, + "step": 7597 + }, + { + "epoch": 0.5590618532996091, + "grad_norm": 0.875, + "learning_rate": 2.0470072346708196e-05, + "loss": 1.2192, + "step": 7598 + }, + { + "epoch": 0.5591354334329731, + "grad_norm": 0.9921875, + "learning_rate": 2.046437674299552e-05, + "loss": 1.0461, + "step": 7599 + }, + { + "epoch": 0.5592090135663371, + "grad_norm": 0.85546875, + "learning_rate": 2.0458681382701184e-05, + "loss": 1.4193, + "step": 7600 + }, + { + "epoch": 0.5592825936997011, + "grad_norm": 0.78515625, + "learning_rate": 2.0452986266130834e-05, + "loss": 1.0968, + "step": 7601 + }, + { + "epoch": 0.559356173833065, + "grad_norm": 1.0546875, + "learning_rate": 2.0447291393590124e-05, + "loss": 1.3195, + "step": 7602 + }, + { + "epoch": 0.559429753966429, + "grad_norm": 0.828125, + "learning_rate": 2.0441596765384684e-05, + "loss": 0.8513, + "step": 7603 + }, + { + "epoch": 0.559503334099793, + "grad_norm": 0.86328125, + "learning_rate": 2.043590238182015e-05, + "loss": 1.076, + "step": 7604 + }, + { + "epoch": 0.559576914233157, + "grad_norm": 0.9765625, + "learning_rate": 2.0430208243202104e-05, + "loss": 0.9032, + "step": 7605 + }, + { + "epoch": 0.559650494366521, + "grad_norm": 0.9296875, + "learning_rate": 2.042451434983615e-05, + "loss": 0.9369, + "step": 7606 + }, + { + "epoch": 0.559724074499885, + "grad_norm": 1.09375, + "learning_rate": 2.0418820702027866e-05, + "loss": 1.3684, + "step": 7607 + }, + { + "epoch": 0.559797654633249, + "grad_norm": 0.97265625, + "learning_rate": 2.0413127300082818e-05, + "loss": 0.9994, + "step": 7608 + }, + { + "epoch": 0.559871234766613, + "grad_norm": 0.56640625, + "learning_rate": 2.040743414430658e-05, + "loss": 0.7899, + "step": 7609 + }, + { + "epoch": 0.559944814899977, + "grad_norm": 1.0859375, + "learning_rate": 2.040174123500467e-05, + "loss": 1.2626, + "step": 7610 + }, + { + "epoch": 0.5600183950333409, + "grad_norm": 0.7265625, + "learning_rate": 2.039604857248262e-05, + "loss": 0.8315, + "step": 7611 + }, + { + "epoch": 0.560091975166705, + "grad_norm": 0.86328125, + "learning_rate": 2.039035615704595e-05, + "loss": 0.7149, + "step": 7612 + }, + { + "epoch": 0.560165555300069, + "grad_norm": 1.0234375, + "learning_rate": 2.0384663989000162e-05, + "loss": 1.2585, + "step": 7613 + }, + { + "epoch": 0.560239135433433, + "grad_norm": 0.7734375, + "learning_rate": 2.0378972068650736e-05, + "loss": 0.9356, + "step": 7614 + }, + { + "epoch": 0.560312715566797, + "grad_norm": 0.9609375, + "learning_rate": 2.037328039630315e-05, + "loss": 1.1026, + "step": 7615 + }, + { + "epoch": 0.560386295700161, + "grad_norm": 0.98046875, + "learning_rate": 2.0367588972262867e-05, + "loss": 0.9621, + "step": 7616 + }, + { + "epoch": 0.560459875833525, + "grad_norm": 1.0078125, + "learning_rate": 2.036189779683534e-05, + "loss": 1.2336, + "step": 7617 + }, + { + "epoch": 0.560533455966889, + "grad_norm": 0.89453125, + "learning_rate": 2.0356206870326004e-05, + "loss": 0.8108, + "step": 7618 + }, + { + "epoch": 0.560607036100253, + "grad_norm": 0.91015625, + "learning_rate": 2.035051619304026e-05, + "loss": 0.8924, + "step": 7619 + }, + { + "epoch": 0.5606806162336169, + "grad_norm": 0.92578125, + "learning_rate": 2.0344825765283535e-05, + "loss": 0.7975, + "step": 7620 + }, + { + "epoch": 0.5607541963669809, + "grad_norm": 0.8125, + "learning_rate": 2.0339135587361213e-05, + "loss": 1.0401, + "step": 7621 + }, + { + "epoch": 0.5608277765003449, + "grad_norm": 0.640625, + "learning_rate": 2.0333445659578695e-05, + "loss": 0.5405, + "step": 7622 + }, + { + "epoch": 0.5609013566337089, + "grad_norm": 0.94921875, + "learning_rate": 2.0327755982241326e-05, + "loss": 0.8101, + "step": 7623 + }, + { + "epoch": 0.5609749367670729, + "grad_norm": 0.7109375, + "learning_rate": 2.0322066555654467e-05, + "loss": 0.6919, + "step": 7624 + }, + { + "epoch": 0.5610485169004369, + "grad_norm": 0.99609375, + "learning_rate": 2.0316377380123465e-05, + "loss": 1.1004, + "step": 7625 + }, + { + "epoch": 0.5611220970338009, + "grad_norm": 0.62890625, + "learning_rate": 2.0310688455953637e-05, + "loss": 0.5486, + "step": 7626 + }, + { + "epoch": 0.5611956771671649, + "grad_norm": 0.93359375, + "learning_rate": 2.030499978345031e-05, + "loss": 1.0957, + "step": 7627 + }, + { + "epoch": 0.5612692573005289, + "grad_norm": 1.078125, + "learning_rate": 2.0299311362918775e-05, + "loss": 1.3608, + "step": 7628 + }, + { + "epoch": 0.5613428374338928, + "grad_norm": 0.8046875, + "learning_rate": 2.0293623194664317e-05, + "loss": 0.8691, + "step": 7629 + }, + { + "epoch": 0.5614164175672568, + "grad_norm": 0.890625, + "learning_rate": 2.0287935278992214e-05, + "loss": 0.9023, + "step": 7630 + }, + { + "epoch": 0.5614899977006208, + "grad_norm": 0.88671875, + "learning_rate": 2.0282247616207727e-05, + "loss": 0.7869, + "step": 7631 + }, + { + "epoch": 0.5615635778339848, + "grad_norm": 0.9765625, + "learning_rate": 2.027656020661611e-05, + "loss": 1.2842, + "step": 7632 + }, + { + "epoch": 0.5616371579673488, + "grad_norm": 0.8515625, + "learning_rate": 2.0270873050522566e-05, + "loss": 0.7527, + "step": 7633 + }, + { + "epoch": 0.5617107381007128, + "grad_norm": 0.96875, + "learning_rate": 2.0265186148232343e-05, + "loss": 1.1703, + "step": 7634 + }, + { + "epoch": 0.5617843182340768, + "grad_norm": 0.984375, + "learning_rate": 2.025949950005063e-05, + "loss": 1.009, + "step": 7635 + }, + { + "epoch": 0.5618578983674408, + "grad_norm": 0.8046875, + "learning_rate": 2.025381310628264e-05, + "loss": 1.2368, + "step": 7636 + }, + { + "epoch": 0.5619314785008048, + "grad_norm": 1.0625, + "learning_rate": 2.0248126967233524e-05, + "loss": 1.1802, + "step": 7637 + }, + { + "epoch": 0.5620050586341687, + "grad_norm": 0.65234375, + "learning_rate": 2.024244108320846e-05, + "loss": 0.6024, + "step": 7638 + }, + { + "epoch": 0.5620786387675327, + "grad_norm": 0.83203125, + "learning_rate": 2.02367554545126e-05, + "loss": 0.7543, + "step": 7639 + }, + { + "epoch": 0.5621522189008967, + "grad_norm": 0.890625, + "learning_rate": 2.0231070081451076e-05, + "loss": 0.6373, + "step": 7640 + }, + { + "epoch": 0.5622257990342607, + "grad_norm": 0.74609375, + "learning_rate": 2.022538496432902e-05, + "loss": 0.7917, + "step": 7641 + }, + { + "epoch": 0.5622993791676247, + "grad_norm": 0.953125, + "learning_rate": 2.0219700103451528e-05, + "loss": 0.7983, + "step": 7642 + }, + { + "epoch": 0.5623729593009887, + "grad_norm": 1.0546875, + "learning_rate": 2.02140154991237e-05, + "loss": 1.0568, + "step": 7643 + }, + { + "epoch": 0.5624465394343527, + "grad_norm": 0.7109375, + "learning_rate": 2.0208331151650626e-05, + "loss": 0.6334, + "step": 7644 + }, + { + "epoch": 0.5625201195677167, + "grad_norm": 0.890625, + "learning_rate": 2.0202647061337376e-05, + "loss": 0.8103, + "step": 7645 + }, + { + "epoch": 0.5625936997010808, + "grad_norm": 0.703125, + "learning_rate": 2.019696322848899e-05, + "loss": 0.8565, + "step": 7646 + }, + { + "epoch": 0.5626672798344446, + "grad_norm": 0.84765625, + "learning_rate": 2.0191279653410513e-05, + "loss": 1.0922, + "step": 7647 + }, + { + "epoch": 0.5627408599678086, + "grad_norm": 0.984375, + "learning_rate": 2.018559633640697e-05, + "loss": 0.8309, + "step": 7648 + }, + { + "epoch": 0.5628144401011727, + "grad_norm": 0.8671875, + "learning_rate": 2.0179913277783386e-05, + "loss": 0.7959, + "step": 7649 + }, + { + "epoch": 0.5628880202345367, + "grad_norm": 0.8515625, + "learning_rate": 2.017423047784476e-05, + "loss": 1.374, + "step": 7650 + }, + { + "epoch": 0.5629616003679007, + "grad_norm": 0.828125, + "learning_rate": 2.016854793689606e-05, + "loss": 0.7029, + "step": 7651 + }, + { + "epoch": 0.5630351805012647, + "grad_norm": 0.73828125, + "learning_rate": 2.016286565524227e-05, + "loss": 0.7069, + "step": 7652 + }, + { + "epoch": 0.5631087606346287, + "grad_norm": 0.6640625, + "learning_rate": 2.015718363318834e-05, + "loss": 0.4827, + "step": 7653 + }, + { + "epoch": 0.5631823407679927, + "grad_norm": 0.671875, + "learning_rate": 2.0151501871039224e-05, + "loss": 0.6133, + "step": 7654 + }, + { + "epoch": 0.5632559209013567, + "grad_norm": 0.8984375, + "learning_rate": 2.014582036909984e-05, + "loss": 0.8943, + "step": 7655 + }, + { + "epoch": 0.5633295010347206, + "grad_norm": 0.80859375, + "learning_rate": 2.014013912767511e-05, + "loss": 0.678, + "step": 7656 + }, + { + "epoch": 0.5634030811680846, + "grad_norm": 0.71875, + "learning_rate": 2.013445814706993e-05, + "loss": 0.5697, + "step": 7657 + }, + { + "epoch": 0.5634766613014486, + "grad_norm": 0.71484375, + "learning_rate": 2.0128777427589198e-05, + "loss": 0.6967, + "step": 7658 + }, + { + "epoch": 0.5635502414348126, + "grad_norm": 0.8125, + "learning_rate": 2.0123096969537787e-05, + "loss": 1.3641, + "step": 7659 + }, + { + "epoch": 0.5636238215681766, + "grad_norm": 0.640625, + "learning_rate": 2.011741677322054e-05, + "loss": 0.666, + "step": 7660 + }, + { + "epoch": 0.5636974017015406, + "grad_norm": 0.6875, + "learning_rate": 2.0111736838942306e-05, + "loss": 0.5978, + "step": 7661 + }, + { + "epoch": 0.5637709818349046, + "grad_norm": 0.87890625, + "learning_rate": 2.0106057167007934e-05, + "loss": 0.6678, + "step": 7662 + }, + { + "epoch": 0.5638445619682686, + "grad_norm": 0.8671875, + "learning_rate": 2.010037775772223e-05, + "loss": 1.0527, + "step": 7663 + }, + { + "epoch": 0.5639181421016326, + "grad_norm": 0.9140625, + "learning_rate": 2.0094698611390005e-05, + "loss": 0.9668, + "step": 7664 + }, + { + "epoch": 0.5639917222349966, + "grad_norm": 0.91015625, + "learning_rate": 2.0089019728316027e-05, + "loss": 0.9403, + "step": 7665 + }, + { + "epoch": 0.5640653023683605, + "grad_norm": 0.9453125, + "learning_rate": 2.008334110880509e-05, + "loss": 1.039, + "step": 7666 + }, + { + "epoch": 0.5641388825017245, + "grad_norm": 1.0390625, + "learning_rate": 2.0077662753161946e-05, + "loss": 0.8635, + "step": 7667 + }, + { + "epoch": 0.5642124626350885, + "grad_norm": 0.94140625, + "learning_rate": 2.0071984661691354e-05, + "loss": 1.0429, + "step": 7668 + }, + { + "epoch": 0.5642860427684525, + "grad_norm": 0.8359375, + "learning_rate": 2.006630683469803e-05, + "loss": 0.7902, + "step": 7669 + }, + { + "epoch": 0.5643596229018165, + "grad_norm": 1.0625, + "learning_rate": 2.00606292724867e-05, + "loss": 1.0544, + "step": 7670 + }, + { + "epoch": 0.5644332030351805, + "grad_norm": 1.078125, + "learning_rate": 2.0054951975362067e-05, + "loss": 1.218, + "step": 7671 + }, + { + "epoch": 0.5645067831685445, + "grad_norm": 1.015625, + "learning_rate": 2.0049274943628822e-05, + "loss": 1.234, + "step": 7672 + }, + { + "epoch": 0.5645803633019085, + "grad_norm": 0.83984375, + "learning_rate": 2.0043598177591655e-05, + "loss": 0.7262, + "step": 7673 + }, + { + "epoch": 0.5646539434352725, + "grad_norm": 1.0625, + "learning_rate": 2.0037921677555194e-05, + "loss": 1.0449, + "step": 7674 + }, + { + "epoch": 0.5647275235686364, + "grad_norm": 0.7890625, + "learning_rate": 2.0032245443824106e-05, + "loss": 0.5502, + "step": 7675 + }, + { + "epoch": 0.5648011037020004, + "grad_norm": 1.0390625, + "learning_rate": 2.002656947670303e-05, + "loss": 1.3229, + "step": 7676 + }, + { + "epoch": 0.5648746838353644, + "grad_norm": 0.97265625, + "learning_rate": 2.002089377649658e-05, + "loss": 1.1109, + "step": 7677 + }, + { + "epoch": 0.5649482639687284, + "grad_norm": 0.9765625, + "learning_rate": 2.0015218343509347e-05, + "loss": 1.4111, + "step": 7678 + }, + { + "epoch": 0.5650218441020924, + "grad_norm": 0.81640625, + "learning_rate": 2.0009543178045932e-05, + "loss": 0.9055, + "step": 7679 + }, + { + "epoch": 0.5650954242354564, + "grad_norm": 0.78125, + "learning_rate": 2.000386828041091e-05, + "loss": 0.5753, + "step": 7680 + }, + { + "epoch": 0.5651690043688204, + "grad_norm": 0.73828125, + "learning_rate": 1.9998193650908843e-05, + "loss": 1.1224, + "step": 7681 + }, + { + "epoch": 0.5652425845021845, + "grad_norm": 1.015625, + "learning_rate": 1.9992519289844274e-05, + "loss": 1.0986, + "step": 7682 + }, + { + "epoch": 0.5653161646355485, + "grad_norm": 0.69921875, + "learning_rate": 1.9986845197521737e-05, + "loss": 0.6512, + "step": 7683 + }, + { + "epoch": 0.5653897447689124, + "grad_norm": 1.1015625, + "learning_rate": 1.9981171374245748e-05, + "loss": 1.4244, + "step": 7684 + }, + { + "epoch": 0.5654633249022764, + "grad_norm": 0.8203125, + "learning_rate": 1.9975497820320815e-05, + "loss": 0.7566, + "step": 7685 + }, + { + "epoch": 0.5655369050356404, + "grad_norm": 0.80859375, + "learning_rate": 1.9969824536051433e-05, + "loss": 0.7247, + "step": 7686 + }, + { + "epoch": 0.5656104851690044, + "grad_norm": 0.77734375, + "learning_rate": 1.9964151521742057e-05, + "loss": 0.6758, + "step": 7687 + }, + { + "epoch": 0.5656840653023684, + "grad_norm": 0.8984375, + "learning_rate": 1.995847877769715e-05, + "loss": 0.663, + "step": 7688 + }, + { + "epoch": 0.5657576454357324, + "grad_norm": 0.69140625, + "learning_rate": 1.9952806304221173e-05, + "loss": 0.6042, + "step": 7689 + }, + { + "epoch": 0.5658312255690964, + "grad_norm": 0.796875, + "learning_rate": 1.9947134101618547e-05, + "loss": 0.8123, + "step": 7690 + }, + { + "epoch": 0.5659048057024604, + "grad_norm": 1.0859375, + "learning_rate": 1.99414621701937e-05, + "loss": 1.0737, + "step": 7691 + }, + { + "epoch": 0.5659783858358244, + "grad_norm": 0.81640625, + "learning_rate": 1.9935790510251013e-05, + "loss": 1.0011, + "step": 7692 + }, + { + "epoch": 0.5660519659691883, + "grad_norm": 0.9609375, + "learning_rate": 1.993011912209489e-05, + "loss": 0.7549, + "step": 7693 + }, + { + "epoch": 0.5661255461025523, + "grad_norm": 1.0, + "learning_rate": 1.9924448006029695e-05, + "loss": 1.2663, + "step": 7694 + }, + { + "epoch": 0.5661991262359163, + "grad_norm": 0.77734375, + "learning_rate": 1.9918777162359787e-05, + "loss": 1.3989, + "step": 7695 + }, + { + "epoch": 0.5662727063692803, + "grad_norm": 0.86328125, + "learning_rate": 1.9913106591389517e-05, + "loss": 0.7846, + "step": 7696 + }, + { + "epoch": 0.5663462865026443, + "grad_norm": 0.85546875, + "learning_rate": 1.9907436293423208e-05, + "loss": 0.7626, + "step": 7697 + }, + { + "epoch": 0.5664198666360083, + "grad_norm": 0.84765625, + "learning_rate": 1.990176626876517e-05, + "loss": 1.2104, + "step": 7698 + }, + { + "epoch": 0.5664934467693723, + "grad_norm": 0.93359375, + "learning_rate": 1.989609651771971e-05, + "loss": 1.087, + "step": 7699 + }, + { + "epoch": 0.5665670269027363, + "grad_norm": 0.7578125, + "learning_rate": 1.9890427040591116e-05, + "loss": 0.8319, + "step": 7700 + }, + { + "epoch": 0.5666406070361003, + "grad_norm": 0.734375, + "learning_rate": 1.988475783768364e-05, + "loss": 0.7973, + "step": 7701 + }, + { + "epoch": 0.5667141871694642, + "grad_norm": 1.109375, + "learning_rate": 1.9879088909301556e-05, + "loss": 1.1152, + "step": 7702 + }, + { + "epoch": 0.5667877673028282, + "grad_norm": 0.8984375, + "learning_rate": 1.9873420255749096e-05, + "loss": 1.093, + "step": 7703 + }, + { + "epoch": 0.5668613474361922, + "grad_norm": 0.8671875, + "learning_rate": 1.9867751877330488e-05, + "loss": 0.951, + "step": 7704 + }, + { + "epoch": 0.5669349275695562, + "grad_norm": 1.09375, + "learning_rate": 1.9862083774349956e-05, + "loss": 1.2658, + "step": 7705 + }, + { + "epoch": 0.5670085077029202, + "grad_norm": 0.90234375, + "learning_rate": 1.985641594711167e-05, + "loss": 0.9686, + "step": 7706 + }, + { + "epoch": 0.5670820878362842, + "grad_norm": 0.921875, + "learning_rate": 1.9850748395919826e-05, + "loss": 1.0193, + "step": 7707 + }, + { + "epoch": 0.5671556679696482, + "grad_norm": 0.80078125, + "learning_rate": 1.984508112107859e-05, + "loss": 0.7656, + "step": 7708 + }, + { + "epoch": 0.5672292481030122, + "grad_norm": 0.8515625, + "learning_rate": 1.983941412289212e-05, + "loss": 0.8086, + "step": 7709 + }, + { + "epoch": 0.5673028282363762, + "grad_norm": 0.875, + "learning_rate": 1.9833747401664543e-05, + "loss": 0.7632, + "step": 7710 + }, + { + "epoch": 0.5673764083697401, + "grad_norm": 0.69921875, + "learning_rate": 1.9828080957699987e-05, + "loss": 0.4955, + "step": 7711 + }, + { + "epoch": 0.5674499885031041, + "grad_norm": 0.82421875, + "learning_rate": 1.982241479130255e-05, + "loss": 0.9354, + "step": 7712 + }, + { + "epoch": 0.5675235686364681, + "grad_norm": 0.92578125, + "learning_rate": 1.981674890277634e-05, + "loss": 1.072, + "step": 7713 + }, + { + "epoch": 0.5675971487698321, + "grad_norm": 0.7578125, + "learning_rate": 1.9811083292425427e-05, + "loss": 0.7631, + "step": 7714 + }, + { + "epoch": 0.5676707289031961, + "grad_norm": 1.0234375, + "learning_rate": 1.980541796055387e-05, + "loss": 0.9107, + "step": 7715 + }, + { + "epoch": 0.5677443090365601, + "grad_norm": 0.8984375, + "learning_rate": 1.9799752907465717e-05, + "loss": 0.9741, + "step": 7716 + }, + { + "epoch": 0.5678178891699242, + "grad_norm": 0.87890625, + "learning_rate": 1.9794088133465008e-05, + "loss": 0.9064, + "step": 7717 + }, + { + "epoch": 0.5678914693032882, + "grad_norm": 0.9296875, + "learning_rate": 1.9788423638855767e-05, + "loss": 0.9792, + "step": 7718 + }, + { + "epoch": 0.5679650494366522, + "grad_norm": 0.734375, + "learning_rate": 1.978275942394197e-05, + "loss": 0.5625, + "step": 7719 + }, + { + "epoch": 0.568038629570016, + "grad_norm": 0.7734375, + "learning_rate": 1.977709548902763e-05, + "loss": 0.7593, + "step": 7720 + }, + { + "epoch": 0.5681122097033801, + "grad_norm": 0.83984375, + "learning_rate": 1.97714318344167e-05, + "loss": 0.6402, + "step": 7721 + }, + { + "epoch": 0.5681857898367441, + "grad_norm": 0.98046875, + "learning_rate": 1.9765768460413153e-05, + "loss": 1.0625, + "step": 7722 + }, + { + "epoch": 0.5682593699701081, + "grad_norm": 0.96875, + "learning_rate": 1.9760105367320934e-05, + "loss": 1.1693, + "step": 7723 + }, + { + "epoch": 0.5683329501034721, + "grad_norm": 0.96484375, + "learning_rate": 1.9754442555443956e-05, + "loss": 1.3715, + "step": 7724 + }, + { + "epoch": 0.5684065302368361, + "grad_norm": 0.890625, + "learning_rate": 1.9748780025086136e-05, + "loss": 0.9845, + "step": 7725 + }, + { + "epoch": 0.5684801103702001, + "grad_norm": 0.80859375, + "learning_rate": 1.9743117776551377e-05, + "loss": 0.7554, + "step": 7726 + }, + { + "epoch": 0.5685536905035641, + "grad_norm": 0.8359375, + "learning_rate": 1.9737455810143564e-05, + "loss": 0.791, + "step": 7727 + }, + { + "epoch": 0.5686272706369281, + "grad_norm": 0.9375, + "learning_rate": 1.973179412616655e-05, + "loss": 1.1941, + "step": 7728 + }, + { + "epoch": 0.568700850770292, + "grad_norm": 0.875, + "learning_rate": 1.9726132724924195e-05, + "loss": 0.678, + "step": 7729 + }, + { + "epoch": 0.568774430903656, + "grad_norm": 0.87109375, + "learning_rate": 1.9720471606720338e-05, + "loss": 0.8929, + "step": 7730 + }, + { + "epoch": 0.56884801103702, + "grad_norm": 0.6796875, + "learning_rate": 1.9714810771858797e-05, + "loss": 0.554, + "step": 7731 + }, + { + "epoch": 0.568921591170384, + "grad_norm": 0.78125, + "learning_rate": 1.970915022064339e-05, + "loss": 0.8933, + "step": 7732 + }, + { + "epoch": 0.568995171303748, + "grad_norm": 0.82421875, + "learning_rate": 1.9703489953377888e-05, + "loss": 1.0544, + "step": 7733 + }, + { + "epoch": 0.569068751437112, + "grad_norm": 0.92578125, + "learning_rate": 1.9697829970366076e-05, + "loss": 0.9059, + "step": 7734 + }, + { + "epoch": 0.569142331570476, + "grad_norm": 1.375, + "learning_rate": 1.9692170271911717e-05, + "loss": 1.1534, + "step": 7735 + }, + { + "epoch": 0.56921591170384, + "grad_norm": 0.98828125, + "learning_rate": 1.9686510858318553e-05, + "loss": 1.1078, + "step": 7736 + }, + { + "epoch": 0.569289491837204, + "grad_norm": 1.46875, + "learning_rate": 1.968085172989032e-05, + "loss": 1.294, + "step": 7737 + }, + { + "epoch": 0.5693630719705679, + "grad_norm": 0.8828125, + "learning_rate": 1.9675192886930722e-05, + "loss": 0.9051, + "step": 7738 + }, + { + "epoch": 0.5694366521039319, + "grad_norm": 0.89453125, + "learning_rate": 1.9669534329743467e-05, + "loss": 0.8163, + "step": 7739 + }, + { + "epoch": 0.5695102322372959, + "grad_norm": 0.9140625, + "learning_rate": 1.9663876058632235e-05, + "loss": 1.6457, + "step": 7740 + }, + { + "epoch": 0.5695838123706599, + "grad_norm": 0.7265625, + "learning_rate": 1.9658218073900704e-05, + "loss": 0.618, + "step": 7741 + }, + { + "epoch": 0.5696573925040239, + "grad_norm": 0.96875, + "learning_rate": 1.965256037585251e-05, + "loss": 0.9302, + "step": 7742 + }, + { + "epoch": 0.5697309726373879, + "grad_norm": 0.80859375, + "learning_rate": 1.9646902964791305e-05, + "loss": 1.4923, + "step": 7743 + }, + { + "epoch": 0.5698045527707519, + "grad_norm": 0.87890625, + "learning_rate": 1.9641245841020705e-05, + "loss": 0.9057, + "step": 7744 + }, + { + "epoch": 0.5698781329041159, + "grad_norm": 1.09375, + "learning_rate": 1.9635589004844322e-05, + "loss": 1.0035, + "step": 7745 + }, + { + "epoch": 0.5699517130374799, + "grad_norm": 0.87890625, + "learning_rate": 1.9629932456565752e-05, + "loss": 0.849, + "step": 7746 + }, + { + "epoch": 0.5700252931708438, + "grad_norm": 0.9453125, + "learning_rate": 1.9624276196488556e-05, + "loss": 0.8605, + "step": 7747 + }, + { + "epoch": 0.5700988733042078, + "grad_norm": 0.84375, + "learning_rate": 1.9618620224916304e-05, + "loss": 0.7747, + "step": 7748 + }, + { + "epoch": 0.5701724534375718, + "grad_norm": 0.87109375, + "learning_rate": 1.961296454215254e-05, + "loss": 1.1588, + "step": 7749 + }, + { + "epoch": 0.5702460335709358, + "grad_norm": 0.765625, + "learning_rate": 1.96073091485008e-05, + "loss": 0.5469, + "step": 7750 + }, + { + "epoch": 0.5703196137042998, + "grad_norm": 0.875, + "learning_rate": 1.9601654044264586e-05, + "loss": 0.9935, + "step": 7751 + }, + { + "epoch": 0.5703931938376638, + "grad_norm": 0.9140625, + "learning_rate": 1.9595999229747405e-05, + "loss": 0.9887, + "step": 7752 + }, + { + "epoch": 0.5704667739710279, + "grad_norm": 1.0625, + "learning_rate": 1.959034470525274e-05, + "loss": 1.0697, + "step": 7753 + }, + { + "epoch": 0.5705403541043919, + "grad_norm": 0.97265625, + "learning_rate": 1.9584690471084053e-05, + "loss": 1.1769, + "step": 7754 + }, + { + "epoch": 0.5706139342377559, + "grad_norm": 0.71875, + "learning_rate": 1.957903652754481e-05, + "loss": 1.0359, + "step": 7755 + }, + { + "epoch": 0.5706875143711198, + "grad_norm": 0.80859375, + "learning_rate": 1.957338287493843e-05, + "loss": 0.9603, + "step": 7756 + }, + { + "epoch": 0.5707610945044838, + "grad_norm": 1.34375, + "learning_rate": 1.956772951356834e-05, + "loss": 0.9707, + "step": 7757 + }, + { + "epoch": 0.5708346746378478, + "grad_norm": 0.6953125, + "learning_rate": 1.9562076443737947e-05, + "loss": 0.5688, + "step": 7758 + }, + { + "epoch": 0.5709082547712118, + "grad_norm": 0.765625, + "learning_rate": 1.9556423665750655e-05, + "loss": 0.6643, + "step": 7759 + }, + { + "epoch": 0.5709818349045758, + "grad_norm": 0.859375, + "learning_rate": 1.955077117990981e-05, + "loss": 0.8678, + "step": 7760 + }, + { + "epoch": 0.5710554150379398, + "grad_norm": 0.7890625, + "learning_rate": 1.9545118986518785e-05, + "loss": 0.7246, + "step": 7761 + }, + { + "epoch": 0.5711289951713038, + "grad_norm": 0.8125, + "learning_rate": 1.953946708588092e-05, + "loss": 0.9106, + "step": 7762 + }, + { + "epoch": 0.5712025753046678, + "grad_norm": 0.87890625, + "learning_rate": 1.9533815478299543e-05, + "loss": 0.7549, + "step": 7763 + }, + { + "epoch": 0.5712761554380318, + "grad_norm": 0.96484375, + "learning_rate": 1.952816416407797e-05, + "loss": 1.1657, + "step": 7764 + }, + { + "epoch": 0.5713497355713957, + "grad_norm": 0.77734375, + "learning_rate": 1.9522513143519488e-05, + "loss": 0.8916, + "step": 7765 + }, + { + "epoch": 0.5714233157047597, + "grad_norm": 1.03125, + "learning_rate": 1.9516862416927383e-05, + "loss": 1.3224, + "step": 7766 + }, + { + "epoch": 0.5714968958381237, + "grad_norm": 0.84375, + "learning_rate": 1.951121198460491e-05, + "loss": 0.8834, + "step": 7767 + }, + { + "epoch": 0.5715704759714877, + "grad_norm": 0.828125, + "learning_rate": 1.9505561846855326e-05, + "loss": 0.8804, + "step": 7768 + }, + { + "epoch": 0.5716440561048517, + "grad_norm": 1.0078125, + "learning_rate": 1.9499912003981864e-05, + "loss": 1.0037, + "step": 7769 + }, + { + "epoch": 0.5717176362382157, + "grad_norm": 0.93359375, + "learning_rate": 1.949426245628773e-05, + "loss": 1.2265, + "step": 7770 + }, + { + "epoch": 0.5717912163715797, + "grad_norm": 0.76171875, + "learning_rate": 1.9488613204076133e-05, + "loss": 0.714, + "step": 7771 + }, + { + "epoch": 0.5718647965049437, + "grad_norm": 0.765625, + "learning_rate": 1.948296424765026e-05, + "loss": 0.9577, + "step": 7772 + }, + { + "epoch": 0.5719383766383077, + "grad_norm": 0.8203125, + "learning_rate": 1.947731558731328e-05, + "loss": 0.8312, + "step": 7773 + }, + { + "epoch": 0.5720119567716716, + "grad_norm": 0.8984375, + "learning_rate": 1.9471667223368333e-05, + "loss": 0.8835, + "step": 7774 + }, + { + "epoch": 0.5720855369050356, + "grad_norm": 0.81640625, + "learning_rate": 1.9466019156118565e-05, + "loss": 1.032, + "step": 7775 + }, + { + "epoch": 0.5721591170383996, + "grad_norm": 0.796875, + "learning_rate": 1.9460371385867097e-05, + "loss": 0.89, + "step": 7776 + }, + { + "epoch": 0.5722326971717636, + "grad_norm": 0.8515625, + "learning_rate": 1.9454723912917035e-05, + "loss": 0.712, + "step": 7777 + }, + { + "epoch": 0.5723062773051276, + "grad_norm": 0.9296875, + "learning_rate": 1.9449076737571467e-05, + "loss": 1.0629, + "step": 7778 + }, + { + "epoch": 0.5723798574384916, + "grad_norm": 1.03125, + "learning_rate": 1.9443429860133467e-05, + "loss": 1.1659, + "step": 7779 + }, + { + "epoch": 0.5724534375718556, + "grad_norm": 0.77734375, + "learning_rate": 1.9437783280906086e-05, + "loss": 0.6964, + "step": 7780 + }, + { + "epoch": 0.5725270177052196, + "grad_norm": 0.75390625, + "learning_rate": 1.9432137000192376e-05, + "loss": 0.9, + "step": 7781 + }, + { + "epoch": 0.5726005978385836, + "grad_norm": 0.97265625, + "learning_rate": 1.942649101829536e-05, + "loss": 1.2242, + "step": 7782 + }, + { + "epoch": 0.5726741779719475, + "grad_norm": 1.0390625, + "learning_rate": 1.9420845335518036e-05, + "loss": 0.907, + "step": 7783 + }, + { + "epoch": 0.5727477581053115, + "grad_norm": 0.67578125, + "learning_rate": 1.941519995216341e-05, + "loss": 0.5625, + "step": 7784 + }, + { + "epoch": 0.5728213382386755, + "grad_norm": 0.9140625, + "learning_rate": 1.940955486853445e-05, + "loss": 0.8627, + "step": 7785 + }, + { + "epoch": 0.5728949183720395, + "grad_norm": 0.74609375, + "learning_rate": 1.9403910084934128e-05, + "loss": 0.6583, + "step": 7786 + }, + { + "epoch": 0.5729684985054035, + "grad_norm": 0.66015625, + "learning_rate": 1.939826560166539e-05, + "loss": 0.5249, + "step": 7787 + }, + { + "epoch": 0.5730420786387675, + "grad_norm": 0.74609375, + "learning_rate": 1.939262141903114e-05, + "loss": 0.691, + "step": 7788 + }, + { + "epoch": 0.5731156587721316, + "grad_norm": 0.984375, + "learning_rate": 1.9386977537334316e-05, + "loss": 1.3678, + "step": 7789 + }, + { + "epoch": 0.5731892389054956, + "grad_norm": 0.94140625, + "learning_rate": 1.93813339568778e-05, + "loss": 0.896, + "step": 7790 + }, + { + "epoch": 0.5732628190388596, + "grad_norm": 0.96484375, + "learning_rate": 1.937569067796449e-05, + "loss": 0.9596, + "step": 7791 + }, + { + "epoch": 0.5733363991722235, + "grad_norm": 0.79296875, + "learning_rate": 1.9370047700897227e-05, + "loss": 0.849, + "step": 7792 + }, + { + "epoch": 0.5734099793055875, + "grad_norm": 0.85546875, + "learning_rate": 1.9364405025978877e-05, + "loss": 1.0411, + "step": 7793 + }, + { + "epoch": 0.5734835594389515, + "grad_norm": 0.84765625, + "learning_rate": 1.935876265351226e-05, + "loss": 0.7349, + "step": 7794 + }, + { + "epoch": 0.5735571395723155, + "grad_norm": 0.8359375, + "learning_rate": 1.9353120583800197e-05, + "loss": 0.716, + "step": 7795 + }, + { + "epoch": 0.5736307197056795, + "grad_norm": 0.86328125, + "learning_rate": 1.9347478817145492e-05, + "loss": 0.9886, + "step": 7796 + }, + { + "epoch": 0.5737042998390435, + "grad_norm": 0.74609375, + "learning_rate": 1.934183735385092e-05, + "loss": 0.8516, + "step": 7797 + }, + { + "epoch": 0.5737778799724075, + "grad_norm": 0.734375, + "learning_rate": 1.933619619421925e-05, + "loss": 1.1078, + "step": 7798 + }, + { + "epoch": 0.5738514601057715, + "grad_norm": 0.875, + "learning_rate": 1.933055533855323e-05, + "loss": 1.1994, + "step": 7799 + }, + { + "epoch": 0.5739250402391355, + "grad_norm": 0.921875, + "learning_rate": 1.93249147871556e-05, + "loss": 1.1573, + "step": 7800 + }, + { + "epoch": 0.5739986203724994, + "grad_norm": 0.7734375, + "learning_rate": 1.9319274540329085e-05, + "loss": 0.9439, + "step": 7801 + }, + { + "epoch": 0.5740722005058634, + "grad_norm": 0.90625, + "learning_rate": 1.9313634598376363e-05, + "loss": 0.8465, + "step": 7802 + }, + { + "epoch": 0.5741457806392274, + "grad_norm": 0.7890625, + "learning_rate": 1.9307994961600136e-05, + "loss": 1.1433, + "step": 7803 + }, + { + "epoch": 0.5742193607725914, + "grad_norm": 0.95703125, + "learning_rate": 1.930235563030306e-05, + "loss": 1.1513, + "step": 7804 + }, + { + "epoch": 0.5742929409059554, + "grad_norm": 0.85546875, + "learning_rate": 1.929671660478781e-05, + "loss": 1.366, + "step": 7805 + }, + { + "epoch": 0.5743665210393194, + "grad_norm": 1.015625, + "learning_rate": 1.9291077885357e-05, + "loss": 0.9652, + "step": 7806 + }, + { + "epoch": 0.5744401011726834, + "grad_norm": 0.77734375, + "learning_rate": 1.928543947231326e-05, + "loss": 0.8515, + "step": 7807 + }, + { + "epoch": 0.5745136813060474, + "grad_norm": 1.0078125, + "learning_rate": 1.927980136595919e-05, + "loss": 1.1187, + "step": 7808 + }, + { + "epoch": 0.5745872614394114, + "grad_norm": 0.89453125, + "learning_rate": 1.9274163566597372e-05, + "loss": 1.021, + "step": 7809 + }, + { + "epoch": 0.5746608415727753, + "grad_norm": 0.76171875, + "learning_rate": 1.9268526074530386e-05, + "loss": 0.9821, + "step": 7810 + }, + { + "epoch": 0.5747344217061393, + "grad_norm": 0.7578125, + "learning_rate": 1.926288889006078e-05, + "loss": 0.8988, + "step": 7811 + }, + { + "epoch": 0.5748080018395033, + "grad_norm": 0.9609375, + "learning_rate": 1.9257252013491088e-05, + "loss": 0.7945, + "step": 7812 + }, + { + "epoch": 0.5748815819728673, + "grad_norm": 0.98046875, + "learning_rate": 1.9251615445123835e-05, + "loss": 0.8642, + "step": 7813 + }, + { + "epoch": 0.5749551621062313, + "grad_norm": 0.8046875, + "learning_rate": 1.9245979185261536e-05, + "loss": 0.7619, + "step": 7814 + }, + { + "epoch": 0.5750287422395953, + "grad_norm": 0.7890625, + "learning_rate": 1.9240343234206653e-05, + "loss": 0.549, + "step": 7815 + }, + { + "epoch": 0.5751023223729593, + "grad_norm": 0.828125, + "learning_rate": 1.9234707592261672e-05, + "loss": 0.6836, + "step": 7816 + }, + { + "epoch": 0.5751759025063233, + "grad_norm": 0.765625, + "learning_rate": 1.9229072259729045e-05, + "loss": 0.6723, + "step": 7817 + }, + { + "epoch": 0.5752494826396873, + "grad_norm": 0.7421875, + "learning_rate": 1.9223437236911203e-05, + "loss": 0.5329, + "step": 7818 + }, + { + "epoch": 0.5753230627730512, + "grad_norm": 1.0625, + "learning_rate": 1.9217802524110594e-05, + "loss": 1.3878, + "step": 7819 + }, + { + "epoch": 0.5753966429064152, + "grad_norm": 0.7734375, + "learning_rate": 1.921216812162959e-05, + "loss": 0.7419, + "step": 7820 + }, + { + "epoch": 0.5754702230397792, + "grad_norm": 0.98828125, + "learning_rate": 1.920653402977059e-05, + "loss": 0.7552, + "step": 7821 + }, + { + "epoch": 0.5755438031731432, + "grad_norm": 0.77734375, + "learning_rate": 1.9200900248835967e-05, + "loss": 0.662, + "step": 7822 + }, + { + "epoch": 0.5756173833065072, + "grad_norm": 0.875, + "learning_rate": 1.919526677912808e-05, + "loss": 1.0735, + "step": 7823 + }, + { + "epoch": 0.5756909634398713, + "grad_norm": 0.9609375, + "learning_rate": 1.9189633620949256e-05, + "loss": 1.2137, + "step": 7824 + }, + { + "epoch": 0.5757645435732353, + "grad_norm": 0.6953125, + "learning_rate": 1.918400077460182e-05, + "loss": 0.5444, + "step": 7825 + }, + { + "epoch": 0.5758381237065993, + "grad_norm": 0.74609375, + "learning_rate": 1.917836824038808e-05, + "loss": 0.6851, + "step": 7826 + }, + { + "epoch": 0.5759117038399633, + "grad_norm": 0.82421875, + "learning_rate": 1.9172736018610322e-05, + "loss": 0.9634, + "step": 7827 + }, + { + "epoch": 0.5759852839733272, + "grad_norm": 0.9296875, + "learning_rate": 1.9167104109570826e-05, + "loss": 0.8805, + "step": 7828 + }, + { + "epoch": 0.5760588641066912, + "grad_norm": 1.0625, + "learning_rate": 1.916147251357182e-05, + "loss": 0.9175, + "step": 7829 + }, + { + "epoch": 0.5761324442400552, + "grad_norm": 0.921875, + "learning_rate": 1.915584123091556e-05, + "loss": 0.9094, + "step": 7830 + }, + { + "epoch": 0.5762060243734192, + "grad_norm": 1.09375, + "learning_rate": 1.9150210261904257e-05, + "loss": 0.9626, + "step": 7831 + }, + { + "epoch": 0.5762796045067832, + "grad_norm": 1.03125, + "learning_rate": 1.9144579606840142e-05, + "loss": 0.6984, + "step": 7832 + }, + { + "epoch": 0.5763531846401472, + "grad_norm": 0.94921875, + "learning_rate": 1.9138949266025362e-05, + "loss": 1.0027, + "step": 7833 + }, + { + "epoch": 0.5764267647735112, + "grad_norm": 0.74609375, + "learning_rate": 1.913331923976211e-05, + "loss": 0.8469, + "step": 7834 + }, + { + "epoch": 0.5765003449068752, + "grad_norm": 0.82421875, + "learning_rate": 1.9127689528352532e-05, + "loss": 0.7408, + "step": 7835 + }, + { + "epoch": 0.5765739250402392, + "grad_norm": 0.90625, + "learning_rate": 1.9122060132098764e-05, + "loss": 0.8928, + "step": 7836 + }, + { + "epoch": 0.5766475051736031, + "grad_norm": 0.89453125, + "learning_rate": 1.9116431051302936e-05, + "loss": 0.9892, + "step": 7837 + }, + { + "epoch": 0.5767210853069671, + "grad_norm": 0.73828125, + "learning_rate": 1.9110802286267133e-05, + "loss": 0.6941, + "step": 7838 + }, + { + "epoch": 0.5767946654403311, + "grad_norm": 0.8828125, + "learning_rate": 1.9105173837293448e-05, + "loss": 0.8589, + "step": 7839 + }, + { + "epoch": 0.5768682455736951, + "grad_norm": 0.8984375, + "learning_rate": 1.909954570468395e-05, + "loss": 1.0987, + "step": 7840 + }, + { + "epoch": 0.5769418257070591, + "grad_norm": 0.83203125, + "learning_rate": 1.909391788874069e-05, + "loss": 0.7726, + "step": 7841 + }, + { + "epoch": 0.5770154058404231, + "grad_norm": 0.7890625, + "learning_rate": 1.908829038976571e-05, + "loss": 0.7741, + "step": 7842 + }, + { + "epoch": 0.5770889859737871, + "grad_norm": 1.1328125, + "learning_rate": 1.9082663208061014e-05, + "loss": 0.9955, + "step": 7843 + }, + { + "epoch": 0.5771625661071511, + "grad_norm": 1.046875, + "learning_rate": 1.9077036343928596e-05, + "loss": 1.2145, + "step": 7844 + }, + { + "epoch": 0.5772361462405151, + "grad_norm": 0.765625, + "learning_rate": 1.9071409797670462e-05, + "loss": 0.6294, + "step": 7845 + }, + { + "epoch": 0.577309726373879, + "grad_norm": 0.76171875, + "learning_rate": 1.9065783569588576e-05, + "loss": 0.8148, + "step": 7846 + }, + { + "epoch": 0.577383306507243, + "grad_norm": 0.84375, + "learning_rate": 1.906015765998486e-05, + "loss": 0.9866, + "step": 7847 + }, + { + "epoch": 0.577456886640607, + "grad_norm": 0.828125, + "learning_rate": 1.905453206916127e-05, + "loss": 0.8646, + "step": 7848 + }, + { + "epoch": 0.577530466773971, + "grad_norm": 0.98046875, + "learning_rate": 1.9048906797419713e-05, + "loss": 0.8968, + "step": 7849 + }, + { + "epoch": 0.577604046907335, + "grad_norm": 0.90234375, + "learning_rate": 1.9043281845062087e-05, + "loss": 1.5815, + "step": 7850 + }, + { + "epoch": 0.577677627040699, + "grad_norm": 0.88671875, + "learning_rate": 1.903765721239028e-05, + "loss": 0.7399, + "step": 7851 + }, + { + "epoch": 0.577751207174063, + "grad_norm": 0.7890625, + "learning_rate": 1.903203289970615e-05, + "loss": 0.694, + "step": 7852 + }, + { + "epoch": 0.577824787307427, + "grad_norm": 0.69921875, + "learning_rate": 1.9026408907311532e-05, + "loss": 0.7211, + "step": 7853 + }, + { + "epoch": 0.577898367440791, + "grad_norm": 0.92578125, + "learning_rate": 1.902078523550827e-05, + "loss": 0.979, + "step": 7854 + }, + { + "epoch": 0.5779719475741549, + "grad_norm": 1.0546875, + "learning_rate": 1.901516188459818e-05, + "loss": 1.4331, + "step": 7855 + }, + { + "epoch": 0.5780455277075189, + "grad_norm": 1.015625, + "learning_rate": 1.900953885488304e-05, + "loss": 1.0824, + "step": 7856 + }, + { + "epoch": 0.5781191078408829, + "grad_norm": 0.7734375, + "learning_rate": 1.900391614666463e-05, + "loss": 0.7317, + "step": 7857 + }, + { + "epoch": 0.5781926879742469, + "grad_norm": 0.94921875, + "learning_rate": 1.899829376024472e-05, + "loss": 1.0449, + "step": 7858 + }, + { + "epoch": 0.578266268107611, + "grad_norm": 0.93359375, + "learning_rate": 1.899267169592505e-05, + "loss": 0.7986, + "step": 7859 + }, + { + "epoch": 0.578339848240975, + "grad_norm": 1.0, + "learning_rate": 1.898704995400735e-05, + "loss": 1.1066, + "step": 7860 + }, + { + "epoch": 0.578413428374339, + "grad_norm": 0.80078125, + "learning_rate": 1.8981428534793317e-05, + "loss": 0.6905, + "step": 7861 + }, + { + "epoch": 0.578487008507703, + "grad_norm": 0.640625, + "learning_rate": 1.8975807438584642e-05, + "loss": 0.7546, + "step": 7862 + }, + { + "epoch": 0.578560588641067, + "grad_norm": 0.73828125, + "learning_rate": 1.8970186665683005e-05, + "loss": 0.7572, + "step": 7863 + }, + { + "epoch": 0.5786341687744309, + "grad_norm": 1.1484375, + "learning_rate": 1.896456621639007e-05, + "loss": 1.2453, + "step": 7864 + }, + { + "epoch": 0.5787077489077949, + "grad_norm": 0.77734375, + "learning_rate": 1.8958946091007458e-05, + "loss": 0.6999, + "step": 7865 + }, + { + "epoch": 0.5787813290411589, + "grad_norm": 0.890625, + "learning_rate": 1.89533262898368e-05, + "loss": 0.8552, + "step": 7866 + }, + { + "epoch": 0.5788549091745229, + "grad_norm": 0.78515625, + "learning_rate": 1.89477068131797e-05, + "loss": 0.6997, + "step": 7867 + }, + { + "epoch": 0.5789284893078869, + "grad_norm": 0.87109375, + "learning_rate": 1.8942087661337742e-05, + "loss": 0.8844, + "step": 7868 + }, + { + "epoch": 0.5790020694412509, + "grad_norm": 0.85546875, + "learning_rate": 1.893646883461251e-05, + "loss": 0.9771, + "step": 7869 + }, + { + "epoch": 0.5790756495746149, + "grad_norm": 1.03125, + "learning_rate": 1.8930850333305532e-05, + "loss": 1.0053, + "step": 7870 + }, + { + "epoch": 0.5791492297079789, + "grad_norm": 0.90625, + "learning_rate": 1.8925232157718352e-05, + "loss": 0.8334, + "step": 7871 + }, + { + "epoch": 0.5792228098413429, + "grad_norm": 0.89453125, + "learning_rate": 1.891961430815249e-05, + "loss": 0.9882, + "step": 7872 + }, + { + "epoch": 0.5792963899747068, + "grad_norm": 0.83984375, + "learning_rate": 1.8913996784909445e-05, + "loss": 0.9642, + "step": 7873 + }, + { + "epoch": 0.5793699701080708, + "grad_norm": 0.99609375, + "learning_rate": 1.8908379588290707e-05, + "loss": 0.8087, + "step": 7874 + }, + { + "epoch": 0.5794435502414348, + "grad_norm": 0.71484375, + "learning_rate": 1.890276271859772e-05, + "loss": 0.7891, + "step": 7875 + }, + { + "epoch": 0.5795171303747988, + "grad_norm": 0.94921875, + "learning_rate": 1.8897146176131945e-05, + "loss": 0.8852, + "step": 7876 + }, + { + "epoch": 0.5795907105081628, + "grad_norm": 0.85546875, + "learning_rate": 1.8891529961194804e-05, + "loss": 0.8946, + "step": 7877 + }, + { + "epoch": 0.5796642906415268, + "grad_norm": 0.70703125, + "learning_rate": 1.8885914074087722e-05, + "loss": 0.7218, + "step": 7878 + }, + { + "epoch": 0.5797378707748908, + "grad_norm": 1.0, + "learning_rate": 1.8880298515112073e-05, + "loss": 1.0425, + "step": 7879 + }, + { + "epoch": 0.5798114509082548, + "grad_norm": 0.67578125, + "learning_rate": 1.887468328456925e-05, + "loss": 0.5877, + "step": 7880 + }, + { + "epoch": 0.5798850310416188, + "grad_norm": 0.953125, + "learning_rate": 1.8869068382760604e-05, + "loss": 0.9595, + "step": 7881 + }, + { + "epoch": 0.5799586111749827, + "grad_norm": 0.7890625, + "learning_rate": 1.8863453809987478e-05, + "loss": 0.7985, + "step": 7882 + }, + { + "epoch": 0.5800321913083467, + "grad_norm": 0.79296875, + "learning_rate": 1.8857839566551205e-05, + "loss": 0.734, + "step": 7883 + }, + { + "epoch": 0.5801057714417107, + "grad_norm": 0.69140625, + "learning_rate": 1.885222565275307e-05, + "loss": 0.6292, + "step": 7884 + }, + { + "epoch": 0.5801793515750747, + "grad_norm": 0.87109375, + "learning_rate": 1.8846612068894373e-05, + "loss": 0.7832, + "step": 7885 + }, + { + "epoch": 0.5802529317084387, + "grad_norm": 0.9765625, + "learning_rate": 1.8840998815276387e-05, + "loss": 1.0154, + "step": 7886 + }, + { + "epoch": 0.5803265118418027, + "grad_norm": 1.03125, + "learning_rate": 1.883538589220037e-05, + "loss": 0.9185, + "step": 7887 + }, + { + "epoch": 0.5804000919751667, + "grad_norm": 0.84375, + "learning_rate": 1.882977329996754e-05, + "loss": 1.2152, + "step": 7888 + }, + { + "epoch": 0.5804736721085307, + "grad_norm": 0.828125, + "learning_rate": 1.8824161038879122e-05, + "loss": 0.8345, + "step": 7889 + }, + { + "epoch": 0.5805472522418947, + "grad_norm": 1.078125, + "learning_rate": 1.881854910923632e-05, + "loss": 0.6835, + "step": 7890 + }, + { + "epoch": 0.5806208323752586, + "grad_norm": 0.98828125, + "learning_rate": 1.8812937511340307e-05, + "loss": 1.1624, + "step": 7891 + }, + { + "epoch": 0.5806944125086226, + "grad_norm": 0.8828125, + "learning_rate": 1.8807326245492262e-05, + "loss": 0.9678, + "step": 7892 + }, + { + "epoch": 0.5807679926419866, + "grad_norm": 0.78515625, + "learning_rate": 1.8801715311993315e-05, + "loss": 1.1112, + "step": 7893 + }, + { + "epoch": 0.5808415727753506, + "grad_norm": 1.0859375, + "learning_rate": 1.87961047111446e-05, + "loss": 1.2048, + "step": 7894 + }, + { + "epoch": 0.5809151529087146, + "grad_norm": 0.78515625, + "learning_rate": 1.8790494443247225e-05, + "loss": 0.6695, + "step": 7895 + }, + { + "epoch": 0.5809887330420787, + "grad_norm": 0.703125, + "learning_rate": 1.87848845086023e-05, + "loss": 0.6305, + "step": 7896 + }, + { + "epoch": 0.5810623131754427, + "grad_norm": 0.84765625, + "learning_rate": 1.8779274907510866e-05, + "loss": 0.7055, + "step": 7897 + }, + { + "epoch": 0.5811358933088067, + "grad_norm": 0.94140625, + "learning_rate": 1.8773665640274004e-05, + "loss": 0.9516, + "step": 7898 + }, + { + "epoch": 0.5812094734421707, + "grad_norm": 0.74609375, + "learning_rate": 1.8768056707192748e-05, + "loss": 0.7438, + "step": 7899 + }, + { + "epoch": 0.5812830535755346, + "grad_norm": 0.796875, + "learning_rate": 1.876244810856812e-05, + "loss": 0.8965, + "step": 7900 + }, + { + "epoch": 0.5813566337088986, + "grad_norm": 0.69921875, + "learning_rate": 1.8756839844701126e-05, + "loss": 0.9464, + "step": 7901 + }, + { + "epoch": 0.5814302138422626, + "grad_norm": 0.8125, + "learning_rate": 1.875123191589274e-05, + "loss": 0.8169, + "step": 7902 + }, + { + "epoch": 0.5815037939756266, + "grad_norm": 0.71875, + "learning_rate": 1.8745624322443933e-05, + "loss": 0.7822, + "step": 7903 + }, + { + "epoch": 0.5815773741089906, + "grad_norm": 0.90234375, + "learning_rate": 1.8740017064655655e-05, + "loss": 0.6579, + "step": 7904 + }, + { + "epoch": 0.5816509542423546, + "grad_norm": 0.85546875, + "learning_rate": 1.873441014282884e-05, + "loss": 0.6201, + "step": 7905 + }, + { + "epoch": 0.5817245343757186, + "grad_norm": 0.8828125, + "learning_rate": 1.8728803557264403e-05, + "loss": 0.8544, + "step": 7906 + }, + { + "epoch": 0.5817981145090826, + "grad_norm": 0.8984375, + "learning_rate": 1.8723197308263227e-05, + "loss": 0.6245, + "step": 7907 + }, + { + "epoch": 0.5818716946424466, + "grad_norm": 1.0859375, + "learning_rate": 1.87175913961262e-05, + "loss": 1.4219, + "step": 7908 + }, + { + "epoch": 0.5819452747758105, + "grad_norm": 0.85546875, + "learning_rate": 1.8711985821154172e-05, + "loss": 0.8163, + "step": 7909 + }, + { + "epoch": 0.5820188549091745, + "grad_norm": 0.8125, + "learning_rate": 1.8706380583647998e-05, + "loss": 0.9422, + "step": 7910 + }, + { + "epoch": 0.5820924350425385, + "grad_norm": 0.94921875, + "learning_rate": 1.8700775683908483e-05, + "loss": 0.9302, + "step": 7911 + }, + { + "epoch": 0.5821660151759025, + "grad_norm": 0.9375, + "learning_rate": 1.8695171122236444e-05, + "loss": 0.9279, + "step": 7912 + }, + { + "epoch": 0.5822395953092665, + "grad_norm": 0.99609375, + "learning_rate": 1.868956689893266e-05, + "loss": 1.2156, + "step": 7913 + }, + { + "epoch": 0.5823131754426305, + "grad_norm": 1.0390625, + "learning_rate": 1.86839630142979e-05, + "loss": 1.0219, + "step": 7914 + }, + { + "epoch": 0.5823867555759945, + "grad_norm": 0.8984375, + "learning_rate": 1.8678359468632926e-05, + "loss": 0.8994, + "step": 7915 + }, + { + "epoch": 0.5824603357093585, + "grad_norm": 0.9453125, + "learning_rate": 1.8672756262238454e-05, + "loss": 1.2641, + "step": 7916 + }, + { + "epoch": 0.5825339158427225, + "grad_norm": 0.88671875, + "learning_rate": 1.8667153395415198e-05, + "loss": 0.9052, + "step": 7917 + }, + { + "epoch": 0.5826074959760864, + "grad_norm": 0.875, + "learning_rate": 1.866155086846386e-05, + "loss": 0.8305, + "step": 7918 + }, + { + "epoch": 0.5826810761094504, + "grad_norm": 0.78125, + "learning_rate": 1.8655948681685123e-05, + "loss": 0.7444, + "step": 7919 + }, + { + "epoch": 0.5827546562428144, + "grad_norm": 0.91015625, + "learning_rate": 1.865034683537963e-05, + "loss": 0.7477, + "step": 7920 + }, + { + "epoch": 0.5828282363761784, + "grad_norm": 0.76953125, + "learning_rate": 1.8644745329848027e-05, + "loss": 0.6366, + "step": 7921 + }, + { + "epoch": 0.5829018165095424, + "grad_norm": 0.73828125, + "learning_rate": 1.8639144165390945e-05, + "loss": 0.8134, + "step": 7922 + }, + { + "epoch": 0.5829753966429064, + "grad_norm": 1.0546875, + "learning_rate": 1.863354334230898e-05, + "loss": 1.0882, + "step": 7923 + }, + { + "epoch": 0.5830489767762704, + "grad_norm": 0.80859375, + "learning_rate": 1.862794286090272e-05, + "loss": 0.7208, + "step": 7924 + }, + { + "epoch": 0.5831225569096344, + "grad_norm": 0.76953125, + "learning_rate": 1.8622342721472728e-05, + "loss": 0.5987, + "step": 7925 + }, + { + "epoch": 0.5831961370429984, + "grad_norm": 1.109375, + "learning_rate": 1.861674292431956e-05, + "loss": 1.1597, + "step": 7926 + }, + { + "epoch": 0.5832697171763623, + "grad_norm": 0.625, + "learning_rate": 1.861114346974374e-05, + "loss": 0.7863, + "step": 7927 + }, + { + "epoch": 0.5833432973097263, + "grad_norm": 1.03125, + "learning_rate": 1.8605544358045794e-05, + "loss": 0.9326, + "step": 7928 + }, + { + "epoch": 0.5834168774430903, + "grad_norm": 0.71484375, + "learning_rate": 1.8599945589526198e-05, + "loss": 0.6481, + "step": 7929 + }, + { + "epoch": 0.5834904575764543, + "grad_norm": 0.75, + "learning_rate": 1.8594347164485427e-05, + "loss": 0.803, + "step": 7930 + }, + { + "epoch": 0.5835640377098184, + "grad_norm": 0.80078125, + "learning_rate": 1.858874908322395e-05, + "loss": 0.806, + "step": 7931 + }, + { + "epoch": 0.5836376178431824, + "grad_norm": 0.796875, + "learning_rate": 1.8583151346042203e-05, + "loss": 0.9222, + "step": 7932 + }, + { + "epoch": 0.5837111979765464, + "grad_norm": 0.8125, + "learning_rate": 1.8577553953240604e-05, + "loss": 0.9362, + "step": 7933 + }, + { + "epoch": 0.5837847781099104, + "grad_norm": 0.87109375, + "learning_rate": 1.857195690511955e-05, + "loss": 0.8713, + "step": 7934 + }, + { + "epoch": 0.5838583582432744, + "grad_norm": 0.8203125, + "learning_rate": 1.8566360201979427e-05, + "loss": 0.8874, + "step": 7935 + }, + { + "epoch": 0.5839319383766383, + "grad_norm": 1.015625, + "learning_rate": 1.8560763844120603e-05, + "loss": 0.9917, + "step": 7936 + }, + { + "epoch": 0.5840055185100023, + "grad_norm": 1.0234375, + "learning_rate": 1.8555167831843422e-05, + "loss": 1.3873, + "step": 7937 + }, + { + "epoch": 0.5840790986433663, + "grad_norm": 0.91015625, + "learning_rate": 1.8549572165448214e-05, + "loss": 0.8306, + "step": 7938 + }, + { + "epoch": 0.5841526787767303, + "grad_norm": 0.9140625, + "learning_rate": 1.8543976845235277e-05, + "loss": 1.0889, + "step": 7939 + }, + { + "epoch": 0.5842262589100943, + "grad_norm": 0.80078125, + "learning_rate": 1.8538381871504915e-05, + "loss": 0.8001, + "step": 7940 + }, + { + "epoch": 0.5842998390434583, + "grad_norm": 1.3515625, + "learning_rate": 1.8532787244557393e-05, + "loss": 1.2419, + "step": 7941 + }, + { + "epoch": 0.5843734191768223, + "grad_norm": 0.72265625, + "learning_rate": 1.852719296469297e-05, + "loss": 0.8252, + "step": 7942 + }, + { + "epoch": 0.5844469993101863, + "grad_norm": 0.765625, + "learning_rate": 1.8521599032211866e-05, + "loss": 0.9006, + "step": 7943 + }, + { + "epoch": 0.5845205794435503, + "grad_norm": 0.890625, + "learning_rate": 1.851600544741431e-05, + "loss": 1.0111, + "step": 7944 + }, + { + "epoch": 0.5845941595769142, + "grad_norm": 0.97265625, + "learning_rate": 1.8510412210600493e-05, + "loss": 1.1289, + "step": 7945 + }, + { + "epoch": 0.5846677397102782, + "grad_norm": 0.8828125, + "learning_rate": 1.8504819322070595e-05, + "loss": 0.6984, + "step": 7946 + }, + { + "epoch": 0.5847413198436422, + "grad_norm": 0.7265625, + "learning_rate": 1.849922678212478e-05, + "loss": 0.8641, + "step": 7947 + }, + { + "epoch": 0.5848148999770062, + "grad_norm": 0.60546875, + "learning_rate": 1.8493634591063187e-05, + "loss": 0.5647, + "step": 7948 + }, + { + "epoch": 0.5848884801103702, + "grad_norm": 1.0, + "learning_rate": 1.848804274918593e-05, + "loss": 1.2141, + "step": 7949 + }, + { + "epoch": 0.5849620602437342, + "grad_norm": 0.86328125, + "learning_rate": 1.848245125679312e-05, + "loss": 0.7978, + "step": 7950 + }, + { + "epoch": 0.5850356403770982, + "grad_norm": 1.0078125, + "learning_rate": 1.8476860114184845e-05, + "loss": 0.869, + "step": 7951 + }, + { + "epoch": 0.5851092205104622, + "grad_norm": 0.71484375, + "learning_rate": 1.8471269321661167e-05, + "loss": 0.775, + "step": 7952 + }, + { + "epoch": 0.5851828006438262, + "grad_norm": 0.78125, + "learning_rate": 1.846567887952213e-05, + "loss": 0.596, + "step": 7953 + }, + { + "epoch": 0.5852563807771901, + "grad_norm": 1.09375, + "learning_rate": 1.846008878806777e-05, + "loss": 1.1225, + "step": 7954 + }, + { + "epoch": 0.5853299609105541, + "grad_norm": 0.95703125, + "learning_rate": 1.845449904759809e-05, + "loss": 1.0799, + "step": 7955 + }, + { + "epoch": 0.5854035410439181, + "grad_norm": 0.8125, + "learning_rate": 1.8448909658413093e-05, + "loss": 1.0973, + "step": 7956 + }, + { + "epoch": 0.5854771211772821, + "grad_norm": 0.91796875, + "learning_rate": 1.844332062081273e-05, + "loss": 1.0958, + "step": 7957 + }, + { + "epoch": 0.5855507013106461, + "grad_norm": 0.9765625, + "learning_rate": 1.8437731935096967e-05, + "loss": 0.9853, + "step": 7958 + }, + { + "epoch": 0.5856242814440101, + "grad_norm": 0.984375, + "learning_rate": 1.8432143601565737e-05, + "loss": 0.988, + "step": 7959 + }, + { + "epoch": 0.5856978615773741, + "grad_norm": 0.7578125, + "learning_rate": 1.842655562051896e-05, + "loss": 0.6313, + "step": 7960 + }, + { + "epoch": 0.5857714417107381, + "grad_norm": 0.78125, + "learning_rate": 1.842096799225652e-05, + "loss": 0.8302, + "step": 7961 + }, + { + "epoch": 0.5858450218441021, + "grad_norm": 0.76171875, + "learning_rate": 1.8415380717078305e-05, + "loss": 0.7189, + "step": 7962 + }, + { + "epoch": 0.585918601977466, + "grad_norm": 0.67578125, + "learning_rate": 1.840979379528417e-05, + "loss": 0.6009, + "step": 7963 + }, + { + "epoch": 0.58599218211083, + "grad_norm": 0.921875, + "learning_rate": 1.8404207227173953e-05, + "loss": 0.8535, + "step": 7964 + }, + { + "epoch": 0.586065762244194, + "grad_norm": 0.796875, + "learning_rate": 1.8398621013047483e-05, + "loss": 1.0447, + "step": 7965 + }, + { + "epoch": 0.586139342377558, + "grad_norm": 0.80859375, + "learning_rate": 1.8393035153204547e-05, + "loss": 1.0278, + "step": 7966 + }, + { + "epoch": 0.586212922510922, + "grad_norm": 0.90234375, + "learning_rate": 1.8387449647944938e-05, + "loss": 0.924, + "step": 7967 + }, + { + "epoch": 0.5862865026442861, + "grad_norm": 0.65625, + "learning_rate": 1.838186449756842e-05, + "loss": 0.7968, + "step": 7968 + }, + { + "epoch": 0.5863600827776501, + "grad_norm": 0.87890625, + "learning_rate": 1.837627970237474e-05, + "loss": 1.2992, + "step": 7969 + }, + { + "epoch": 0.5864336629110141, + "grad_norm": 0.83203125, + "learning_rate": 1.837069526266361e-05, + "loss": 0.9364, + "step": 7970 + }, + { + "epoch": 0.5865072430443781, + "grad_norm": 0.91015625, + "learning_rate": 1.8365111178734745e-05, + "loss": 0.7273, + "step": 7971 + }, + { + "epoch": 0.586580823177742, + "grad_norm": 0.890625, + "learning_rate": 1.8359527450887828e-05, + "loss": 0.7213, + "step": 7972 + }, + { + "epoch": 0.586654403311106, + "grad_norm": 0.77734375, + "learning_rate": 1.8353944079422533e-05, + "loss": 0.725, + "step": 7973 + }, + { + "epoch": 0.58672798344447, + "grad_norm": 0.859375, + "learning_rate": 1.8348361064638513e-05, + "loss": 1.2185, + "step": 7974 + }, + { + "epoch": 0.586801563577834, + "grad_norm": 0.71875, + "learning_rate": 1.8342778406835383e-05, + "loss": 0.766, + "step": 7975 + }, + { + "epoch": 0.586875143711198, + "grad_norm": 0.98828125, + "learning_rate": 1.8337196106312766e-05, + "loss": 1.5889, + "step": 7976 + }, + { + "epoch": 0.586948723844562, + "grad_norm": 0.9296875, + "learning_rate": 1.8331614163370247e-05, + "loss": 0.6474, + "step": 7977 + }, + { + "epoch": 0.587022303977926, + "grad_norm": 0.7734375, + "learning_rate": 1.83260325783074e-05, + "loss": 0.7061, + "step": 7978 + }, + { + "epoch": 0.58709588411129, + "grad_norm": 0.73828125, + "learning_rate": 1.832045135142379e-05, + "loss": 0.9112, + "step": 7979 + }, + { + "epoch": 0.587169464244654, + "grad_norm": 0.96875, + "learning_rate": 1.831487048301893e-05, + "loss": 0.6902, + "step": 7980 + }, + { + "epoch": 0.5872430443780179, + "grad_norm": 0.9765625, + "learning_rate": 1.8309289973392347e-05, + "loss": 0.797, + "step": 7981 + }, + { + "epoch": 0.5873166245113819, + "grad_norm": 0.875, + "learning_rate": 1.8303709822843533e-05, + "loss": 0.8609, + "step": 7982 + }, + { + "epoch": 0.5873902046447459, + "grad_norm": 0.83984375, + "learning_rate": 1.8298130031671974e-05, + "loss": 1.0372, + "step": 7983 + }, + { + "epoch": 0.5874637847781099, + "grad_norm": 0.9921875, + "learning_rate": 1.8292550600177112e-05, + "loss": 1.3363, + "step": 7984 + }, + { + "epoch": 0.5875373649114739, + "grad_norm": 0.6875, + "learning_rate": 1.8286971528658386e-05, + "loss": 0.8451, + "step": 7985 + }, + { + "epoch": 0.5876109450448379, + "grad_norm": 0.98828125, + "learning_rate": 1.8281392817415223e-05, + "loss": 1.6296, + "step": 7986 + }, + { + "epoch": 0.5876845251782019, + "grad_norm": 1.3046875, + "learning_rate": 1.827581446674701e-05, + "loss": 1.386, + "step": 7987 + }, + { + "epoch": 0.5877581053115659, + "grad_norm": 1.1796875, + "learning_rate": 1.827023647695315e-05, + "loss": 1.2393, + "step": 7988 + }, + { + "epoch": 0.5878316854449299, + "grad_norm": 0.859375, + "learning_rate": 1.8264658848332977e-05, + "loss": 1.1769, + "step": 7989 + }, + { + "epoch": 0.5879052655782938, + "grad_norm": 0.82421875, + "learning_rate": 1.8259081581185843e-05, + "loss": 0.8126, + "step": 7990 + }, + { + "epoch": 0.5879788457116578, + "grad_norm": 0.9921875, + "learning_rate": 1.8253504675811073e-05, + "loss": 0.9025, + "step": 7991 + }, + { + "epoch": 0.5880524258450218, + "grad_norm": 1.2109375, + "learning_rate": 1.8247928132507962e-05, + "loss": 1.0028, + "step": 7992 + }, + { + "epoch": 0.5881260059783858, + "grad_norm": 0.78125, + "learning_rate": 1.82423519515758e-05, + "loss": 0.5055, + "step": 7993 + }, + { + "epoch": 0.5881995861117498, + "grad_norm": 0.90625, + "learning_rate": 1.8236776133313837e-05, + "loss": 0.7578, + "step": 7994 + }, + { + "epoch": 0.5882731662451138, + "grad_norm": 0.9140625, + "learning_rate": 1.8231200678021325e-05, + "loss": 0.7794, + "step": 7995 + }, + { + "epoch": 0.5883467463784778, + "grad_norm": 0.9140625, + "learning_rate": 1.8225625585997494e-05, + "loss": 1.2828, + "step": 7996 + }, + { + "epoch": 0.5884203265118418, + "grad_norm": 0.77734375, + "learning_rate": 1.8220050857541548e-05, + "loss": 0.993, + "step": 7997 + }, + { + "epoch": 0.5884939066452058, + "grad_norm": 0.98046875, + "learning_rate": 1.8214476492952658e-05, + "loss": 0.8535, + "step": 7998 + }, + { + "epoch": 0.5885674867785697, + "grad_norm": 0.8515625, + "learning_rate": 1.820890249253e-05, + "loss": 0.9465, + "step": 7999 + }, + { + "epoch": 0.5886410669119337, + "grad_norm": 0.6640625, + "learning_rate": 1.8203328856572716e-05, + "loss": 0.6409, + "step": 8000 + } + ], + "logging_steps": 1, + "max_steps": 13591, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0646749664124574e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}