diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,44143 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "eval_steps": 220, + "global_step": 4390, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 0.5734136998653412, + "epoch": 0.0022811519817507843, + "grad_norm": 52.5, + "learning_rate": 0.0, + "loss": 1.3674, + "mean_token_accuracy": 0.9076001197099686, + "num_tokens": 120233.0, + "step": 1 + }, + { + "entropy": 0.5703580230474472, + "epoch": 0.004562303963501569, + "grad_norm": 54.25, + "learning_rate": 2.272727272727273e-08, + "loss": 1.3391, + "mean_token_accuracy": 0.906724251806736, + "num_tokens": 239197.0, + "step": 2 + }, + { + "entropy": 0.5650490000844002, + "epoch": 0.006843455945252352, + "grad_norm": 55.5, + "learning_rate": 4.545454545454546e-08, + "loss": 1.4311, + "mean_token_accuracy": 0.9030006751418114, + "num_tokens": 358479.0, + "step": 3 + }, + { + "entropy": 0.5717524662613869, + "epoch": 0.009124607927003137, + "grad_norm": 53.25, + "learning_rate": 6.818181818181819e-08, + "loss": 1.3311, + "mean_token_accuracy": 0.9059764742851257, + "num_tokens": 477607.0, + "step": 4 + }, + { + "entropy": 0.5707872360944748, + "epoch": 0.01140575990875392, + "grad_norm": 52.25, + "learning_rate": 9.090909090909091e-08, + "loss": 1.3821, + "mean_token_accuracy": 0.9057438299059868, + "num_tokens": 597317.0, + "step": 5 + }, + { + "entropy": 0.5681033954024315, + "epoch": 0.013686911890504704, + "grad_norm": 51.5, + "learning_rate": 1.1363636363636364e-07, + "loss": 1.3494, + "mean_token_accuracy": 0.9060439169406891, + "num_tokens": 716462.0, + "step": 6 + }, + { + "entropy": 0.5704384073615074, + "epoch": 0.015968063872255488, + "grad_norm": 53.0, + "learning_rate": 1.3636363636363637e-07, + "loss": 1.3914, + "mean_token_accuracy": 0.904587410390377, + "num_tokens": 836041.0, + "step": 7 + }, + { + "entropy": 0.5671721249818802, + "epoch": 0.018249215854006275, + "grad_norm": 55.5, + "learning_rate": 1.590909090909091e-07, + "loss": 1.3618, + "mean_token_accuracy": 0.9054850861430168, + "num_tokens": 956059.0, + "step": 8 + }, + { + "entropy": 0.5728817656636238, + "epoch": 0.020530367835757058, + "grad_norm": 53.5, + "learning_rate": 1.8181818181818183e-07, + "loss": 1.3792, + "mean_token_accuracy": 0.9061362445354462, + "num_tokens": 1075583.0, + "step": 9 + }, + { + "entropy": 0.5725328251719475, + "epoch": 0.02281151981750784, + "grad_norm": 53.0, + "learning_rate": 2.0454545454545456e-07, + "loss": 1.3341, + "mean_token_accuracy": 0.9036030024290085, + "num_tokens": 1195522.0, + "step": 10 + }, + { + "entropy": 0.5802645534276962, + "epoch": 0.025092671799258625, + "grad_norm": 53.25, + "learning_rate": 2.2727272727272729e-07, + "loss": 1.3528, + "mean_token_accuracy": 0.904089480638504, + "num_tokens": 1315219.0, + "step": 11 + }, + { + "entropy": 0.570266380906105, + "epoch": 0.02737382378100941, + "grad_norm": 53.5, + "learning_rate": 2.5000000000000004e-07, + "loss": 1.3432, + "mean_token_accuracy": 0.908759817481041, + "num_tokens": 1434315.0, + "step": 12 + }, + { + "entropy": 0.5701230838894844, + "epoch": 0.029654975762760195, + "grad_norm": 53.0, + "learning_rate": 2.7272727272727274e-07, + "loss": 1.3304, + "mean_token_accuracy": 0.9080975726246834, + "num_tokens": 1553692.0, + "step": 13 + }, + { + "entropy": 0.5712269619107246, + "epoch": 0.031936127744510975, + "grad_norm": 51.5, + "learning_rate": 2.954545454545455e-07, + "loss": 1.3364, + "mean_token_accuracy": 0.9034472033381462, + "num_tokens": 1673526.0, + "step": 14 + }, + { + "entropy": 0.5682088285684586, + "epoch": 0.03421727972626176, + "grad_norm": 51.75, + "learning_rate": 3.181818181818182e-07, + "loss": 1.3281, + "mean_token_accuracy": 0.9054285809397697, + "num_tokens": 1792623.0, + "step": 15 + }, + { + "entropy": 0.5747991874814034, + "epoch": 0.03649843170801255, + "grad_norm": 54.5, + "learning_rate": 3.409090909090909e-07, + "loss": 1.3899, + "mean_token_accuracy": 0.9053085520863533, + "num_tokens": 1912419.0, + "step": 16 + }, + { + "entropy": 0.5684092417359352, + "epoch": 0.03877958368976333, + "grad_norm": 51.75, + "learning_rate": 3.6363636363636366e-07, + "loss": 1.3117, + "mean_token_accuracy": 0.9127963408827782, + "num_tokens": 2032162.0, + "step": 17 + }, + { + "entropy": 0.5723835676908493, + "epoch": 0.041060735671514116, + "grad_norm": 52.0, + "learning_rate": 3.8636363636363636e-07, + "loss": 1.378, + "mean_token_accuracy": 0.9022952020168304, + "num_tokens": 2152347.0, + "step": 18 + }, + { + "entropy": 0.5732080265879631, + "epoch": 0.043341887653264896, + "grad_norm": 54.75, + "learning_rate": 4.090909090909091e-07, + "loss": 1.4262, + "mean_token_accuracy": 0.902356244623661, + "num_tokens": 2272629.0, + "step": 19 + }, + { + "entropy": 0.5706452056765556, + "epoch": 0.04562303963501568, + "grad_norm": 56.25, + "learning_rate": 4.3181818181818187e-07, + "loss": 1.4329, + "mean_token_accuracy": 0.9016290977597237, + "num_tokens": 2391735.0, + "step": 20 + }, + { + "entropy": 0.576264813542366, + "epoch": 0.04790419161676647, + "grad_norm": 51.5, + "learning_rate": 4.5454545454545457e-07, + "loss": 1.351, + "mean_token_accuracy": 0.9047231674194336, + "num_tokens": 2511712.0, + "step": 21 + }, + { + "entropy": 0.5706119686365128, + "epoch": 0.05018534359851725, + "grad_norm": 54.5, + "learning_rate": 4.772727272727274e-07, + "loss": 1.3559, + "mean_token_accuracy": 0.9085729420185089, + "num_tokens": 2631143.0, + "step": 22 + }, + { + "entropy": 0.5698934495449066, + "epoch": 0.05246649558026804, + "grad_norm": 53.75, + "learning_rate": 5.000000000000001e-07, + "loss": 1.3501, + "mean_token_accuracy": 0.9051879644393921, + "num_tokens": 2750290.0, + "step": 23 + }, + { + "entropy": 0.5708443224430084, + "epoch": 0.05474764756201882, + "grad_norm": 53.25, + "learning_rate": 5.227272727272728e-07, + "loss": 1.3142, + "mean_token_accuracy": 0.9079254195094109, + "num_tokens": 2868777.0, + "step": 24 + }, + { + "entropy": 0.5746314749121666, + "epoch": 0.057028799543769604, + "grad_norm": 49.75, + "learning_rate": 5.454545454545455e-07, + "loss": 1.2755, + "mean_token_accuracy": 0.9074985161423683, + "num_tokens": 2988543.0, + "step": 25 + }, + { + "entropy": 0.5727974995970726, + "epoch": 0.05930995152552039, + "grad_norm": 50.25, + "learning_rate": 5.681818181818182e-07, + "loss": 1.36, + "mean_token_accuracy": 0.903306856751442, + "num_tokens": 3108652.0, + "step": 26 + }, + { + "entropy": 0.5660259872674942, + "epoch": 0.06159110350727117, + "grad_norm": 54.25, + "learning_rate": 5.90909090909091e-07, + "loss": 1.3542, + "mean_token_accuracy": 0.9063449427485466, + "num_tokens": 3227523.0, + "step": 27 + }, + { + "entropy": 0.5720612108707428, + "epoch": 0.06387225548902195, + "grad_norm": 53.75, + "learning_rate": 6.136363636363637e-07, + "loss": 1.358, + "mean_token_accuracy": 0.9061291888356209, + "num_tokens": 3347062.0, + "step": 28 + }, + { + "entropy": 0.5734340623021126, + "epoch": 0.06615340747077274, + "grad_norm": 54.75, + "learning_rate": 6.363636363636364e-07, + "loss": 1.3969, + "mean_token_accuracy": 0.8960991725325584, + "num_tokens": 3466369.0, + "step": 29 + }, + { + "entropy": 0.5688729137182236, + "epoch": 0.06843455945252352, + "grad_norm": 51.5, + "learning_rate": 6.590909090909091e-07, + "loss": 1.2862, + "mean_token_accuracy": 0.9103065803647041, + "num_tokens": 3585151.0, + "step": 30 + }, + { + "entropy": 0.5698296800255775, + "epoch": 0.07071571143427431, + "grad_norm": 53.5, + "learning_rate": 6.818181818181818e-07, + "loss": 1.3128, + "mean_token_accuracy": 0.9050886034965515, + "num_tokens": 3704589.0, + "step": 31 + }, + { + "entropy": 0.5683479458093643, + "epoch": 0.0729968634160251, + "grad_norm": 52.5, + "learning_rate": 7.045454545454545e-07, + "loss": 1.3293, + "mean_token_accuracy": 0.9052295759320259, + "num_tokens": 3824393.0, + "step": 32 + }, + { + "entropy": 0.5746268481016159, + "epoch": 0.07527801539777587, + "grad_norm": 52.0, + "learning_rate": 7.272727272727273e-07, + "loss": 1.3343, + "mean_token_accuracy": 0.9042957797646523, + "num_tokens": 3943582.0, + "step": 33 + }, + { + "entropy": 0.5718071088194847, + "epoch": 0.07755916737952666, + "grad_norm": 54.5, + "learning_rate": 7.5e-07, + "loss": 1.3532, + "mean_token_accuracy": 0.8998819962143898, + "num_tokens": 4063361.0, + "step": 34 + }, + { + "entropy": 0.5760804116725922, + "epoch": 0.07984031936127745, + "grad_norm": 51.25, + "learning_rate": 7.727272727272727e-07, + "loss": 1.2528, + "mean_token_accuracy": 0.9074670895934105, + "num_tokens": 4182322.0, + "step": 35 + }, + { + "entropy": 0.5720374584197998, + "epoch": 0.08212147134302823, + "grad_norm": 52.0, + "learning_rate": 7.954545454545455e-07, + "loss": 1.283, + "mean_token_accuracy": 0.9045210182666779, + "num_tokens": 4301566.0, + "step": 36 + }, + { + "entropy": 0.5763104930520058, + "epoch": 0.08440262332477902, + "grad_norm": 49.0, + "learning_rate": 8.181818181818182e-07, + "loss": 1.2674, + "mean_token_accuracy": 0.9053243845701218, + "num_tokens": 4421766.0, + "step": 37 + }, + { + "entropy": 0.5718304067850113, + "epoch": 0.08668377530652979, + "grad_norm": 51.25, + "learning_rate": 8.409090909090909e-07, + "loss": 1.236, + "mean_token_accuracy": 0.9109514802694321, + "num_tokens": 4540923.0, + "step": 38 + }, + { + "entropy": 0.571284256875515, + "epoch": 0.08896492728828058, + "grad_norm": 51.25, + "learning_rate": 8.636363636363637e-07, + "loss": 1.2506, + "mean_token_accuracy": 0.9077515155076981, + "num_tokens": 4660591.0, + "step": 39 + }, + { + "entropy": 0.573012188076973, + "epoch": 0.09124607927003137, + "grad_norm": 49.5, + "learning_rate": 8.863636363636364e-07, + "loss": 1.2915, + "mean_token_accuracy": 0.903707392513752, + "num_tokens": 4779930.0, + "step": 40 + }, + { + "entropy": 0.5761944428086281, + "epoch": 0.09352723125178215, + "grad_norm": 48.25, + "learning_rate": 9.090909090909091e-07, + "loss": 1.1862, + "mean_token_accuracy": 0.9115582630038261, + "num_tokens": 4900483.0, + "step": 41 + }, + { + "entropy": 0.5728463158011436, + "epoch": 0.09580838323353294, + "grad_norm": 53.0, + "learning_rate": 9.31818181818182e-07, + "loss": 1.2685, + "mean_token_accuracy": 0.9047084301710129, + "num_tokens": 5019007.0, + "step": 42 + }, + { + "entropy": 0.5770085528492928, + "epoch": 0.09808953521528371, + "grad_norm": 52.5, + "learning_rate": 9.545454545454548e-07, + "loss": 1.2548, + "mean_token_accuracy": 0.9081661030650139, + "num_tokens": 5138702.0, + "step": 43 + }, + { + "entropy": 0.5698555111885071, + "epoch": 0.1003706871970345, + "grad_norm": 50.25, + "learning_rate": 9.772727272727275e-07, + "loss": 1.2981, + "mean_token_accuracy": 0.9002158120274544, + "num_tokens": 5257886.0, + "step": 44 + }, + { + "entropy": 0.5708228722214699, + "epoch": 0.10265183917878529, + "grad_norm": 49.25, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.2456, + "mean_token_accuracy": 0.9025211557745934, + "num_tokens": 5376883.0, + "step": 45 + }, + { + "entropy": 0.5735449939966202, + "epoch": 0.10493299116053607, + "grad_norm": 50.5, + "learning_rate": 1.0227272727272729e-06, + "loss": 1.2242, + "mean_token_accuracy": 0.9060110151767731, + "num_tokens": 5496466.0, + "step": 46 + }, + { + "entropy": 0.5749260112643242, + "epoch": 0.10721414314228686, + "grad_norm": 46.0, + "learning_rate": 1.0454545454545456e-06, + "loss": 1.1537, + "mean_token_accuracy": 0.9106873646378517, + "num_tokens": 5616863.0, + "step": 47 + }, + { + "entropy": 0.5738902911543846, + "epoch": 0.10949529512403763, + "grad_norm": 45.75, + "learning_rate": 1.0681818181818183e-06, + "loss": 1.1601, + "mean_token_accuracy": 0.9126409217715263, + "num_tokens": 5736556.0, + "step": 48 + }, + { + "entropy": 0.5758886709809303, + "epoch": 0.11177644710578842, + "grad_norm": 44.5, + "learning_rate": 1.090909090909091e-06, + "loss": 1.1509, + "mean_token_accuracy": 0.908942386507988, + "num_tokens": 5856217.0, + "step": 49 + }, + { + "entropy": 0.5710620582103729, + "epoch": 0.11405759908753921, + "grad_norm": 45.5, + "learning_rate": 1.1136363636363637e-06, + "loss": 1.1709, + "mean_token_accuracy": 0.90366380661726, + "num_tokens": 5975044.0, + "step": 50 + }, + { + "entropy": 0.5815677642822266, + "epoch": 0.11633875106929, + "grad_norm": 47.75, + "learning_rate": 1.1363636363636364e-06, + "loss": 1.1953, + "mean_token_accuracy": 0.8997131288051605, + "num_tokens": 6094304.0, + "step": 51 + }, + { + "entropy": 0.5733738467097282, + "epoch": 0.11861990305104078, + "grad_norm": 42.75, + "learning_rate": 1.159090909090909e-06, + "loss": 1.119, + "mean_token_accuracy": 0.9051766842603683, + "num_tokens": 6213398.0, + "step": 52 + }, + { + "entropy": 0.5745926573872566, + "epoch": 0.12090105503279155, + "grad_norm": 40.25, + "learning_rate": 1.181818181818182e-06, + "loss": 1.0679, + "mean_token_accuracy": 0.9114581048488617, + "num_tokens": 6332996.0, + "step": 53 + }, + { + "entropy": 0.5782195627689362, + "epoch": 0.12318220701454234, + "grad_norm": 42.25, + "learning_rate": 1.2045454545454547e-06, + "loss": 1.1186, + "mean_token_accuracy": 0.9065061584115028, + "num_tokens": 6452429.0, + "step": 54 + }, + { + "entropy": 0.5730918794870377, + "epoch": 0.12546335899629313, + "grad_norm": 39.25, + "learning_rate": 1.2272727272727274e-06, + "loss": 1.0389, + "mean_token_accuracy": 0.910540983080864, + "num_tokens": 6571591.0, + "step": 55 + }, + { + "entropy": 0.57332344353199, + "epoch": 0.1277445109780439, + "grad_norm": 39.75, + "learning_rate": 1.25e-06, + "loss": 1.0772, + "mean_token_accuracy": 0.9099193960428238, + "num_tokens": 6690887.0, + "step": 56 + }, + { + "entropy": 0.5828090459108353, + "epoch": 0.1300256629597947, + "grad_norm": 36.5, + "learning_rate": 1.2727272727272728e-06, + "loss": 1.03, + "mean_token_accuracy": 0.9091303125023842, + "num_tokens": 6811214.0, + "step": 57 + }, + { + "entropy": 0.5775460302829742, + "epoch": 0.13230681494154548, + "grad_norm": 37.0, + "learning_rate": 1.2954545454545455e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.9099651798605919, + "num_tokens": 6930725.0, + "step": 58 + }, + { + "entropy": 0.5785462707281113, + "epoch": 0.13458796692329628, + "grad_norm": 37.25, + "learning_rate": 1.3181818181818182e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.9128212556242943, + "num_tokens": 7050305.0, + "step": 59 + }, + { + "entropy": 0.581469714641571, + "epoch": 0.13686911890504705, + "grad_norm": 34.75, + "learning_rate": 1.3409090909090911e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.9069154113531113, + "num_tokens": 7170055.0, + "step": 60 + }, + { + "entropy": 0.57868592441082, + "epoch": 0.13915027088679782, + "grad_norm": 34.75, + "learning_rate": 1.3636363636363636e-06, + "loss": 1.005, + "mean_token_accuracy": 0.9094131141901016, + "num_tokens": 7289469.0, + "step": 61 + }, + { + "entropy": 0.5818331465125084, + "epoch": 0.14143142286854862, + "grad_norm": 35.5, + "learning_rate": 1.3863636363636365e-06, + "loss": 1.009, + "mean_token_accuracy": 0.9051011204719543, + "num_tokens": 7408460.0, + "step": 62 + }, + { + "entropy": 0.578886553645134, + "epoch": 0.1437125748502994, + "grad_norm": 32.0, + "learning_rate": 1.409090909090909e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.9061152189970016, + "num_tokens": 7528372.0, + "step": 63 + }, + { + "entropy": 0.5784793645143509, + "epoch": 0.1459937268320502, + "grad_norm": 32.75, + "learning_rate": 1.431818181818182e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.9074001610279083, + "num_tokens": 7647793.0, + "step": 64 + }, + { + "entropy": 0.5717402920126915, + "epoch": 0.14827487881380097, + "grad_norm": 32.25, + "learning_rate": 1.4545454545454546e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.9059121236205101, + "num_tokens": 7767246.0, + "step": 65 + }, + { + "entropy": 0.5778979435563087, + "epoch": 0.15055603079555174, + "grad_norm": 30.875, + "learning_rate": 1.4772727272727275e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.9133669435977936, + "num_tokens": 7886226.0, + "step": 66 + }, + { + "entropy": 0.5811870247125626, + "epoch": 0.15283718277730254, + "grad_norm": 30.0, + "learning_rate": 1.5e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.9128276705741882, + "num_tokens": 8005391.0, + "step": 67 + }, + { + "entropy": 0.5817323103547096, + "epoch": 0.15511833475905332, + "grad_norm": 29.125, + "learning_rate": 1.522727272727273e-06, + "loss": 0.8459, + "mean_token_accuracy": 0.9136184528470039, + "num_tokens": 8124768.0, + "step": 68 + }, + { + "entropy": 0.5761328861117363, + "epoch": 0.15739948674080412, + "grad_norm": 27.875, + "learning_rate": 1.5454545454545454e-06, + "loss": 0.8087, + "mean_token_accuracy": 0.9209216013550758, + "num_tokens": 8244554.0, + "step": 69 + }, + { + "entropy": 0.5841421559453011, + "epoch": 0.1596806387225549, + "grad_norm": 27.0, + "learning_rate": 1.5681818181818184e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.913877047598362, + "num_tokens": 8365421.0, + "step": 70 + }, + { + "entropy": 0.5766400545835495, + "epoch": 0.16196179070430566, + "grad_norm": 29.0, + "learning_rate": 1.590909090909091e-06, + "loss": 0.929, + "mean_token_accuracy": 0.9063966274261475, + "num_tokens": 8484652.0, + "step": 71 + }, + { + "entropy": 0.5818845704197884, + "epoch": 0.16424294268605646, + "grad_norm": 27.875, + "learning_rate": 1.613636363636364e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.9109550714492798, + "num_tokens": 8603764.0, + "step": 72 + }, + { + "entropy": 0.580779179930687, + "epoch": 0.16652409466780724, + "grad_norm": 27.375, + "learning_rate": 1.6363636363636365e-06, + "loss": 0.8201, + "mean_token_accuracy": 0.9158348441123962, + "num_tokens": 8722882.0, + "step": 73 + }, + { + "entropy": 0.5787169560790062, + "epoch": 0.16880524664955804, + "grad_norm": 24.125, + "learning_rate": 1.6590909090909094e-06, + "loss": 0.7721, + "mean_token_accuracy": 0.9182551130652428, + "num_tokens": 8842449.0, + "step": 74 + }, + { + "entropy": 0.5825136750936508, + "epoch": 0.1710863986313088, + "grad_norm": 24.625, + "learning_rate": 1.6818181818181819e-06, + "loss": 0.7719, + "mean_token_accuracy": 0.9167500585317612, + "num_tokens": 8961388.0, + "step": 75 + }, + { + "entropy": 0.5806114822626114, + "epoch": 0.17336755061305958, + "grad_norm": 23.0, + "learning_rate": 1.7045454545454546e-06, + "loss": 0.767, + "mean_token_accuracy": 0.9170192554593086, + "num_tokens": 9081298.0, + "step": 76 + }, + { + "entropy": 0.5778767764568329, + "epoch": 0.17564870259481039, + "grad_norm": 22.625, + "learning_rate": 1.7272727272727275e-06, + "loss": 0.7586, + "mean_token_accuracy": 0.9167584925889969, + "num_tokens": 9200374.0, + "step": 77 + }, + { + "entropy": 0.5822614654898643, + "epoch": 0.17792985457656116, + "grad_norm": 20.5, + "learning_rate": 1.75e-06, + "loss": 0.6506, + "mean_token_accuracy": 0.9256497099995613, + "num_tokens": 9319540.0, + "step": 78 + }, + { + "entropy": 0.5792432054877281, + "epoch": 0.18021100655831196, + "grad_norm": 21.25, + "learning_rate": 1.7727272727272729e-06, + "loss": 0.7404, + "mean_token_accuracy": 0.9163892567157745, + "num_tokens": 9439220.0, + "step": 79 + }, + { + "entropy": 0.5805418267846107, + "epoch": 0.18249215854006273, + "grad_norm": 21.125, + "learning_rate": 1.7954545454545456e-06, + "loss": 0.7277, + "mean_token_accuracy": 0.9185236766934395, + "num_tokens": 9558386.0, + "step": 80 + }, + { + "entropy": 0.5810528621077538, + "epoch": 0.1847733105218135, + "grad_norm": 19.75, + "learning_rate": 1.8181818181818183e-06, + "loss": 0.6641, + "mean_token_accuracy": 0.9253975301980972, + "num_tokens": 9677836.0, + "step": 81 + }, + { + "entropy": 0.5840262919664383, + "epoch": 0.1870544625035643, + "grad_norm": 19.0, + "learning_rate": 1.840909090909091e-06, + "loss": 0.65, + "mean_token_accuracy": 0.9248125478625298, + "num_tokens": 9797543.0, + "step": 82 + }, + { + "entropy": 0.5836752355098724, + "epoch": 0.18933561448531508, + "grad_norm": 19.125, + "learning_rate": 1.863636363636364e-06, + "loss": 0.6378, + "mean_token_accuracy": 0.9245976060628891, + "num_tokens": 9916465.0, + "step": 83 + }, + { + "entropy": 0.5778796598315239, + "epoch": 0.19161676646706588, + "grad_norm": 19.125, + "learning_rate": 1.8863636363636364e-06, + "loss": 0.631, + "mean_token_accuracy": 0.92706698179245, + "num_tokens": 10035404.0, + "step": 84 + }, + { + "entropy": 0.5819105505943298, + "epoch": 0.19389791844881665, + "grad_norm": 17.75, + "learning_rate": 1.9090909090909095e-06, + "loss": 0.5929, + "mean_token_accuracy": 0.9258014112710953, + "num_tokens": 10154792.0, + "step": 85 + }, + { + "entropy": 0.574604719877243, + "epoch": 0.19617907043056743, + "grad_norm": 17.375, + "learning_rate": 1.931818181818182e-06, + "loss": 0.5607, + "mean_token_accuracy": 0.9272690638899803, + "num_tokens": 10274861.0, + "step": 86 + }, + { + "entropy": 0.5808912217617035, + "epoch": 0.19846022241231823, + "grad_norm": 18.25, + "learning_rate": 1.954545454545455e-06, + "loss": 0.6017, + "mean_token_accuracy": 0.9215075969696045, + "num_tokens": 10393749.0, + "step": 87 + }, + { + "entropy": 0.5790002271533012, + "epoch": 0.200741374394069, + "grad_norm": 17.25, + "learning_rate": 1.977272727272727e-06, + "loss": 0.5461, + "mean_token_accuracy": 0.9249177202582359, + "num_tokens": 10512863.0, + "step": 88 + }, + { + "entropy": 0.5799324288964272, + "epoch": 0.2030225263758198, + "grad_norm": 17.5, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.9279793202877045, + "num_tokens": 10631680.0, + "step": 89 + }, + { + "entropy": 0.5860020443797112, + "epoch": 0.20530367835757057, + "grad_norm": 17.0, + "learning_rate": 2.022727272727273e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.9273342564702034, + "num_tokens": 10751091.0, + "step": 90 + }, + { + "entropy": 0.5789448618888855, + "epoch": 0.20758483033932135, + "grad_norm": 16.75, + "learning_rate": 2.0454545454545457e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.927060566842556, + "num_tokens": 10869623.0, + "step": 91 + }, + { + "entropy": 0.5739471390843391, + "epoch": 0.20986598232107215, + "grad_norm": 16.875, + "learning_rate": 2.0681818181818184e-06, + "loss": 0.455, + "mean_token_accuracy": 0.9310638904571533, + "num_tokens": 10988093.0, + "step": 92 + }, + { + "entropy": 0.5817026495933533, + "epoch": 0.21214713430282292, + "grad_norm": 16.75, + "learning_rate": 2.090909090909091e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.9315683022141457, + "num_tokens": 11107421.0, + "step": 93 + }, + { + "entropy": 0.578296422958374, + "epoch": 0.21442828628457372, + "grad_norm": 17.25, + "learning_rate": 2.113636363636364e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.9340006485581398, + "num_tokens": 11226580.0, + "step": 94 + }, + { + "entropy": 0.5821548402309418, + "epoch": 0.2167094382663245, + "grad_norm": 16.5, + "learning_rate": 2.1363636363636365e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.9314954876899719, + "num_tokens": 11346427.0, + "step": 95 + }, + { + "entropy": 0.5768998041749, + "epoch": 0.21899059024807527, + "grad_norm": 17.375, + "learning_rate": 2.1590909090909092e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.9311308413743973, + "num_tokens": 11465280.0, + "step": 96 + }, + { + "entropy": 0.575069434940815, + "epoch": 0.22127174222982607, + "grad_norm": 17.875, + "learning_rate": 2.181818181818182e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.9266713336110115, + "num_tokens": 11584144.0, + "step": 97 + }, + { + "entropy": 0.5791313648223877, + "epoch": 0.22355289421157684, + "grad_norm": 17.625, + "learning_rate": 2.2045454545454547e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.9306733831763268, + "num_tokens": 11704064.0, + "step": 98 + }, + { + "entropy": 0.5747585669159889, + "epoch": 0.22583404619332764, + "grad_norm": 17.625, + "learning_rate": 2.2272727272727274e-06, + "loss": 0.2721, + "mean_token_accuracy": 0.9320469424128532, + "num_tokens": 11823033.0, + "step": 99 + }, + { + "entropy": 0.5839685648679733, + "epoch": 0.22811519817507842, + "grad_norm": 16.375, + "learning_rate": 2.25e-06, + "loss": 0.2499, + "mean_token_accuracy": 0.9301044270396233, + "num_tokens": 11942892.0, + "step": 100 + }, + { + "entropy": 0.5741793066263199, + "epoch": 0.2303963501568292, + "grad_norm": 13.0, + "learning_rate": 2.2727272727272728e-06, + "loss": 0.2125, + "mean_token_accuracy": 0.9526615142822266, + "num_tokens": 12061926.0, + "step": 101 + }, + { + "entropy": 0.5711572021245956, + "epoch": 0.23267750213858, + "grad_norm": 10.25, + "learning_rate": 2.295454545454546e-06, + "loss": 0.1687, + "mean_token_accuracy": 0.9580799341201782, + "num_tokens": 12180625.0, + "step": 102 + }, + { + "entropy": 0.5671057850122452, + "epoch": 0.23495865412033076, + "grad_norm": 6.71875, + "learning_rate": 2.318181818181818e-06, + "loss": 0.1629, + "mean_token_accuracy": 0.9616034477949142, + "num_tokens": 12300300.0, + "step": 103 + }, + { + "entropy": 0.5693527311086655, + "epoch": 0.23723980610208156, + "grad_norm": 6.65625, + "learning_rate": 2.3409090909090913e-06, + "loss": 0.1727, + "mean_token_accuracy": 0.9629861861467361, + "num_tokens": 12419193.0, + "step": 104 + }, + { + "entropy": 0.5628181397914886, + "epoch": 0.23952095808383234, + "grad_norm": 5.21875, + "learning_rate": 2.363636363636364e-06, + "loss": 0.1289, + "mean_token_accuracy": 0.9711545035243034, + "num_tokens": 12538005.0, + "step": 105 + }, + { + "entropy": 0.5669457167387009, + "epoch": 0.2418021100655831, + "grad_norm": 6.9375, + "learning_rate": 2.3863636363636367e-06, + "loss": 0.188, + "mean_token_accuracy": 0.9620686247944832, + "num_tokens": 12657222.0, + "step": 106 + }, + { + "entropy": 0.5646533519029617, + "epoch": 0.2440832620473339, + "grad_norm": 4.40625, + "learning_rate": 2.4090909090909094e-06, + "loss": 0.1209, + "mean_token_accuracy": 0.9728151708841324, + "num_tokens": 12776642.0, + "step": 107 + }, + { + "entropy": 0.5660450905561447, + "epoch": 0.24636441402908468, + "grad_norm": 4.4375, + "learning_rate": 2.431818181818182e-06, + "loss": 0.1149, + "mean_token_accuracy": 0.9723346084356308, + "num_tokens": 12895828.0, + "step": 108 + }, + { + "entropy": 0.5642238110303879, + "epoch": 0.24864556601083548, + "grad_norm": 4.9375, + "learning_rate": 2.454545454545455e-06, + "loss": 0.1354, + "mean_token_accuracy": 0.9705220386385918, + "num_tokens": 13015594.0, + "step": 109 + }, + { + "entropy": 0.5598783493041992, + "epoch": 0.25092671799258626, + "grad_norm": 3.828125, + "learning_rate": 2.4772727272727275e-06, + "loss": 0.1122, + "mean_token_accuracy": 0.9752013236284256, + "num_tokens": 13133936.0, + "step": 110 + }, + { + "entropy": 0.5576607659459114, + "epoch": 0.25320786997433703, + "grad_norm": 3.125, + "learning_rate": 2.5e-06, + "loss": 0.1007, + "mean_token_accuracy": 0.9725182577967644, + "num_tokens": 13253274.0, + "step": 111 + }, + { + "entropy": 0.5653363168239594, + "epoch": 0.2554890219560878, + "grad_norm": 3.140625, + "learning_rate": 2.522727272727273e-06, + "loss": 0.1041, + "mean_token_accuracy": 0.9740225225687027, + "num_tokens": 13372166.0, + "step": 112 + }, + { + "entropy": 0.566494345664978, + "epoch": 0.25777017393783863, + "grad_norm": 3.109375, + "learning_rate": 2.5454545454545456e-06, + "loss": 0.108, + "mean_token_accuracy": 0.9746614545583725, + "num_tokens": 13491649.0, + "step": 113 + }, + { + "entropy": 0.563439629971981, + "epoch": 0.2600513259195894, + "grad_norm": 2.515625, + "learning_rate": 2.5681818181818187e-06, + "loss": 0.096, + "mean_token_accuracy": 0.9751686751842499, + "num_tokens": 13610906.0, + "step": 114 + }, + { + "entropy": 0.5674616098403931, + "epoch": 0.2623324779013402, + "grad_norm": 2.84375, + "learning_rate": 2.590909090909091e-06, + "loss": 0.1018, + "mean_token_accuracy": 0.9723179414868355, + "num_tokens": 13730508.0, + "step": 115 + }, + { + "entropy": 0.5624644309282303, + "epoch": 0.26461362988309095, + "grad_norm": 2.421875, + "learning_rate": 2.6136363636363637e-06, + "loss": 0.0931, + "mean_token_accuracy": 0.9802921339869499, + "num_tokens": 13849486.0, + "step": 116 + }, + { + "entropy": 0.5664628148078918, + "epoch": 0.2668947818648417, + "grad_norm": 2.59375, + "learning_rate": 2.6363636363636364e-06, + "loss": 0.0927, + "mean_token_accuracy": 0.9759863987565041, + "num_tokens": 13968481.0, + "step": 117 + }, + { + "entropy": 0.5751423835754395, + "epoch": 0.26917593384659255, + "grad_norm": 2.34375, + "learning_rate": 2.6590909090909095e-06, + "loss": 0.0781, + "mean_token_accuracy": 0.9776225313544273, + "num_tokens": 14087996.0, + "step": 118 + }, + { + "entropy": 0.5694226920604706, + "epoch": 0.2714570858283433, + "grad_norm": 1.96875, + "learning_rate": 2.6818181818181822e-06, + "loss": 0.0868, + "mean_token_accuracy": 0.9800565242767334, + "num_tokens": 14207152.0, + "step": 119 + }, + { + "entropy": 0.5768900290131569, + "epoch": 0.2737382378100941, + "grad_norm": 2.234375, + "learning_rate": 2.7045454545454545e-06, + "loss": 0.0948, + "mean_token_accuracy": 0.9744055718183517, + "num_tokens": 14327279.0, + "step": 120 + }, + { + "entropy": 0.5770423859357834, + "epoch": 0.27601938979184487, + "grad_norm": 2.421875, + "learning_rate": 2.7272727272727272e-06, + "loss": 0.0924, + "mean_token_accuracy": 0.9754351228475571, + "num_tokens": 14446607.0, + "step": 121 + }, + { + "entropy": 0.5726769268512726, + "epoch": 0.27830054177359564, + "grad_norm": 2.125, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.0929, + "mean_token_accuracy": 0.9764109626412392, + "num_tokens": 14565669.0, + "step": 122 + }, + { + "entropy": 0.5758297368884087, + "epoch": 0.2805816937553465, + "grad_norm": 1.8125, + "learning_rate": 2.772727272727273e-06, + "loss": 0.0699, + "mean_token_accuracy": 0.9837547391653061, + "num_tokens": 14684468.0, + "step": 123 + }, + { + "entropy": 0.5852851271629333, + "epoch": 0.28286284573709725, + "grad_norm": 1.828125, + "learning_rate": 2.7954545454545458e-06, + "loss": 0.0817, + "mean_token_accuracy": 0.9763156399130821, + "num_tokens": 14804194.0, + "step": 124 + }, + { + "entropy": 0.5781034901738167, + "epoch": 0.285143997718848, + "grad_norm": 2.078125, + "learning_rate": 2.818181818181818e-06, + "loss": 0.0896, + "mean_token_accuracy": 0.9771495833992958, + "num_tokens": 14923297.0, + "step": 125 + }, + { + "entropy": 0.5813179761171341, + "epoch": 0.2874251497005988, + "grad_norm": 1.859375, + "learning_rate": 2.8409090909090916e-06, + "loss": 0.0737, + "mean_token_accuracy": 0.9808864295482635, + "num_tokens": 15042624.0, + "step": 126 + }, + { + "entropy": 0.58285191655159, + "epoch": 0.28970630168234957, + "grad_norm": 1.4609375, + "learning_rate": 2.863636363636364e-06, + "loss": 0.0633, + "mean_token_accuracy": 0.9820951223373413, + "num_tokens": 15161900.0, + "step": 127 + }, + { + "entropy": 0.5854439809918404, + "epoch": 0.2919874536641004, + "grad_norm": 1.546875, + "learning_rate": 2.8863636363636366e-06, + "loss": 0.067, + "mean_token_accuracy": 0.9814109355211258, + "num_tokens": 15280942.0, + "step": 128 + }, + { + "entropy": 0.580375611782074, + "epoch": 0.29426860564585117, + "grad_norm": 1.546875, + "learning_rate": 2.9090909090909093e-06, + "loss": 0.0631, + "mean_token_accuracy": 0.9831164106726646, + "num_tokens": 15400056.0, + "step": 129 + }, + { + "entropy": 0.585293747484684, + "epoch": 0.29654975762760194, + "grad_norm": 1.4453125, + "learning_rate": 2.931818181818182e-06, + "loss": 0.0517, + "mean_token_accuracy": 0.9853853285312653, + "num_tokens": 15519664.0, + "step": 130 + }, + { + "entropy": 0.5874677374958992, + "epoch": 0.2988309096093527, + "grad_norm": 1.328125, + "learning_rate": 2.954545454545455e-06, + "loss": 0.0622, + "mean_token_accuracy": 0.9839051514863968, + "num_tokens": 15639225.0, + "step": 131 + }, + { + "entropy": 0.587916225194931, + "epoch": 0.3011120615911035, + "grad_norm": 1.4453125, + "learning_rate": 2.9772727272727274e-06, + "loss": 0.0743, + "mean_token_accuracy": 0.9831549376249313, + "num_tokens": 15758796.0, + "step": 132 + }, + { + "entropy": 0.5895832628011703, + "epoch": 0.3033932135728543, + "grad_norm": 1.4609375, + "learning_rate": 3e-06, + "loss": 0.0612, + "mean_token_accuracy": 0.982489787042141, + "num_tokens": 15878224.0, + "step": 133 + }, + { + "entropy": 0.5908492207527161, + "epoch": 0.3056743655546051, + "grad_norm": 1.1484375, + "learning_rate": 3.0227272727272728e-06, + "loss": 0.0508, + "mean_token_accuracy": 0.9875214323401451, + "num_tokens": 15997759.0, + "step": 134 + }, + { + "entropy": 0.595714882016182, + "epoch": 0.30795551753635586, + "grad_norm": 1.2890625, + "learning_rate": 3.045454545454546e-06, + "loss": 0.0673, + "mean_token_accuracy": 0.980923555791378, + "num_tokens": 16117622.0, + "step": 135 + }, + { + "entropy": 0.5900622010231018, + "epoch": 0.31023666951810663, + "grad_norm": 1.2265625, + "learning_rate": 3.0681818181818186e-06, + "loss": 0.0504, + "mean_token_accuracy": 0.9848937466740608, + "num_tokens": 16237476.0, + "step": 136 + }, + { + "entropy": 0.5910263136029243, + "epoch": 0.3125178214998574, + "grad_norm": 1.2734375, + "learning_rate": 3.090909090909091e-06, + "loss": 0.0629, + "mean_token_accuracy": 0.9823866933584213, + "num_tokens": 16356353.0, + "step": 137 + }, + { + "entropy": 0.5922799929976463, + "epoch": 0.31479897348160824, + "grad_norm": 1.078125, + "learning_rate": 3.1136363636363636e-06, + "loss": 0.0504, + "mean_token_accuracy": 0.9865441173315048, + "num_tokens": 16475307.0, + "step": 138 + }, + { + "entropy": 0.5950908437371254, + "epoch": 0.317080125463359, + "grad_norm": 1.4609375, + "learning_rate": 3.1363636363636367e-06, + "loss": 0.0501, + "mean_token_accuracy": 0.9847164377570152, + "num_tokens": 16594490.0, + "step": 139 + }, + { + "entropy": 0.5962411016225815, + "epoch": 0.3193612774451098, + "grad_norm": 1.34375, + "learning_rate": 3.1590909090909094e-06, + "loss": 0.0517, + "mean_token_accuracy": 0.9846132472157478, + "num_tokens": 16713953.0, + "step": 140 + }, + { + "entropy": 0.5948521047830582, + "epoch": 0.32164242942686055, + "grad_norm": 1.21875, + "learning_rate": 3.181818181818182e-06, + "loss": 0.063, + "mean_token_accuracy": 0.9835354760289192, + "num_tokens": 16834033.0, + "step": 141 + }, + { + "entropy": 0.5959697589278221, + "epoch": 0.3239235814086113, + "grad_norm": 1.2578125, + "learning_rate": 3.204545454545455e-06, + "loss": 0.0618, + "mean_token_accuracy": 0.9832647070288658, + "num_tokens": 16953847.0, + "step": 142 + }, + { + "entropy": 0.5913911014795303, + "epoch": 0.32620473339036216, + "grad_norm": 1.21875, + "learning_rate": 3.227272727272728e-06, + "loss": 0.0484, + "mean_token_accuracy": 0.9855299293994904, + "num_tokens": 17072974.0, + "step": 143 + }, + { + "entropy": 0.591980017721653, + "epoch": 0.32848588537211293, + "grad_norm": 1.0546875, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.0573, + "mean_token_accuracy": 0.9841192588210106, + "num_tokens": 17191866.0, + "step": 144 + }, + { + "entropy": 0.5970142707228661, + "epoch": 0.3307670373538637, + "grad_norm": 1.046875, + "learning_rate": 3.272727272727273e-06, + "loss": 0.0541, + "mean_token_accuracy": 0.9840571284294128, + "num_tokens": 17310983.0, + "step": 145 + }, + { + "entropy": 0.5991972535848618, + "epoch": 0.3330481893356145, + "grad_norm": 1.0078125, + "learning_rate": 3.2954545454545456e-06, + "loss": 0.054, + "mean_token_accuracy": 0.9856197834014893, + "num_tokens": 17430718.0, + "step": 146 + }, + { + "entropy": 0.5930428057909012, + "epoch": 0.33532934131736525, + "grad_norm": 1.0, + "learning_rate": 3.3181818181818188e-06, + "loss": 0.0442, + "mean_token_accuracy": 0.9851992428302765, + "num_tokens": 17550290.0, + "step": 147 + }, + { + "entropy": 0.597101092338562, + "epoch": 0.3376104932991161, + "grad_norm": 1.1953125, + "learning_rate": 3.3409090909090915e-06, + "loss": 0.0513, + "mean_token_accuracy": 0.987654335796833, + "num_tokens": 17670093.0, + "step": 148 + }, + { + "entropy": 0.6017784476280212, + "epoch": 0.33989164528086685, + "grad_norm": 0.98046875, + "learning_rate": 3.3636363636363637e-06, + "loss": 0.052, + "mean_token_accuracy": 0.9840261936187744, + "num_tokens": 17788953.0, + "step": 149 + }, + { + "entropy": 0.5995773375034332, + "epoch": 0.3421727972626176, + "grad_norm": 1.046875, + "learning_rate": 3.3863636363636364e-06, + "loss": 0.0488, + "mean_token_accuracy": 0.9836256727576256, + "num_tokens": 17908298.0, + "step": 150 + }, + { + "entropy": 0.5998736247420311, + "epoch": 0.3444539492443684, + "grad_norm": 1.078125, + "learning_rate": 3.409090909090909e-06, + "loss": 0.0421, + "mean_token_accuracy": 0.9883076623082161, + "num_tokens": 18027243.0, + "step": 151 + }, + { + "entropy": 0.5996088534593582, + "epoch": 0.34673510122611917, + "grad_norm": 1.234375, + "learning_rate": 3.4318181818181823e-06, + "loss": 0.0462, + "mean_token_accuracy": 0.9839991182088852, + "num_tokens": 18145729.0, + "step": 152 + }, + { + "entropy": 0.5992863178253174, + "epoch": 0.34901625320787, + "grad_norm": 0.7734375, + "learning_rate": 3.454545454545455e-06, + "loss": 0.0323, + "mean_token_accuracy": 0.9896834045648575, + "num_tokens": 18264942.0, + "step": 153 + }, + { + "entropy": 0.5995401740074158, + "epoch": 0.35129740518962077, + "grad_norm": 1.0859375, + "learning_rate": 3.4772727272727277e-06, + "loss": 0.0548, + "mean_token_accuracy": 0.985793225467205, + "num_tokens": 18384365.0, + "step": 154 + }, + { + "entropy": 0.5997815579175949, + "epoch": 0.35357855717137154, + "grad_norm": 1.015625, + "learning_rate": 3.5e-06, + "loss": 0.0492, + "mean_token_accuracy": 0.9854667708277702, + "num_tokens": 18503886.0, + "step": 155 + }, + { + "entropy": 0.6035747826099396, + "epoch": 0.3558597091531223, + "grad_norm": 0.875, + "learning_rate": 3.522727272727273e-06, + "loss": 0.0432, + "mean_token_accuracy": 0.9869493171572685, + "num_tokens": 18623306.0, + "step": 156 + }, + { + "entropy": 0.5995756909251213, + "epoch": 0.3581408611348731, + "grad_norm": 1.1796875, + "learning_rate": 3.5454545454545458e-06, + "loss": 0.0618, + "mean_token_accuracy": 0.9843136370182037, + "num_tokens": 18742750.0, + "step": 157 + }, + { + "entropy": 0.6071174517273903, + "epoch": 0.3604220131166239, + "grad_norm": 0.94921875, + "learning_rate": 3.5681818181818185e-06, + "loss": 0.0493, + "mean_token_accuracy": 0.9871895685791969, + "num_tokens": 18862602.0, + "step": 158 + }, + { + "entropy": 0.6019565984606743, + "epoch": 0.3627031650983747, + "grad_norm": 1.046875, + "learning_rate": 3.590909090909091e-06, + "loss": 0.054, + "mean_token_accuracy": 0.9862797483801842, + "num_tokens": 18981488.0, + "step": 159 + }, + { + "entropy": 0.5990791544318199, + "epoch": 0.36498431708012546, + "grad_norm": 0.7890625, + "learning_rate": 3.6136363636363643e-06, + "loss": 0.0358, + "mean_token_accuracy": 0.9891152530908585, + "num_tokens": 19100627.0, + "step": 160 + }, + { + "entropy": 0.6006933972239494, + "epoch": 0.36726546906187624, + "grad_norm": 1.078125, + "learning_rate": 3.6363636363636366e-06, + "loss": 0.0555, + "mean_token_accuracy": 0.9844802990555763, + "num_tokens": 19220206.0, + "step": 161 + }, + { + "entropy": 0.6048427298665047, + "epoch": 0.369546621043627, + "grad_norm": 0.98046875, + "learning_rate": 3.6590909090909093e-06, + "loss": 0.0447, + "mean_token_accuracy": 0.9887258186936378, + "num_tokens": 19339357.0, + "step": 162 + }, + { + "entropy": 0.6065418422222137, + "epoch": 0.37182777302537784, + "grad_norm": 0.74609375, + "learning_rate": 3.681818181818182e-06, + "loss": 0.0405, + "mean_token_accuracy": 0.9921270906925201, + "num_tokens": 19459247.0, + "step": 163 + }, + { + "entropy": 0.6037385165691376, + "epoch": 0.3741089250071286, + "grad_norm": 1.0625, + "learning_rate": 3.704545454545455e-06, + "loss": 0.0451, + "mean_token_accuracy": 0.9858360216021538, + "num_tokens": 19578452.0, + "step": 164 + }, + { + "entropy": 0.6031596437096596, + "epoch": 0.3763900769888794, + "grad_norm": 0.84765625, + "learning_rate": 3.727272727272728e-06, + "loss": 0.0366, + "mean_token_accuracy": 0.9886760339140892, + "num_tokens": 19697566.0, + "step": 165 + }, + { + "entropy": 0.6016586646437645, + "epoch": 0.37867122897063016, + "grad_norm": 0.875, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0465, + "mean_token_accuracy": 0.9878827854990959, + "num_tokens": 19816871.0, + "step": 166 + }, + { + "entropy": 0.6079351529479027, + "epoch": 0.38095238095238093, + "grad_norm": 0.9765625, + "learning_rate": 3.772727272727273e-06, + "loss": 0.0489, + "mean_token_accuracy": 0.9861183390021324, + "num_tokens": 19936910.0, + "step": 167 + }, + { + "entropy": 0.6108661890029907, + "epoch": 0.38323353293413176, + "grad_norm": 0.86328125, + "learning_rate": 3.7954545454545455e-06, + "loss": 0.0419, + "mean_token_accuracy": 0.9867944195866585, + "num_tokens": 20056236.0, + "step": 168 + }, + { + "entropy": 0.6097797527909279, + "epoch": 0.38551468491588253, + "grad_norm": 0.92578125, + "learning_rate": 3.818181818181819e-06, + "loss": 0.0462, + "mean_token_accuracy": 0.9884969517588615, + "num_tokens": 20175618.0, + "step": 169 + }, + { + "entropy": 0.6033886298537254, + "epoch": 0.3877958368976333, + "grad_norm": 0.91796875, + "learning_rate": 3.840909090909091e-06, + "loss": 0.0421, + "mean_token_accuracy": 0.9885569885373116, + "num_tokens": 20294557.0, + "step": 170 + }, + { + "entropy": 0.605580247938633, + "epoch": 0.3900769888793841, + "grad_norm": 1.03125, + "learning_rate": 3.863636363636364e-06, + "loss": 0.0501, + "mean_token_accuracy": 0.9867831692099571, + "num_tokens": 20413505.0, + "step": 171 + }, + { + "entropy": 0.6040163710713387, + "epoch": 0.39235814086113485, + "grad_norm": 1.2109375, + "learning_rate": 3.886363636363637e-06, + "loss": 0.0506, + "mean_token_accuracy": 0.9871018826961517, + "num_tokens": 20532293.0, + "step": 172 + }, + { + "entropy": 0.6123924478888512, + "epoch": 0.3946392928428857, + "grad_norm": 0.77734375, + "learning_rate": 3.90909090909091e-06, + "loss": 0.04, + "mean_token_accuracy": 0.9871955290436745, + "num_tokens": 20652122.0, + "step": 173 + }, + { + "entropy": 0.6161603555083275, + "epoch": 0.39692044482463645, + "grad_norm": 0.6953125, + "learning_rate": 3.931818181818182e-06, + "loss": 0.0324, + "mean_token_accuracy": 0.9926881417632103, + "num_tokens": 20771419.0, + "step": 174 + }, + { + "entropy": 0.6049137562513351, + "epoch": 0.3992015968063872, + "grad_norm": 0.88671875, + "learning_rate": 3.954545454545454e-06, + "loss": 0.0486, + "mean_token_accuracy": 0.988007090985775, + "num_tokens": 20891334.0, + "step": 175 + }, + { + "entropy": 0.6094447746872902, + "epoch": 0.401482748788138, + "grad_norm": 0.82421875, + "learning_rate": 3.9772727272727275e-06, + "loss": 0.0335, + "mean_token_accuracy": 0.9907999411225319, + "num_tokens": 21010898.0, + "step": 176 + }, + { + "entropy": 0.6112618297338486, + "epoch": 0.4037639007698888, + "grad_norm": 0.75390625, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0363, + "mean_token_accuracy": 0.9884474650025368, + "num_tokens": 21130204.0, + "step": 177 + }, + { + "entropy": 0.6063515245914459, + "epoch": 0.4060450527516396, + "grad_norm": 0.8984375, + "learning_rate": 4.022727272727273e-06, + "loss": 0.042, + "mean_token_accuracy": 0.985651969909668, + "num_tokens": 21248953.0, + "step": 178 + }, + { + "entropy": 0.6130170971155167, + "epoch": 0.4083262047333904, + "grad_norm": 0.984375, + "learning_rate": 4.045454545454546e-06, + "loss": 0.0511, + "mean_token_accuracy": 0.9862892627716064, + "num_tokens": 21368038.0, + "step": 179 + }, + { + "entropy": 0.6138986647129059, + "epoch": 0.41060735671514115, + "grad_norm": 1.0234375, + "learning_rate": 4.068181818181818e-06, + "loss": 0.0439, + "mean_token_accuracy": 0.9883160963654518, + "num_tokens": 21488121.0, + "step": 180 + }, + { + "entropy": 0.6127195656299591, + "epoch": 0.4128885086968919, + "grad_norm": 0.859375, + "learning_rate": 4.0909090909090915e-06, + "loss": 0.0347, + "mean_token_accuracy": 0.9898446500301361, + "num_tokens": 21607715.0, + "step": 181 + }, + { + "entropy": 0.6113751977682114, + "epoch": 0.4151696606786427, + "grad_norm": 0.62890625, + "learning_rate": 4.113636363636364e-06, + "loss": 0.032, + "mean_token_accuracy": 0.9926407486200333, + "num_tokens": 21727669.0, + "step": 182 + }, + { + "entropy": 0.6115597188472748, + "epoch": 0.4174508126603935, + "grad_norm": 0.79296875, + "learning_rate": 4.136363636363637e-06, + "loss": 0.0367, + "mean_token_accuracy": 0.9892321601510048, + "num_tokens": 21847212.0, + "step": 183 + }, + { + "entropy": 0.6128954961895943, + "epoch": 0.4197319646421443, + "grad_norm": 0.63671875, + "learning_rate": 4.159090909090909e-06, + "loss": 0.0241, + "mean_token_accuracy": 0.9941311627626419, + "num_tokens": 21966143.0, + "step": 184 + }, + { + "entropy": 0.6118448004126549, + "epoch": 0.42201311662389507, + "grad_norm": 0.96875, + "learning_rate": 4.181818181818182e-06, + "loss": 0.0478, + "mean_token_accuracy": 0.9850431233644485, + "num_tokens": 22085443.0, + "step": 185 + }, + { + "entropy": 0.6099176928400993, + "epoch": 0.42429426860564584, + "grad_norm": 0.765625, + "learning_rate": 4.204545454545455e-06, + "loss": 0.0381, + "mean_token_accuracy": 0.9873407259583473, + "num_tokens": 22205138.0, + "step": 186 + }, + { + "entropy": 0.611331433057785, + "epoch": 0.4265754205873966, + "grad_norm": 0.828125, + "learning_rate": 4.227272727272728e-06, + "loss": 0.0373, + "mean_token_accuracy": 0.9899755716323853, + "num_tokens": 22324876.0, + "step": 187 + }, + { + "entropy": 0.6087890416383743, + "epoch": 0.42885657256914744, + "grad_norm": 0.94921875, + "learning_rate": 4.25e-06, + "loss": 0.0495, + "mean_token_accuracy": 0.9863386675715446, + "num_tokens": 22443520.0, + "step": 188 + }, + { + "entropy": 0.6162934675812721, + "epoch": 0.4311377245508982, + "grad_norm": 1.015625, + "learning_rate": 4.272727272727273e-06, + "loss": 0.0472, + "mean_token_accuracy": 0.986853152513504, + "num_tokens": 22562809.0, + "step": 189 + }, + { + "entropy": 0.6112047731876373, + "epoch": 0.433418876532649, + "grad_norm": 0.96875, + "learning_rate": 4.295454545454546e-06, + "loss": 0.0383, + "mean_token_accuracy": 0.9893319383263588, + "num_tokens": 22681436.0, + "step": 190 + }, + { + "entropy": 0.6135398969054222, + "epoch": 0.43570002851439976, + "grad_norm": 0.75390625, + "learning_rate": 4.3181818181818185e-06, + "loss": 0.0344, + "mean_token_accuracy": 0.9891260042786598, + "num_tokens": 22800905.0, + "step": 191 + }, + { + "entropy": 0.6095651909708977, + "epoch": 0.43798118049615054, + "grad_norm": 0.74609375, + "learning_rate": 4.340909090909091e-06, + "loss": 0.0446, + "mean_token_accuracy": 0.9909109845757484, + "num_tokens": 22920614.0, + "step": 192 + }, + { + "entropy": 0.6105966717004776, + "epoch": 0.44026233247790136, + "grad_norm": 0.8984375, + "learning_rate": 4.363636363636364e-06, + "loss": 0.0365, + "mean_token_accuracy": 0.9888356775045395, + "num_tokens": 23039967.0, + "step": 193 + }, + { + "entropy": 0.6109149008989334, + "epoch": 0.44254348445965214, + "grad_norm": 0.93359375, + "learning_rate": 4.386363636363637e-06, + "loss": 0.0445, + "mean_token_accuracy": 0.98627919703722, + "num_tokens": 23159748.0, + "step": 194 + }, + { + "entropy": 0.6152952015399933, + "epoch": 0.4448246364414029, + "grad_norm": 0.8828125, + "learning_rate": 4.409090909090909e-06, + "loss": 0.0292, + "mean_token_accuracy": 0.9923720732331276, + "num_tokens": 23279315.0, + "step": 195 + }, + { + "entropy": 0.6137658432126045, + "epoch": 0.4471057884231537, + "grad_norm": 0.9609375, + "learning_rate": 4.4318181818181824e-06, + "loss": 0.0315, + "mean_token_accuracy": 0.989959329366684, + "num_tokens": 23398597.0, + "step": 196 + }, + { + "entropy": 0.6117070913314819, + "epoch": 0.44938694040490446, + "grad_norm": 0.72265625, + "learning_rate": 4.454545454545455e-06, + "loss": 0.0317, + "mean_token_accuracy": 0.9911105185747147, + "num_tokens": 23517373.0, + "step": 197 + }, + { + "entropy": 0.6122345998883247, + "epoch": 0.4516680923866553, + "grad_norm": 0.99609375, + "learning_rate": 4.477272727272728e-06, + "loss": 0.0414, + "mean_token_accuracy": 0.9901923090219498, + "num_tokens": 23636855.0, + "step": 198 + }, + { + "entropy": 0.613350622355938, + "epoch": 0.45394924436840606, + "grad_norm": 0.8671875, + "learning_rate": 4.5e-06, + "loss": 0.0383, + "mean_token_accuracy": 0.9877027496695518, + "num_tokens": 23756527.0, + "step": 199 + }, + { + "entropy": 0.6122613921761513, + "epoch": 0.45623039635015683, + "grad_norm": 0.82421875, + "learning_rate": 4.522727272727273e-06, + "loss": 0.0335, + "mean_token_accuracy": 0.9898212254047394, + "num_tokens": 23876365.0, + "step": 200 + }, + { + "entropy": 0.6074912548065186, + "epoch": 0.4585115483319076, + "grad_norm": 0.6640625, + "learning_rate": 4.5454545454545455e-06, + "loss": 0.0292, + "mean_token_accuracy": 0.9918408766388893, + "num_tokens": 23996520.0, + "step": 201 + }, + { + "entropy": 0.6094929724931717, + "epoch": 0.4607927003136584, + "grad_norm": 1.1875, + "learning_rate": 4.568181818181819e-06, + "loss": 0.0421, + "mean_token_accuracy": 0.9861375615000725, + "num_tokens": 24116155.0, + "step": 202 + }, + { + "entropy": 0.6125953420996666, + "epoch": 0.4630738522954092, + "grad_norm": 0.81640625, + "learning_rate": 4.590909090909092e-06, + "loss": 0.0401, + "mean_token_accuracy": 0.9886692762374878, + "num_tokens": 24235301.0, + "step": 203 + }, + { + "entropy": 0.6131700202822685, + "epoch": 0.46535500427716, + "grad_norm": 0.828125, + "learning_rate": 4.613636363636364e-06, + "loss": 0.0314, + "mean_token_accuracy": 0.9896050915122032, + "num_tokens": 24355384.0, + "step": 204 + }, + { + "entropy": 0.6142597198486328, + "epoch": 0.46763615625891075, + "grad_norm": 1.1640625, + "learning_rate": 4.636363636363636e-06, + "loss": 0.0495, + "mean_token_accuracy": 0.9864913001656532, + "num_tokens": 24474736.0, + "step": 205 + }, + { + "entropy": 0.6135124862194061, + "epoch": 0.4699173082406615, + "grad_norm": 0.7109375, + "learning_rate": 4.6590909090909095e-06, + "loss": 0.0305, + "mean_token_accuracy": 0.990912102162838, + "num_tokens": 24594735.0, + "step": 206 + }, + { + "entropy": 0.6121883243322372, + "epoch": 0.4721984602224123, + "grad_norm": 0.640625, + "learning_rate": 4.681818181818183e-06, + "loss": 0.0215, + "mean_token_accuracy": 0.9936311393976212, + "num_tokens": 24714934.0, + "step": 207 + }, + { + "entropy": 0.6171885058283806, + "epoch": 0.4744796122041631, + "grad_norm": 0.8046875, + "learning_rate": 4.704545454545455e-06, + "loss": 0.0435, + "mean_token_accuracy": 0.9866737350821495, + "num_tokens": 24834831.0, + "step": 208 + }, + { + "entropy": 0.6145942360162735, + "epoch": 0.4767607641859139, + "grad_norm": 0.953125, + "learning_rate": 4.727272727272728e-06, + "loss": 0.0439, + "mean_token_accuracy": 0.9873807728290558, + "num_tokens": 24953675.0, + "step": 209 + }, + { + "entropy": 0.6117600351572037, + "epoch": 0.47904191616766467, + "grad_norm": 0.78515625, + "learning_rate": 4.75e-06, + "loss": 0.0364, + "mean_token_accuracy": 0.9890453815460205, + "num_tokens": 25072728.0, + "step": 210 + }, + { + "entropy": 0.6110787317156792, + "epoch": 0.48132306814941545, + "grad_norm": 0.68359375, + "learning_rate": 4.772727272727273e-06, + "loss": 0.0333, + "mean_token_accuracy": 0.9930140674114227, + "num_tokens": 25191554.0, + "step": 211 + }, + { + "entropy": 0.6167597323656082, + "epoch": 0.4836042201311662, + "grad_norm": 0.8203125, + "learning_rate": 4.795454545454546e-06, + "loss": 0.0394, + "mean_token_accuracy": 0.988898366689682, + "num_tokens": 25311608.0, + "step": 212 + }, + { + "entropy": 0.6091660633683205, + "epoch": 0.48588537211291705, + "grad_norm": 0.73046875, + "learning_rate": 4.818181818181819e-06, + "loss": 0.0323, + "mean_token_accuracy": 0.9917086586356163, + "num_tokens": 25430735.0, + "step": 213 + }, + { + "entropy": 0.6136890277266502, + "epoch": 0.4881665240946678, + "grad_norm": 0.9140625, + "learning_rate": 4.840909090909091e-06, + "loss": 0.0411, + "mean_token_accuracy": 0.9892996400594711, + "num_tokens": 25549752.0, + "step": 214 + }, + { + "entropy": 0.6150210723280907, + "epoch": 0.4904476760764186, + "grad_norm": 0.9140625, + "learning_rate": 4.863636363636364e-06, + "loss": 0.0416, + "mean_token_accuracy": 0.987892210483551, + "num_tokens": 25669520.0, + "step": 215 + }, + { + "entropy": 0.6180777624249458, + "epoch": 0.49272882805816937, + "grad_norm": 0.78515625, + "learning_rate": 4.8863636363636365e-06, + "loss": 0.0333, + "mean_token_accuracy": 0.9883030727505684, + "num_tokens": 25789135.0, + "step": 216 + }, + { + "entropy": 0.6149895712733269, + "epoch": 0.49500998003992014, + "grad_norm": 0.625, + "learning_rate": 4.90909090909091e-06, + "loss": 0.0257, + "mean_token_accuracy": 0.9932944625616074, + "num_tokens": 25908521.0, + "step": 217 + }, + { + "entropy": 0.6094222217798233, + "epoch": 0.49729113202167097, + "grad_norm": 0.86328125, + "learning_rate": 4.931818181818182e-06, + "loss": 0.0435, + "mean_token_accuracy": 0.9872719570994377, + "num_tokens": 26027985.0, + "step": 218 + }, + { + "entropy": 0.6239000931382179, + "epoch": 0.49957228400342174, + "grad_norm": 0.63671875, + "learning_rate": 4.954545454545455e-06, + "loss": 0.0266, + "mean_token_accuracy": 0.99217489361763, + "num_tokens": 26148445.0, + "step": 219 + }, + { + "entropy": 0.6077604964375496, + "epoch": 0.5018534359851725, + "grad_norm": 0.55078125, + "learning_rate": 4.977272727272728e-06, + "loss": 0.0223, + "mean_token_accuracy": 0.9936507567763329, + "num_tokens": 26267085.0, + "step": 220 + }, + { + "epoch": 0.5018534359851725, + "eval_entropy": 0.6166857485535481, + "eval_loss": 0.0355016253888607, + "eval_mean_token_accuracy": 0.9897458052453886, + "eval_num_tokens": 26267085.0, + "eval_runtime": 177.6251, + "eval_samples_per_second": 47.206, + "eval_steps_per_second": 1.481, + "step": 220 + }, + { + "entropy": 0.6187715381383896, + "epoch": 0.5041345879669233, + "grad_norm": 0.71484375, + "learning_rate": 5e-06, + "loss": 0.0304, + "mean_token_accuracy": 0.9892142415046692, + "num_tokens": 26386691.0, + "step": 221 + }, + { + "entropy": 0.6130868121981621, + "epoch": 0.5064157399486741, + "grad_norm": 0.7265625, + "learning_rate": 4.999999290524132e-06, + "loss": 0.033, + "mean_token_accuracy": 0.9919474571943283, + "num_tokens": 26506030.0, + "step": 222 + }, + { + "entropy": 0.618166871368885, + "epoch": 0.5086968919304249, + "grad_norm": 0.8359375, + "learning_rate": 4.999997162096932e-06, + "loss": 0.0393, + "mean_token_accuracy": 0.9902554601430893, + "num_tokens": 26625779.0, + "step": 223 + }, + { + "entropy": 0.6186254993081093, + "epoch": 0.5109780439121756, + "grad_norm": 0.77734375, + "learning_rate": 4.999993614719606e-06, + "loss": 0.039, + "mean_token_accuracy": 0.9879956096410751, + "num_tokens": 26745240.0, + "step": 224 + }, + { + "entropy": 0.6144696474075317, + "epoch": 0.5132591958939264, + "grad_norm": 0.94921875, + "learning_rate": 4.999988648394169e-06, + "loss": 0.0396, + "mean_token_accuracy": 0.9891506060957909, + "num_tokens": 26865201.0, + "step": 225 + }, + { + "entropy": 0.6158009767532349, + "epoch": 0.5155403478756773, + "grad_norm": 0.703125, + "learning_rate": 4.99998226312344e-06, + "loss": 0.0314, + "mean_token_accuracy": 0.9909180551767349, + "num_tokens": 26984311.0, + "step": 226 + }, + { + "entropy": 0.6173355579376221, + "epoch": 0.517821499857428, + "grad_norm": 0.734375, + "learning_rate": 4.999974458911041e-06, + "loss": 0.0363, + "mean_token_accuracy": 0.9874273240566254, + "num_tokens": 27103583.0, + "step": 227 + }, + { + "entropy": 0.6162387728691101, + "epoch": 0.5201026518391788, + "grad_norm": 0.8515625, + "learning_rate": 4.999965235761404e-06, + "loss": 0.043, + "mean_token_accuracy": 0.9893777891993523, + "num_tokens": 27222648.0, + "step": 228 + }, + { + "entropy": 0.6175979673862457, + "epoch": 0.5223838038209295, + "grad_norm": 0.79296875, + "learning_rate": 4.999954593679762e-06, + "loss": 0.0347, + "mean_token_accuracy": 0.9891248270869255, + "num_tokens": 27341946.0, + "step": 229 + }, + { + "entropy": 0.6163488551974297, + "epoch": 0.5246649558026804, + "grad_norm": 0.6328125, + "learning_rate": 4.999942532672157e-06, + "loss": 0.027, + "mean_token_accuracy": 0.9911708384752274, + "num_tokens": 27460786.0, + "step": 230 + }, + { + "entropy": 0.6108046844601631, + "epoch": 0.5269461077844312, + "grad_norm": 0.90625, + "learning_rate": 4.999929052745434e-06, + "loss": 0.0333, + "mean_token_accuracy": 0.9909431710839272, + "num_tokens": 27580369.0, + "step": 231 + }, + { + "entropy": 0.6116471812129021, + "epoch": 0.5292272597661819, + "grad_norm": 0.7734375, + "learning_rate": 4.999914153907243e-06, + "loss": 0.0381, + "mean_token_accuracy": 0.9896859154105186, + "num_tokens": 27699786.0, + "step": 232 + }, + { + "entropy": 0.6161288917064667, + "epoch": 0.5315084117479327, + "grad_norm": 0.6015625, + "learning_rate": 4.999897836166041e-06, + "loss": 0.0297, + "mean_token_accuracy": 0.9920491427183151, + "num_tokens": 27818736.0, + "step": 233 + }, + { + "entropy": 0.6138700991868973, + "epoch": 0.5337895637296834, + "grad_norm": 0.87890625, + "learning_rate": 4.999880099531089e-06, + "loss": 0.0391, + "mean_token_accuracy": 0.9898124188184738, + "num_tokens": 27937074.0, + "step": 234 + }, + { + "entropy": 0.6135752275586128, + "epoch": 0.5360707157114343, + "grad_norm": 0.68359375, + "learning_rate": 4.999860944012455e-06, + "loss": 0.0249, + "mean_token_accuracy": 0.9930165857076645, + "num_tokens": 28056646.0, + "step": 235 + }, + { + "entropy": 0.612888477742672, + "epoch": 0.5383518676931851, + "grad_norm": 1.015625, + "learning_rate": 4.999840369621011e-06, + "loss": 0.0371, + "mean_token_accuracy": 0.9873201325535774, + "num_tokens": 28176116.0, + "step": 236 + }, + { + "entropy": 0.6163707077503204, + "epoch": 0.5406330196749358, + "grad_norm": 0.7265625, + "learning_rate": 4.999818376368435e-06, + "loss": 0.0254, + "mean_token_accuracy": 0.9919315874576569, + "num_tokens": 28295186.0, + "step": 237 + }, + { + "entropy": 0.6170571818947792, + "epoch": 0.5429141716566867, + "grad_norm": 0.76953125, + "learning_rate": 4.999794964267208e-06, + "loss": 0.0365, + "mean_token_accuracy": 0.9886098653078079, + "num_tokens": 28414632.0, + "step": 238 + }, + { + "entropy": 0.6128563433885574, + "epoch": 0.5451953236384374, + "grad_norm": 0.78125, + "learning_rate": 4.9997701333306215e-06, + "loss": 0.0353, + "mean_token_accuracy": 0.9905475154519081, + "num_tokens": 28534507.0, + "step": 239 + }, + { + "entropy": 0.6240319088101387, + "epoch": 0.5474764756201882, + "grad_norm": 0.7578125, + "learning_rate": 4.999743883572766e-06, + "loss": 0.0375, + "mean_token_accuracy": 0.9888799786567688, + "num_tokens": 28654053.0, + "step": 240 + }, + { + "entropy": 0.6174348294734955, + "epoch": 0.549757627601939, + "grad_norm": 0.6875, + "learning_rate": 4.999716215008542e-06, + "loss": 0.031, + "mean_token_accuracy": 0.9917779788374901, + "num_tokens": 28772759.0, + "step": 241 + }, + { + "entropy": 0.6177357211709023, + "epoch": 0.5520387795836897, + "grad_norm": 0.6640625, + "learning_rate": 4.999687127653654e-06, + "loss": 0.0251, + "mean_token_accuracy": 0.9931489452719688, + "num_tokens": 28892174.0, + "step": 242 + }, + { + "entropy": 0.6171262636780739, + "epoch": 0.5543199315654406, + "grad_norm": 0.69921875, + "learning_rate": 4.99965662152461e-06, + "loss": 0.0286, + "mean_token_accuracy": 0.9911364018917084, + "num_tokens": 29012276.0, + "step": 243 + }, + { + "entropy": 0.6229426488280296, + "epoch": 0.5566010835471913, + "grad_norm": 0.6796875, + "learning_rate": 4.999624696638725e-06, + "loss": 0.0285, + "mean_token_accuracy": 0.992081768810749, + "num_tokens": 29131621.0, + "step": 244 + }, + { + "entropy": 0.6219870373606682, + "epoch": 0.5588822355289421, + "grad_norm": 0.7734375, + "learning_rate": 4.999591353014119e-06, + "loss": 0.0304, + "mean_token_accuracy": 0.9891917929053307, + "num_tokens": 29250871.0, + "step": 245 + }, + { + "entropy": 0.6164985746145248, + "epoch": 0.561163387510693, + "grad_norm": 0.86328125, + "learning_rate": 4.999556590669718e-06, + "loss": 0.0365, + "mean_token_accuracy": 0.988916739821434, + "num_tokens": 29370532.0, + "step": 246 + }, + { + "entropy": 0.6160407066345215, + "epoch": 0.5634445394924437, + "grad_norm": 0.78125, + "learning_rate": 4.999520409625253e-06, + "loss": 0.0425, + "mean_token_accuracy": 0.9887879565358162, + "num_tokens": 29489753.0, + "step": 247 + }, + { + "entropy": 0.6182164773344994, + "epoch": 0.5657256914741945, + "grad_norm": 0.67578125, + "learning_rate": 4.999482809901257e-06, + "loss": 0.0303, + "mean_token_accuracy": 0.9913942143321037, + "num_tokens": 29608880.0, + "step": 248 + }, + { + "entropy": 0.6205332577228546, + "epoch": 0.5680068434559452, + "grad_norm": 0.7109375, + "learning_rate": 4.999443791519074e-06, + "loss": 0.0336, + "mean_token_accuracy": 0.9898478239774704, + "num_tokens": 29728142.0, + "step": 249 + }, + { + "entropy": 0.6171325743198395, + "epoch": 0.570287995437696, + "grad_norm": 0.6953125, + "learning_rate": 4.999403354500847e-06, + "loss": 0.0258, + "mean_token_accuracy": 0.9924390316009521, + "num_tokens": 29847359.0, + "step": 250 + }, + { + "entropy": 0.6182190850377083, + "epoch": 0.5725691474194469, + "grad_norm": 0.9375, + "learning_rate": 4.99936149886953e-06, + "loss": 0.0289, + "mean_token_accuracy": 0.989742286503315, + "num_tokens": 29966698.0, + "step": 251 + }, + { + "entropy": 0.6218460127711296, + "epoch": 0.5748502994011976, + "grad_norm": 0.75, + "learning_rate": 4.999318224648878e-06, + "loss": 0.0359, + "mean_token_accuracy": 0.9881131649017334, + "num_tokens": 30086253.0, + "step": 252 + }, + { + "entropy": 0.6241303980350494, + "epoch": 0.5771314513829484, + "grad_norm": 0.70703125, + "learning_rate": 4.999273531863453e-06, + "loss": 0.0337, + "mean_token_accuracy": 0.9925993457436562, + "num_tokens": 30206067.0, + "step": 253 + }, + { + "entropy": 0.615654431283474, + "epoch": 0.5794126033646991, + "grad_norm": 0.81640625, + "learning_rate": 4.999227420538622e-06, + "loss": 0.0471, + "mean_token_accuracy": 0.9873010069131851, + "num_tokens": 30325767.0, + "step": 254 + }, + { + "entropy": 0.6183354258537292, + "epoch": 0.58169375534645, + "grad_norm": 0.73046875, + "learning_rate": 4.999179890700555e-06, + "loss": 0.0361, + "mean_token_accuracy": 0.9895073994994164, + "num_tokens": 30444648.0, + "step": 255 + }, + { + "entropy": 0.6166637688875198, + "epoch": 0.5839749073282008, + "grad_norm": 0.6953125, + "learning_rate": 4.999130942376232e-06, + "loss": 0.0286, + "mean_token_accuracy": 0.9902183637022972, + "num_tokens": 30563748.0, + "step": 256 + }, + { + "entropy": 0.6167695596814156, + "epoch": 0.5862560593099515, + "grad_norm": 0.703125, + "learning_rate": 4.999080575593433e-06, + "loss": 0.0232, + "mean_token_accuracy": 0.9932547882199287, + "num_tokens": 30682664.0, + "step": 257 + }, + { + "entropy": 0.6190900579094887, + "epoch": 0.5885372112917023, + "grad_norm": 0.78125, + "learning_rate": 4.999028790380746e-06, + "loss": 0.0247, + "mean_token_accuracy": 0.9931515231728554, + "num_tokens": 30801904.0, + "step": 258 + }, + { + "entropy": 0.6241631060838699, + "epoch": 0.590818363273453, + "grad_norm": 0.609375, + "learning_rate": 4.9989755867675635e-06, + "loss": 0.0354, + "mean_token_accuracy": 0.9898207560181618, + "num_tokens": 30921249.0, + "step": 259 + }, + { + "entropy": 0.6214651986956596, + "epoch": 0.5930995152552039, + "grad_norm": 0.765625, + "learning_rate": 4.998920964784082e-06, + "loss": 0.0345, + "mean_token_accuracy": 0.9897768124938011, + "num_tokens": 31040646.0, + "step": 260 + }, + { + "entropy": 0.6230965703725815, + "epoch": 0.5953806672369547, + "grad_norm": 0.546875, + "learning_rate": 4.998864924461305e-06, + "loss": 0.0224, + "mean_token_accuracy": 0.9911249652504921, + "num_tokens": 31159597.0, + "step": 261 + }, + { + "entropy": 0.6213866919279099, + "epoch": 0.5976618192187054, + "grad_norm": 0.63671875, + "learning_rate": 4.998807465831039e-06, + "loss": 0.0355, + "mean_token_accuracy": 0.9902068078517914, + "num_tokens": 31279037.0, + "step": 262 + }, + { + "entropy": 0.6219976097345352, + "epoch": 0.5999429712004563, + "grad_norm": 0.6796875, + "learning_rate": 4.998748588925897e-06, + "loss": 0.0303, + "mean_token_accuracy": 0.9906950816512108, + "num_tokens": 31398398.0, + "step": 263 + }, + { + "entropy": 0.619347982108593, + "epoch": 0.602224123182207, + "grad_norm": 0.7109375, + "learning_rate": 4.998688293779297e-06, + "loss": 0.0287, + "mean_token_accuracy": 0.9914028570055962, + "num_tokens": 31518164.0, + "step": 264 + }, + { + "entropy": 0.6249741539359093, + "epoch": 0.6045052751639578, + "grad_norm": 0.63671875, + "learning_rate": 4.998626580425459e-06, + "loss": 0.0295, + "mean_token_accuracy": 0.9911616072058678, + "num_tokens": 31637612.0, + "step": 265 + }, + { + "entropy": 0.6189699470996857, + "epoch": 0.6067864271457086, + "grad_norm": 0.8359375, + "learning_rate": 4.998563448899413e-06, + "loss": 0.0383, + "mean_token_accuracy": 0.9867794513702393, + "num_tokens": 31757186.0, + "step": 266 + }, + { + "entropy": 0.6203187555074692, + "epoch": 0.6090675791274593, + "grad_norm": 0.63671875, + "learning_rate": 4.998498899236989e-06, + "loss": 0.0324, + "mean_token_accuracy": 0.9913295954465866, + "num_tokens": 31876905.0, + "step": 267 + }, + { + "entropy": 0.6222784593701363, + "epoch": 0.6113487311092102, + "grad_norm": 0.71484375, + "learning_rate": 4.998432931474825e-06, + "loss": 0.0299, + "mean_token_accuracy": 0.9907274171710014, + "num_tokens": 31996344.0, + "step": 268 + }, + { + "entropy": 0.6224788203835487, + "epoch": 0.6136298830909609, + "grad_norm": 0.72265625, + "learning_rate": 4.998365545650365e-06, + "loss": 0.0304, + "mean_token_accuracy": 0.9900897741317749, + "num_tokens": 32116881.0, + "step": 269 + }, + { + "entropy": 0.6243728622794151, + "epoch": 0.6159110350727117, + "grad_norm": 0.83984375, + "learning_rate": 4.998296741801852e-06, + "loss": 0.0344, + "mean_token_accuracy": 0.9895648658275604, + "num_tokens": 32235654.0, + "step": 270 + }, + { + "entropy": 0.6188279837369919, + "epoch": 0.6181921870544625, + "grad_norm": 0.7578125, + "learning_rate": 4.998226519968341e-06, + "loss": 0.0277, + "mean_token_accuracy": 0.9915757179260254, + "num_tokens": 32354762.0, + "step": 271 + }, + { + "entropy": 0.6232392713427544, + "epoch": 0.6204733390362133, + "grad_norm": 0.76171875, + "learning_rate": 4.998154880189688e-06, + "loss": 0.024, + "mean_token_accuracy": 0.9917184114456177, + "num_tokens": 32474107.0, + "step": 272 + }, + { + "entropy": 0.6230553910136223, + "epoch": 0.6227544910179641, + "grad_norm": 0.6796875, + "learning_rate": 4.998081822506552e-06, + "loss": 0.0236, + "mean_token_accuracy": 0.9935934320092201, + "num_tokens": 32593765.0, + "step": 273 + }, + { + "entropy": 0.6298941746354103, + "epoch": 0.6250356429997148, + "grad_norm": 0.65625, + "learning_rate": 4.998007346960402e-06, + "loss": 0.0266, + "mean_token_accuracy": 0.9926442205905914, + "num_tokens": 32713759.0, + "step": 274 + }, + { + "entropy": 0.6233949437737465, + "epoch": 0.6273167949814656, + "grad_norm": 0.62890625, + "learning_rate": 4.997931453593507e-06, + "loss": 0.0306, + "mean_token_accuracy": 0.9919460639357567, + "num_tokens": 32833246.0, + "step": 275 + }, + { + "entropy": 0.6247470676898956, + "epoch": 0.6295979469632165, + "grad_norm": 0.77734375, + "learning_rate": 4.997854142448944e-06, + "loss": 0.0337, + "mean_token_accuracy": 0.9905920177698135, + "num_tokens": 32952715.0, + "step": 276 + }, + { + "entropy": 0.6210713386535645, + "epoch": 0.6318790989449672, + "grad_norm": 0.765625, + "learning_rate": 4.997775413570593e-06, + "loss": 0.0391, + "mean_token_accuracy": 0.9893451631069183, + "num_tokens": 33072651.0, + "step": 277 + }, + { + "entropy": 0.6217482313513756, + "epoch": 0.634160250926718, + "grad_norm": 0.71875, + "learning_rate": 4.997695267003139e-06, + "loss": 0.0249, + "mean_token_accuracy": 0.9912695586681366, + "num_tokens": 33191765.0, + "step": 278 + }, + { + "entropy": 0.6206020340323448, + "epoch": 0.6364414029084687, + "grad_norm": 0.94140625, + "learning_rate": 4.99761370279207e-06, + "loss": 0.0355, + "mean_token_accuracy": 0.9895498529076576, + "num_tokens": 33310156.0, + "step": 279 + }, + { + "entropy": 0.6181144714355469, + "epoch": 0.6387225548902196, + "grad_norm": 0.76953125, + "learning_rate": 4.997530720983682e-06, + "loss": 0.0333, + "mean_token_accuracy": 0.9894939661026001, + "num_tokens": 33429808.0, + "step": 280 + }, + { + "entropy": 0.6218525841832161, + "epoch": 0.6410037068719704, + "grad_norm": 0.92578125, + "learning_rate": 4.9974463216250735e-06, + "loss": 0.0314, + "mean_token_accuracy": 0.991098515689373, + "num_tokens": 33549220.0, + "step": 281 + }, + { + "entropy": 0.621181808412075, + "epoch": 0.6432848588537211, + "grad_norm": 0.921875, + "learning_rate": 4.997360504764148e-06, + "loss": 0.0365, + "mean_token_accuracy": 0.9885584115982056, + "num_tokens": 33668394.0, + "step": 282 + }, + { + "entropy": 0.6217759549617767, + "epoch": 0.6455660108354719, + "grad_norm": 0.8203125, + "learning_rate": 4.997273270449614e-06, + "loss": 0.0296, + "mean_token_accuracy": 0.992251567542553, + "num_tokens": 33787863.0, + "step": 283 + }, + { + "entropy": 0.625351220369339, + "epoch": 0.6478471628172227, + "grad_norm": 0.80078125, + "learning_rate": 4.997184618730983e-06, + "loss": 0.0368, + "mean_token_accuracy": 0.9882866144180298, + "num_tokens": 33907818.0, + "step": 284 + }, + { + "entropy": 0.6167212650179863, + "epoch": 0.6501283147989735, + "grad_norm": 0.7421875, + "learning_rate": 4.997094549658572e-06, + "loss": 0.0334, + "mean_token_accuracy": 0.9898555502295494, + "num_tokens": 34028297.0, + "step": 285 + }, + { + "entropy": 0.6235809996724129, + "epoch": 0.6524094667807243, + "grad_norm": 0.7734375, + "learning_rate": 4.997003063283503e-06, + "loss": 0.0408, + "mean_token_accuracy": 0.9875974059104919, + "num_tokens": 34147211.0, + "step": 286 + }, + { + "entropy": 0.6180080771446228, + "epoch": 0.654690618762475, + "grad_norm": 0.83984375, + "learning_rate": 4.996910159657703e-06, + "loss": 0.0364, + "mean_token_accuracy": 0.9897789135575294, + "num_tokens": 34267187.0, + "step": 287 + }, + { + "entropy": 0.6233905479311943, + "epoch": 0.6569717707442259, + "grad_norm": 0.8515625, + "learning_rate": 4.996815838833899e-06, + "loss": 0.0456, + "mean_token_accuracy": 0.9868253543972969, + "num_tokens": 34387439.0, + "step": 288 + }, + { + "entropy": 0.6206233724951744, + "epoch": 0.6592529227259766, + "grad_norm": 0.6953125, + "learning_rate": 4.99672010086563e-06, + "loss": 0.0258, + "mean_token_accuracy": 0.9914453700184822, + "num_tokens": 34507591.0, + "step": 289 + }, + { + "entropy": 0.6227139756083488, + "epoch": 0.6615340747077274, + "grad_norm": 0.76171875, + "learning_rate": 4.996622945807231e-06, + "loss": 0.0287, + "mean_token_accuracy": 0.9916034564375877, + "num_tokens": 34626957.0, + "step": 290 + }, + { + "entropy": 0.6283985748887062, + "epoch": 0.6638152266894782, + "grad_norm": 0.7421875, + "learning_rate": 4.996524373713848e-06, + "loss": 0.0281, + "mean_token_accuracy": 0.992324523627758, + "num_tokens": 34747352.0, + "step": 291 + }, + { + "entropy": 0.6219638139009476, + "epoch": 0.666096378671229, + "grad_norm": 1.03125, + "learning_rate": 4.996424384641428e-06, + "loss": 0.0332, + "mean_token_accuracy": 0.9892084151506424, + "num_tokens": 34866484.0, + "step": 292 + }, + { + "entropy": 0.6235440447926521, + "epoch": 0.6683775306529798, + "grad_norm": 0.609375, + "learning_rate": 4.996322978646722e-06, + "loss": 0.032, + "mean_token_accuracy": 0.989060677587986, + "num_tokens": 34985945.0, + "step": 293 + }, + { + "entropy": 0.6248641386628151, + "epoch": 0.6706586826347305, + "grad_norm": 0.6484375, + "learning_rate": 4.996220155787287e-06, + "loss": 0.0274, + "mean_token_accuracy": 0.9939559176564217, + "num_tokens": 35105331.0, + "step": 294 + }, + { + "entropy": 0.6268787086009979, + "epoch": 0.6729398346164813, + "grad_norm": 0.71484375, + "learning_rate": 4.996115916121483e-06, + "loss": 0.0357, + "mean_token_accuracy": 0.9910590127110481, + "num_tokens": 35224973.0, + "step": 295 + }, + { + "entropy": 0.6267981678247452, + "epoch": 0.6752209865982322, + "grad_norm": 0.734375, + "learning_rate": 4.996010259708475e-06, + "loss": 0.0388, + "mean_token_accuracy": 0.9882482066750526, + "num_tokens": 35344237.0, + "step": 296 + }, + { + "entropy": 0.6264387667179108, + "epoch": 0.6775021385799829, + "grad_norm": 0.68359375, + "learning_rate": 4.99590318660823e-06, + "loss": 0.0292, + "mean_token_accuracy": 0.9921920374035835, + "num_tokens": 35464109.0, + "step": 297 + }, + { + "entropy": 0.6294894739985466, + "epoch": 0.6797832905617337, + "grad_norm": 0.78515625, + "learning_rate": 4.9957946968815215e-06, + "loss": 0.0312, + "mean_token_accuracy": 0.9908606261014938, + "num_tokens": 35583193.0, + "step": 298 + }, + { + "entropy": 0.6286262199282646, + "epoch": 0.6820644425434844, + "grad_norm": 0.58203125, + "learning_rate": 4.995684790589927e-06, + "loss": 0.0302, + "mean_token_accuracy": 0.9924826696515083, + "num_tokens": 35702587.0, + "step": 299 + }, + { + "entropy": 0.6250946968793869, + "epoch": 0.6843455945252352, + "grad_norm": 0.76953125, + "learning_rate": 4.995573467795825e-06, + "loss": 0.0335, + "mean_token_accuracy": 0.9894522354006767, + "num_tokens": 35822364.0, + "step": 300 + }, + { + "entropy": 0.6303327158093452, + "epoch": 0.6866267465069861, + "grad_norm": 0.78515625, + "learning_rate": 4.995460728562403e-06, + "loss": 0.0321, + "mean_token_accuracy": 0.9913279265165329, + "num_tokens": 35941607.0, + "step": 301 + }, + { + "entropy": 0.6272570788860321, + "epoch": 0.6889078984887368, + "grad_norm": 0.65625, + "learning_rate": 4.9953465729536475e-06, + "loss": 0.0231, + "mean_token_accuracy": 0.9925729483366013, + "num_tokens": 36060891.0, + "step": 302 + }, + { + "entropy": 0.6217986792325974, + "epoch": 0.6911890504704876, + "grad_norm": 0.58203125, + "learning_rate": 4.995231001034352e-06, + "loss": 0.0276, + "mean_token_accuracy": 0.9900646954774857, + "num_tokens": 36179679.0, + "step": 303 + }, + { + "entropy": 0.628930889070034, + "epoch": 0.6934702024522383, + "grad_norm": 0.53125, + "learning_rate": 4.995114012870112e-06, + "loss": 0.0276, + "mean_token_accuracy": 0.9927457273006439, + "num_tokens": 36299083.0, + "step": 304 + }, + { + "entropy": 0.6250144615769386, + "epoch": 0.6957513544339892, + "grad_norm": 0.87890625, + "learning_rate": 4.99499560852733e-06, + "loss": 0.0433, + "mean_token_accuracy": 0.9897830933332443, + "num_tokens": 36418482.0, + "step": 305 + }, + { + "entropy": 0.6293129399418831, + "epoch": 0.69803250641574, + "grad_norm": 0.65234375, + "learning_rate": 4.994875788073207e-06, + "loss": 0.0319, + "mean_token_accuracy": 0.9918276891112328, + "num_tokens": 36538612.0, + "step": 306 + }, + { + "entropy": 0.6300635784864426, + "epoch": 0.7003136583974907, + "grad_norm": 0.69140625, + "learning_rate": 4.994754551575752e-06, + "loss": 0.0297, + "mean_token_accuracy": 0.9919877350330353, + "num_tokens": 36657422.0, + "step": 307 + }, + { + "entropy": 0.6314110085368156, + "epoch": 0.7025948103792415, + "grad_norm": 0.70703125, + "learning_rate": 4.994631899103777e-06, + "loss": 0.0264, + "mean_token_accuracy": 0.9912672117352486, + "num_tokens": 36776795.0, + "step": 308 + }, + { + "entropy": 0.6292708441615105, + "epoch": 0.7048759623609923, + "grad_norm": 0.68359375, + "learning_rate": 4.9945078307268974e-06, + "loss": 0.0335, + "mean_token_accuracy": 0.991038903594017, + "num_tokens": 36895668.0, + "step": 309 + }, + { + "entropy": 0.6292010992765427, + "epoch": 0.7071571143427431, + "grad_norm": 0.7734375, + "learning_rate": 4.994382346515531e-06, + "loss": 0.0242, + "mean_token_accuracy": 0.9932238236069679, + "num_tokens": 37015549.0, + "step": 310 + }, + { + "entropy": 0.6271852403879166, + "epoch": 0.7094382663244939, + "grad_norm": 0.75, + "learning_rate": 4.9942554465409e-06, + "loss": 0.0247, + "mean_token_accuracy": 0.9923034831881523, + "num_tokens": 37134278.0, + "step": 311 + }, + { + "entropy": 0.6301863566040993, + "epoch": 0.7117194183062446, + "grad_norm": 0.62109375, + "learning_rate": 4.994127130875032e-06, + "loss": 0.0257, + "mean_token_accuracy": 0.9917689710855484, + "num_tokens": 37253603.0, + "step": 312 + }, + { + "entropy": 0.6275805309414864, + "epoch": 0.7140005702879955, + "grad_norm": 0.73828125, + "learning_rate": 4.993997399590755e-06, + "loss": 0.0351, + "mean_token_accuracy": 0.9897992387413979, + "num_tokens": 37372932.0, + "step": 313 + }, + { + "entropy": 0.6335499510169029, + "epoch": 0.7162817222697462, + "grad_norm": 0.578125, + "learning_rate": 4.993866252761702e-06, + "loss": 0.0219, + "mean_token_accuracy": 0.9937060624361038, + "num_tokens": 37492041.0, + "step": 314 + }, + { + "entropy": 0.6351433545351028, + "epoch": 0.718562874251497, + "grad_norm": 0.87109375, + "learning_rate": 4.993733690462311e-06, + "loss": 0.0257, + "mean_token_accuracy": 0.9912951961159706, + "num_tokens": 37611310.0, + "step": 315 + }, + { + "entropy": 0.6320750117301941, + "epoch": 0.7208440262332478, + "grad_norm": 0.6328125, + "learning_rate": 4.99359971276782e-06, + "loss": 0.0232, + "mean_token_accuracy": 0.9912988618016243, + "num_tokens": 37730659.0, + "step": 316 + }, + { + "entropy": 0.6239347383379936, + "epoch": 0.7231251782149986, + "grad_norm": 0.56640625, + "learning_rate": 4.993464319754273e-06, + "loss": 0.0254, + "mean_token_accuracy": 0.9906005337834358, + "num_tokens": 37849754.0, + "step": 317 + }, + { + "entropy": 0.6348524391651154, + "epoch": 0.7254063301967494, + "grad_norm": 0.609375, + "learning_rate": 4.993327511498516e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.9938281625509262, + "num_tokens": 37969283.0, + "step": 318 + }, + { + "entropy": 0.6262785270810127, + "epoch": 0.7276874821785001, + "grad_norm": 0.64453125, + "learning_rate": 4.9931892880782e-06, + "loss": 0.0309, + "mean_token_accuracy": 0.9926499500870705, + "num_tokens": 38088575.0, + "step": 319 + }, + { + "entropy": 0.6276945248246193, + "epoch": 0.7299686341602509, + "grad_norm": 0.6328125, + "learning_rate": 4.993049649571775e-06, + "loss": 0.0262, + "mean_token_accuracy": 0.9919878020882607, + "num_tokens": 38208057.0, + "step": 320 + }, + { + "entropy": 0.6286417320370674, + "epoch": 0.7322497861420018, + "grad_norm": 0.72265625, + "learning_rate": 4.992908596058501e-06, + "loss": 0.0351, + "mean_token_accuracy": 0.9885993376374245, + "num_tokens": 38327572.0, + "step": 321 + }, + { + "entropy": 0.629097543656826, + "epoch": 0.7345309381237525, + "grad_norm": 0.68359375, + "learning_rate": 4.992766127618434e-06, + "loss": 0.0339, + "mean_token_accuracy": 0.9891510456800461, + "num_tokens": 38446741.0, + "step": 322 + }, + { + "entropy": 0.6339373365044594, + "epoch": 0.7368120901055033, + "grad_norm": 0.75390625, + "learning_rate": 4.992622244332439e-06, + "loss": 0.0316, + "mean_token_accuracy": 0.9906061366200447, + "num_tokens": 38565680.0, + "step": 323 + }, + { + "entropy": 0.6267269998788834, + "epoch": 0.739093242087254, + "grad_norm": 0.62890625, + "learning_rate": 4.992476946282179e-06, + "loss": 0.0276, + "mean_token_accuracy": 0.9913189113140106, + "num_tokens": 38685293.0, + "step": 324 + }, + { + "entropy": 0.6283107399940491, + "epoch": 0.7413743940690048, + "grad_norm": 0.63671875, + "learning_rate": 4.992330233550124e-06, + "loss": 0.0281, + "mean_token_accuracy": 0.9920041486620903, + "num_tokens": 38804713.0, + "step": 325 + }, + { + "entropy": 0.6292331963777542, + "epoch": 0.7436555460507557, + "grad_norm": 0.61328125, + "learning_rate": 4.9921821062195445e-06, + "loss": 0.0223, + "mean_token_accuracy": 0.992826372385025, + "num_tokens": 38923330.0, + "step": 326 + }, + { + "entropy": 0.6240185052156448, + "epoch": 0.7459366980325064, + "grad_norm": 0.63671875, + "learning_rate": 4.9920325643745145e-06, + "loss": 0.0271, + "mean_token_accuracy": 0.9905824810266495, + "num_tokens": 39042768.0, + "step": 327 + }, + { + "entropy": 0.6285588145256042, + "epoch": 0.7482178500142572, + "grad_norm": 0.55859375, + "learning_rate": 4.991881608099912e-06, + "loss": 0.0255, + "mean_token_accuracy": 0.991863988339901, + "num_tokens": 39162565.0, + "step": 328 + }, + { + "entropy": 0.6296004801988602, + "epoch": 0.7504990019960079, + "grad_norm": 0.72265625, + "learning_rate": 4.991729237481417e-06, + "loss": 0.0331, + "mean_token_accuracy": 0.9905829802155495, + "num_tokens": 39281780.0, + "step": 329 + }, + { + "entropy": 0.6297445818781853, + "epoch": 0.7527801539777588, + "grad_norm": 0.703125, + "learning_rate": 4.991575452605511e-06, + "loss": 0.0311, + "mean_token_accuracy": 0.989808015525341, + "num_tokens": 39401311.0, + "step": 330 + }, + { + "entropy": 0.6319060102105141, + "epoch": 0.7550613059595096, + "grad_norm": 1.0703125, + "learning_rate": 4.9914202535594795e-06, + "loss": 0.0364, + "mean_token_accuracy": 0.988189198076725, + "num_tokens": 39520415.0, + "step": 331 + }, + { + "entropy": 0.6275013089179993, + "epoch": 0.7573424579412603, + "grad_norm": 0.6328125, + "learning_rate": 4.991263640431411e-06, + "loss": 0.0288, + "mean_token_accuracy": 0.9902486503124237, + "num_tokens": 39639927.0, + "step": 332 + }, + { + "entropy": 0.6285593658685684, + "epoch": 0.7596236099230111, + "grad_norm": 0.6015625, + "learning_rate": 4.9911056133101965e-06, + "loss": 0.0291, + "mean_token_accuracy": 0.992401234805584, + "num_tokens": 39759007.0, + "step": 333 + }, + { + "entropy": 0.6343879401683807, + "epoch": 0.7619047619047619, + "grad_norm": 0.875, + "learning_rate": 4.990946172285528e-06, + "loss": 0.0291, + "mean_token_accuracy": 0.9910958483815193, + "num_tokens": 39878299.0, + "step": 334 + }, + { + "entropy": 0.6331684067845345, + "epoch": 0.7641859138865127, + "grad_norm": 0.5078125, + "learning_rate": 4.990785317447901e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.993746742606163, + "num_tokens": 39997793.0, + "step": 335 + }, + { + "entropy": 0.6350944265723228, + "epoch": 0.7664670658682635, + "grad_norm": 0.68359375, + "learning_rate": 4.990623048888615e-06, + "loss": 0.0332, + "mean_token_accuracy": 0.9876480028033257, + "num_tokens": 40116816.0, + "step": 336 + }, + { + "entropy": 0.6304894685745239, + "epoch": 0.7687482178500142, + "grad_norm": 0.9609375, + "learning_rate": 4.9904593666997704e-06, + "loss": 0.0458, + "mean_token_accuracy": 0.9874863177537918, + "num_tokens": 40235855.0, + "step": 337 + }, + { + "entropy": 0.6315995007753372, + "epoch": 0.7710293698317651, + "grad_norm": 0.765625, + "learning_rate": 4.990294270974268e-06, + "loss": 0.0332, + "mean_token_accuracy": 0.9903906658291817, + "num_tokens": 40355182.0, + "step": 338 + }, + { + "entropy": 0.627842590212822, + "epoch": 0.7733105218135158, + "grad_norm": 0.6328125, + "learning_rate": 4.990127761805816e-06, + "loss": 0.0287, + "mean_token_accuracy": 0.9924337193369865, + "num_tokens": 40473558.0, + "step": 339 + }, + { + "entropy": 0.635308712720871, + "epoch": 0.7755916737952666, + "grad_norm": 0.71875, + "learning_rate": 4.989959839288919e-06, + "loss": 0.0282, + "mean_token_accuracy": 0.9901068136096001, + "num_tokens": 40593753.0, + "step": 340 + }, + { + "entropy": 0.630740687251091, + "epoch": 0.7778728257770174, + "grad_norm": 0.58984375, + "learning_rate": 4.989790503518888e-06, + "loss": 0.0333, + "mean_token_accuracy": 0.98983483761549, + "num_tokens": 40713114.0, + "step": 341 + }, + { + "entropy": 0.6332992985844612, + "epoch": 0.7801539777587682, + "grad_norm": 0.6328125, + "learning_rate": 4.9896197545918345e-06, + "loss": 0.0332, + "mean_token_accuracy": 0.9899295046925545, + "num_tokens": 40832953.0, + "step": 342 + }, + { + "entropy": 0.6373177543282509, + "epoch": 0.782435129740519, + "grad_norm": 0.609375, + "learning_rate": 4.989447592604673e-06, + "loss": 0.0243, + "mean_token_accuracy": 0.9929884001612663, + "num_tokens": 40952803.0, + "step": 343 + }, + { + "entropy": 0.6320662945508957, + "epoch": 0.7847162817222697, + "grad_norm": 0.470703125, + "learning_rate": 4.989274017655117e-06, + "loss": 0.0173, + "mean_token_accuracy": 0.9946706518530846, + "num_tokens": 41072210.0, + "step": 344 + }, + { + "entropy": 0.6310200020670891, + "epoch": 0.7869974337040205, + "grad_norm": 0.6484375, + "learning_rate": 4.989099029841687e-06, + "loss": 0.0258, + "mean_token_accuracy": 0.989687442779541, + "num_tokens": 41190985.0, + "step": 345 + }, + { + "entropy": 0.6312302127480507, + "epoch": 0.7892785856857714, + "grad_norm": 0.71484375, + "learning_rate": 4.988922629263701e-06, + "loss": 0.0381, + "mean_token_accuracy": 0.9886831492185593, + "num_tokens": 41309913.0, + "step": 346 + }, + { + "entropy": 0.6356581896543503, + "epoch": 0.7915597376675221, + "grad_norm": 0.6328125, + "learning_rate": 4.988744816021283e-06, + "loss": 0.0372, + "mean_token_accuracy": 0.9891534447669983, + "num_tokens": 41430025.0, + "step": 347 + }, + { + "entropy": 0.6311512738466263, + "epoch": 0.7938408896492729, + "grad_norm": 0.70703125, + "learning_rate": 4.988565590215352e-06, + "loss": 0.0308, + "mean_token_accuracy": 0.9901867583394051, + "num_tokens": 41549153.0, + "step": 348 + }, + { + "entropy": 0.6334821730852127, + "epoch": 0.7961220416310236, + "grad_norm": 0.62109375, + "learning_rate": 4.9883849519476364e-06, + "loss": 0.0321, + "mean_token_accuracy": 0.9912395998835564, + "num_tokens": 41668733.0, + "step": 349 + }, + { + "entropy": 0.633660726249218, + "epoch": 0.7984031936127745, + "grad_norm": 0.609375, + "learning_rate": 4.988202901320663e-06, + "loss": 0.0281, + "mean_token_accuracy": 0.9917131811380386, + "num_tokens": 41788355.0, + "step": 350 + }, + { + "entropy": 0.6264280527830124, + "epoch": 0.8006843455945253, + "grad_norm": 0.67578125, + "learning_rate": 4.988019438437759e-06, + "loss": 0.0245, + "mean_token_accuracy": 0.9931059330701828, + "num_tokens": 41907129.0, + "step": 351 + }, + { + "entropy": 0.6357913464307785, + "epoch": 0.802965497576276, + "grad_norm": 0.49609375, + "learning_rate": 4.987834563403055e-06, + "loss": 0.0193, + "mean_token_accuracy": 0.9948461428284645, + "num_tokens": 42027495.0, + "step": 352 + }, + { + "entropy": 0.6331300735473633, + "epoch": 0.8052466495580268, + "grad_norm": 0.71484375, + "learning_rate": 4.987648276321482e-06, + "loss": 0.0323, + "mean_token_accuracy": 0.9900515675544739, + "num_tokens": 42146126.0, + "step": 353 + }, + { + "entropy": 0.6349668279290199, + "epoch": 0.8075278015397775, + "grad_norm": 0.71484375, + "learning_rate": 4.987460577298774e-06, + "loss": 0.0274, + "mean_token_accuracy": 0.9915566816926003, + "num_tokens": 42265838.0, + "step": 354 + }, + { + "entropy": 0.6288145929574966, + "epoch": 0.8098089535215284, + "grad_norm": 0.7421875, + "learning_rate": 4.9872714664414635e-06, + "loss": 0.0295, + "mean_token_accuracy": 0.9920939430594444, + "num_tokens": 42384289.0, + "step": 355 + }, + { + "entropy": 0.6304789409041405, + "epoch": 0.8120901055032792, + "grad_norm": 0.625, + "learning_rate": 4.987080943856887e-06, + "loss": 0.0287, + "mean_token_accuracy": 0.9920516312122345, + "num_tokens": 42503208.0, + "step": 356 + }, + { + "entropy": 0.6340020447969437, + "epoch": 0.8143712574850299, + "grad_norm": 0.75390625, + "learning_rate": 4.986889009653183e-06, + "loss": 0.0269, + "mean_token_accuracy": 0.9925616234540939, + "num_tokens": 42623051.0, + "step": 357 + }, + { + "entropy": 0.6323644071817398, + "epoch": 0.8166524094667807, + "grad_norm": 0.625, + "learning_rate": 4.986695663939288e-06, + "loss": 0.0253, + "mean_token_accuracy": 0.9930807650089264, + "num_tokens": 42742589.0, + "step": 358 + }, + { + "entropy": 0.6344643980264664, + "epoch": 0.8189335614485315, + "grad_norm": 0.51953125, + "learning_rate": 4.986500906824942e-06, + "loss": 0.02, + "mean_token_accuracy": 0.9953058287501335, + "num_tokens": 42861772.0, + "step": 359 + }, + { + "entropy": 0.6348003447055817, + "epoch": 0.8212147134302823, + "grad_norm": 0.66796875, + "learning_rate": 4.986304738420684e-06, + "loss": 0.03, + "mean_token_accuracy": 0.9894232228398323, + "num_tokens": 42981067.0, + "step": 360 + }, + { + "entropy": 0.6325874924659729, + "epoch": 0.8234958654120331, + "grad_norm": 0.71484375, + "learning_rate": 4.9861071588378565e-06, + "loss": 0.03, + "mean_token_accuracy": 0.9924087449908257, + "num_tokens": 43100280.0, + "step": 361 + }, + { + "entropy": 0.6260631904006004, + "epoch": 0.8257770173937838, + "grad_norm": 0.51953125, + "learning_rate": 4.985908168188602e-06, + "loss": 0.025, + "mean_token_accuracy": 0.9923560544848442, + "num_tokens": 43219373.0, + "step": 362 + }, + { + "entropy": 0.6274162903428078, + "epoch": 0.8280581693755347, + "grad_norm": 0.74609375, + "learning_rate": 4.985707766585865e-06, + "loss": 0.0305, + "mean_token_accuracy": 0.9918733611702919, + "num_tokens": 43338310.0, + "step": 363 + }, + { + "entropy": 0.6347053945064545, + "epoch": 0.8303393213572854, + "grad_norm": 0.5625, + "learning_rate": 4.985505954143387e-06, + "loss": 0.0255, + "mean_token_accuracy": 0.9923129454255104, + "num_tokens": 43457859.0, + "step": 364 + }, + { + "entropy": 0.6303003653883934, + "epoch": 0.8326204733390362, + "grad_norm": 0.7265625, + "learning_rate": 4.985302730975713e-06, + "loss": 0.029, + "mean_token_accuracy": 0.9918788895010948, + "num_tokens": 43577270.0, + "step": 365 + }, + { + "entropy": 0.635373093187809, + "epoch": 0.834901625320787, + "grad_norm": 0.68359375, + "learning_rate": 4.9850980971981914e-06, + "loss": 0.0344, + "mean_token_accuracy": 0.9904117211699486, + "num_tokens": 43696251.0, + "step": 366 + }, + { + "entropy": 0.6313088983297348, + "epoch": 0.8371827773025378, + "grad_norm": 0.6484375, + "learning_rate": 4.984892052926965e-06, + "loss": 0.0307, + "mean_token_accuracy": 0.9903543144464493, + "num_tokens": 43815516.0, + "step": 367 + }, + { + "entropy": 0.6313188746571541, + "epoch": 0.8394639292842886, + "grad_norm": 0.61328125, + "learning_rate": 4.984684598278982e-06, + "loss": 0.0256, + "mean_token_accuracy": 0.9921409264206886, + "num_tokens": 43934382.0, + "step": 368 + }, + { + "entropy": 0.6344654932618141, + "epoch": 0.8417450812660393, + "grad_norm": 0.6640625, + "learning_rate": 4.984475733371991e-06, + "loss": 0.0311, + "mean_token_accuracy": 0.9895618408918381, + "num_tokens": 44054135.0, + "step": 369 + }, + { + "entropy": 0.6296713724732399, + "epoch": 0.8440262332477901, + "grad_norm": 0.875, + "learning_rate": 4.984265458324538e-06, + "loss": 0.0444, + "mean_token_accuracy": 0.9878969714045525, + "num_tokens": 44172909.0, + "step": 370 + }, + { + "entropy": 0.6305619925260544, + "epoch": 0.846307385229541, + "grad_norm": 0.71484375, + "learning_rate": 4.984053773255971e-06, + "loss": 0.0282, + "mean_token_accuracy": 0.9922169148921967, + "num_tokens": 44292823.0, + "step": 371 + }, + { + "entropy": 0.6333710178732872, + "epoch": 0.8485885372112917, + "grad_norm": 0.6875, + "learning_rate": 4.9838406782864394e-06, + "loss": 0.0188, + "mean_token_accuracy": 0.9942256286740303, + "num_tokens": 44412311.0, + "step": 372 + }, + { + "entropy": 0.6325075179338455, + "epoch": 0.8508696891930425, + "grad_norm": 0.5625, + "learning_rate": 4.983626173536891e-06, + "loss": 0.0213, + "mean_token_accuracy": 0.9945259541273117, + "num_tokens": 44531839.0, + "step": 373 + }, + { + "entropy": 0.6389484703540802, + "epoch": 0.8531508411747932, + "grad_norm": 0.66015625, + "learning_rate": 4.983410259129075e-06, + "loss": 0.0283, + "mean_token_accuracy": 0.99203772097826, + "num_tokens": 44651089.0, + "step": 374 + }, + { + "entropy": 0.6317842677235603, + "epoch": 0.8554319931565441, + "grad_norm": 0.7265625, + "learning_rate": 4.983192935185539e-06, + "loss": 0.0306, + "mean_token_accuracy": 0.9909524023532867, + "num_tokens": 44769904.0, + "step": 375 + }, + { + "entropy": 0.6322771683335304, + "epoch": 0.8577131451382949, + "grad_norm": 0.59375, + "learning_rate": 4.9829742018296335e-06, + "loss": 0.0268, + "mean_token_accuracy": 0.9904268234968185, + "num_tokens": 44889363.0, + "step": 376 + }, + { + "entropy": 0.6292531788349152, + "epoch": 0.8599942971200456, + "grad_norm": 0.5, + "learning_rate": 4.9827540591855064e-06, + "loss": 0.0215, + "mean_token_accuracy": 0.993001751601696, + "num_tokens": 45008489.0, + "step": 377 + }, + { + "entropy": 0.6332198604941368, + "epoch": 0.8622754491017964, + "grad_norm": 3.484375, + "learning_rate": 4.9825325073781075e-06, + "loss": 0.0285, + "mean_token_accuracy": 0.9898903071880341, + "num_tokens": 45127701.0, + "step": 378 + }, + { + "entropy": 0.6328987553715706, + "epoch": 0.8645566010835471, + "grad_norm": 0.625, + "learning_rate": 4.982309546533184e-06, + "loss": 0.0222, + "mean_token_accuracy": 0.9922367334365845, + "num_tokens": 45246674.0, + "step": 379 + }, + { + "entropy": 0.6309941336512566, + "epoch": 0.866837753065298, + "grad_norm": 0.6953125, + "learning_rate": 4.982085176777285e-06, + "loss": 0.0311, + "mean_token_accuracy": 0.9908452853560448, + "num_tokens": 45366217.0, + "step": 380 + }, + { + "entropy": 0.6321151852607727, + "epoch": 0.8691189050470488, + "grad_norm": 1.1953125, + "learning_rate": 4.981859398237758e-06, + "loss": 0.0332, + "mean_token_accuracy": 0.9927432015538216, + "num_tokens": 45485636.0, + "step": 381 + }, + { + "entropy": 0.6335924714803696, + "epoch": 0.8714000570287995, + "grad_norm": 0.69921875, + "learning_rate": 4.9816322110427505e-06, + "loss": 0.03, + "mean_token_accuracy": 0.9922157377004623, + "num_tokens": 45605376.0, + "step": 382 + }, + { + "entropy": 0.6401765421032906, + "epoch": 0.8736812090105504, + "grad_norm": 0.78125, + "learning_rate": 4.98140361532121e-06, + "loss": 0.0393, + "mean_token_accuracy": 0.9891323000192642, + "num_tokens": 45724395.0, + "step": 383 + }, + { + "entropy": 0.6335688680410385, + "epoch": 0.8759623609923011, + "grad_norm": 0.60546875, + "learning_rate": 4.981173611202883e-06, + "loss": 0.0291, + "mean_token_accuracy": 0.9920138269662857, + "num_tokens": 45843526.0, + "step": 384 + }, + { + "entropy": 0.6334913000464439, + "epoch": 0.8782435129740519, + "grad_norm": 0.6484375, + "learning_rate": 4.980942198818315e-06, + "loss": 0.0276, + "mean_token_accuracy": 0.9917786866426468, + "num_tokens": 45963192.0, + "step": 385 + }, + { + "entropy": 0.6313852146267891, + "epoch": 0.8805246649558027, + "grad_norm": 0.55078125, + "learning_rate": 4.980709378298851e-06, + "loss": 0.0207, + "mean_token_accuracy": 0.9934649914503098, + "num_tokens": 46082491.0, + "step": 386 + }, + { + "entropy": 0.6299383863806725, + "epoch": 0.8828058169375534, + "grad_norm": 0.71875, + "learning_rate": 4.980475149776636e-06, + "loss": 0.0391, + "mean_token_accuracy": 0.9879358112812042, + "num_tokens": 46201619.0, + "step": 387 + }, + { + "entropy": 0.6360099762678146, + "epoch": 0.8850869689193043, + "grad_norm": 0.58203125, + "learning_rate": 4.980239513384614e-06, + "loss": 0.0245, + "mean_token_accuracy": 0.99192313849926, + "num_tokens": 46321076.0, + "step": 388 + }, + { + "entropy": 0.626785583794117, + "epoch": 0.887368120901055, + "grad_norm": 0.6171875, + "learning_rate": 4.980002469256527e-06, + "loss": 0.0216, + "mean_token_accuracy": 0.99393280595541, + "num_tokens": 46440417.0, + "step": 389 + }, + { + "entropy": 0.6304146945476532, + "epoch": 0.8896492728828058, + "grad_norm": 0.6953125, + "learning_rate": 4.979764017526916e-06, + "loss": 0.0296, + "mean_token_accuracy": 0.9919715449213982, + "num_tokens": 46559820.0, + "step": 390 + }, + { + "entropy": 0.6335912868380547, + "epoch": 0.8919304248645566, + "grad_norm": 0.74609375, + "learning_rate": 4.979524158331123e-06, + "loss": 0.0221, + "mean_token_accuracy": 0.9924125820398331, + "num_tokens": 46678713.0, + "step": 391 + }, + { + "entropy": 0.6295593604445457, + "epoch": 0.8942115768463074, + "grad_norm": 0.6640625, + "learning_rate": 4.979282891805287e-06, + "loss": 0.0342, + "mean_token_accuracy": 0.9894499778747559, + "num_tokens": 46798423.0, + "step": 392 + }, + { + "entropy": 0.6302126199007034, + "epoch": 0.8964927288280582, + "grad_norm": 1.0234375, + "learning_rate": 4.979040218086345e-06, + "loss": 0.0327, + "mean_token_accuracy": 0.9911797866225243, + "num_tokens": 46917456.0, + "step": 393 + }, + { + "entropy": 0.6292528361082077, + "epoch": 0.8987738808098089, + "grad_norm": 0.53515625, + "learning_rate": 4.978796137312036e-06, + "loss": 0.0177, + "mean_token_accuracy": 0.9950311183929443, + "num_tokens": 47036632.0, + "step": 394 + }, + { + "entropy": 0.6327809244394302, + "epoch": 0.9010550327915597, + "grad_norm": 0.69140625, + "learning_rate": 4.978550649620894e-06, + "loss": 0.0321, + "mean_token_accuracy": 0.9897983595728874, + "num_tokens": 47156402.0, + "step": 395 + }, + { + "entropy": 0.6352833062410355, + "epoch": 0.9033361847733106, + "grad_norm": 0.5625, + "learning_rate": 4.978303755152254e-06, + "loss": 0.0264, + "mean_token_accuracy": 0.9929775670170784, + "num_tokens": 47276296.0, + "step": 396 + }, + { + "entropy": 0.6325918436050415, + "epoch": 0.9056173367550613, + "grad_norm": 0.5078125, + "learning_rate": 4.978055454046247e-06, + "loss": 0.0231, + "mean_token_accuracy": 0.9928143471479416, + "num_tokens": 47395438.0, + "step": 397 + }, + { + "entropy": 0.6268999874591827, + "epoch": 0.9078984887368121, + "grad_norm": 0.60546875, + "learning_rate": 4.977805746443807e-06, + "loss": 0.036, + "mean_token_accuracy": 0.9914239719510078, + "num_tokens": 47514645.0, + "step": 398 + }, + { + "entropy": 0.633933886885643, + "epoch": 0.9101796407185628, + "grad_norm": 0.46875, + "learning_rate": 4.9775546324866596e-06, + "loss": 0.0181, + "mean_token_accuracy": 0.9941469132900238, + "num_tokens": 47633862.0, + "step": 399 + }, + { + "entropy": 0.6324601396918297, + "epoch": 0.9124607927003137, + "grad_norm": 0.81640625, + "learning_rate": 4.977302112317334e-06, + "loss": 0.034, + "mean_token_accuracy": 0.9899535179138184, + "num_tokens": 47753099.0, + "step": 400 + }, + { + "entropy": 0.6379967853426933, + "epoch": 0.9147419446820645, + "grad_norm": 0.640625, + "learning_rate": 4.977048186079155e-06, + "loss": 0.028, + "mean_token_accuracy": 0.9918258935213089, + "num_tokens": 47872312.0, + "step": 401 + }, + { + "entropy": 0.6345786452293396, + "epoch": 0.9170230966638152, + "grad_norm": 0.734375, + "learning_rate": 4.976792853916248e-06, + "loss": 0.033, + "mean_token_accuracy": 0.9886585846543312, + "num_tokens": 47991181.0, + "step": 402 + }, + { + "entropy": 0.6402681022882462, + "epoch": 0.919304248645566, + "grad_norm": 0.72265625, + "learning_rate": 4.9765361159735335e-06, + "loss": 0.03, + "mean_token_accuracy": 0.9904176071286201, + "num_tokens": 48110903.0, + "step": 403 + }, + { + "entropy": 0.6352676749229431, + "epoch": 0.9215854006273168, + "grad_norm": 0.6796875, + "learning_rate": 4.97627797239673e-06, + "loss": 0.0293, + "mean_token_accuracy": 0.9897886216640472, + "num_tokens": 48229897.0, + "step": 404 + }, + { + "entropy": 0.632750004529953, + "epoch": 0.9238665526090676, + "grad_norm": 0.70703125, + "learning_rate": 4.976018423332357e-06, + "loss": 0.0321, + "mean_token_accuracy": 0.9912810623645782, + "num_tokens": 48348910.0, + "step": 405 + }, + { + "entropy": 0.6377396062016487, + "epoch": 0.9261477045908184, + "grad_norm": 0.67578125, + "learning_rate": 4.975757468927727e-06, + "loss": 0.0235, + "mean_token_accuracy": 0.9939157515764236, + "num_tokens": 48468533.0, + "step": 406 + }, + { + "entropy": 0.6392417252063751, + "epoch": 0.9284288565725691, + "grad_norm": 0.53125, + "learning_rate": 4.975495109330954e-06, + "loss": 0.0282, + "mean_token_accuracy": 0.9914219453930855, + "num_tokens": 48587491.0, + "step": 407 + }, + { + "entropy": 0.6372505724430084, + "epoch": 0.93071000855432, + "grad_norm": 0.6171875, + "learning_rate": 4.97523134469095e-06, + "loss": 0.026, + "mean_token_accuracy": 0.991726890206337, + "num_tokens": 48706774.0, + "step": 408 + }, + { + "entropy": 0.6389384269714355, + "epoch": 0.9329911605360707, + "grad_norm": 0.60546875, + "learning_rate": 4.97496617515742e-06, + "loss": 0.0227, + "mean_token_accuracy": 0.9927498921751976, + "num_tokens": 48826457.0, + "step": 409 + }, + { + "entropy": 0.6363738626241684, + "epoch": 0.9352723125178215, + "grad_norm": 0.5390625, + "learning_rate": 4.974699600880869e-06, + "loss": 0.0294, + "mean_token_accuracy": 0.9931856095790863, + "num_tokens": 48946801.0, + "step": 410 + }, + { + "entropy": 0.6360675618052483, + "epoch": 0.9375534644995723, + "grad_norm": 0.75, + "learning_rate": 4.974431622012601e-06, + "loss": 0.0423, + "mean_token_accuracy": 0.9894614368677139, + "num_tokens": 49065763.0, + "step": 411 + }, + { + "entropy": 0.636858381330967, + "epoch": 0.939834616481323, + "grad_norm": 0.625, + "learning_rate": 4.974162238704716e-06, + "loss": 0.025, + "mean_token_accuracy": 0.9912799373269081, + "num_tokens": 49185141.0, + "step": 412 + }, + { + "entropy": 0.6399965584278107, + "epoch": 0.9421157684630739, + "grad_norm": 0.6171875, + "learning_rate": 4.973891451110109e-06, + "loss": 0.0286, + "mean_token_accuracy": 0.990425206720829, + "num_tokens": 49304648.0, + "step": 413 + }, + { + "entropy": 0.6403591856360435, + "epoch": 0.9443969204448246, + "grad_norm": 0.65234375, + "learning_rate": 4.973619259382475e-06, + "loss": 0.0267, + "mean_token_accuracy": 0.9916224926710129, + "num_tokens": 49424419.0, + "step": 414 + }, + { + "entropy": 0.636610209941864, + "epoch": 0.9466780724265754, + "grad_norm": 0.64453125, + "learning_rate": 4.973345663676305e-06, + "loss": 0.0205, + "mean_token_accuracy": 0.9927762299776077, + "num_tokens": 49543273.0, + "step": 415 + }, + { + "entropy": 0.6427663639187813, + "epoch": 0.9489592244083263, + "grad_norm": 0.5703125, + "learning_rate": 4.973070664146885e-06, + "loss": 0.0302, + "mean_token_accuracy": 0.990881122648716, + "num_tokens": 49662667.0, + "step": 416 + }, + { + "entropy": 0.6406388878822327, + "epoch": 0.951240376390077, + "grad_norm": 0.58203125, + "learning_rate": 4.972794260950301e-06, + "loss": 0.0235, + "mean_token_accuracy": 0.9926784634590149, + "num_tokens": 49781955.0, + "step": 417 + }, + { + "entropy": 0.638671875, + "epoch": 0.9535215283718278, + "grad_norm": 0.7265625, + "learning_rate": 4.972516454243433e-06, + "loss": 0.0275, + "mean_token_accuracy": 0.991005577147007, + "num_tokens": 49902139.0, + "step": 418 + }, + { + "entropy": 0.6376098021864891, + "epoch": 0.9558026803535785, + "grad_norm": 0.56640625, + "learning_rate": 4.972237244183961e-06, + "loss": 0.0172, + "mean_token_accuracy": 0.9940855801105499, + "num_tokens": 50021861.0, + "step": 419 + }, + { + "entropy": 0.6337656602263451, + "epoch": 0.9580838323353293, + "grad_norm": 0.48828125, + "learning_rate": 4.971956630930356e-06, + "loss": 0.0253, + "mean_token_accuracy": 0.9910890683531761, + "num_tokens": 50141649.0, + "step": 420 + }, + { + "entropy": 0.6375783607363701, + "epoch": 0.9603649843170802, + "grad_norm": 0.5234375, + "learning_rate": 4.971674614641891e-06, + "loss": 0.022, + "mean_token_accuracy": 0.994614265859127, + "num_tokens": 50261272.0, + "step": 421 + }, + { + "entropy": 0.6356885358691216, + "epoch": 0.9626461362988309, + "grad_norm": 0.671875, + "learning_rate": 4.971391195478632e-06, + "loss": 0.0317, + "mean_token_accuracy": 0.9922126904129982, + "num_tokens": 50380394.0, + "step": 422 + }, + { + "entropy": 0.6366880387067795, + "epoch": 0.9649272882805817, + "grad_norm": 0.58203125, + "learning_rate": 4.971106373601443e-06, + "loss": 0.028, + "mean_token_accuracy": 0.9909306168556213, + "num_tokens": 50499095.0, + "step": 423 + }, + { + "entropy": 0.6338107138872147, + "epoch": 0.9672084402623324, + "grad_norm": 0.73828125, + "learning_rate": 4.9708201491719825e-06, + "loss": 0.0307, + "mean_token_accuracy": 0.9910071790218353, + "num_tokens": 50618195.0, + "step": 424 + }, + { + "entropy": 0.6367650181055069, + "epoch": 0.9694895922440833, + "grad_norm": 0.56640625, + "learning_rate": 4.9705325223527055e-06, + "loss": 0.0236, + "mean_token_accuracy": 0.9934993907809258, + "num_tokens": 50737571.0, + "step": 425 + }, + { + "entropy": 0.6415835171937943, + "epoch": 0.9717707442258341, + "grad_norm": 0.671875, + "learning_rate": 4.970243493306865e-06, + "loss": 0.0287, + "mean_token_accuracy": 0.9924728497862816, + "num_tokens": 50856995.0, + "step": 426 + }, + { + "entropy": 0.634221576154232, + "epoch": 0.9740518962075848, + "grad_norm": 0.60546875, + "learning_rate": 4.969953062198508e-06, + "loss": 0.0299, + "mean_token_accuracy": 0.9917934089899063, + "num_tokens": 50976471.0, + "step": 427 + }, + { + "entropy": 0.6341130882501602, + "epoch": 0.9763330481893356, + "grad_norm": 0.6015625, + "learning_rate": 4.969661229192477e-06, + "loss": 0.0279, + "mean_token_accuracy": 0.9893144592642784, + "num_tokens": 51095865.0, + "step": 428 + }, + { + "entropy": 0.6377880498766899, + "epoch": 0.9786142001710864, + "grad_norm": 0.75, + "learning_rate": 4.969367994454412e-06, + "loss": 0.0325, + "mean_token_accuracy": 0.9905389100313187, + "num_tokens": 51215906.0, + "step": 429 + }, + { + "entropy": 0.6376537978649139, + "epoch": 0.9808953521528372, + "grad_norm": 0.64453125, + "learning_rate": 4.9690733581507445e-06, + "loss": 0.0296, + "mean_token_accuracy": 0.9894515573978424, + "num_tokens": 51335060.0, + "step": 430 + }, + { + "entropy": 0.6390247568488121, + "epoch": 0.983176504134588, + "grad_norm": 0.65234375, + "learning_rate": 4.968777320448707e-06, + "loss": 0.0306, + "mean_token_accuracy": 0.9900486841797829, + "num_tokens": 51454714.0, + "step": 431 + }, + { + "entropy": 0.6362894997000694, + "epoch": 0.9854576561163387, + "grad_norm": 0.51171875, + "learning_rate": 4.9684798815163235e-06, + "loss": 0.0269, + "mean_token_accuracy": 0.991004191339016, + "num_tokens": 51573836.0, + "step": 432 + }, + { + "entropy": 0.6357378140091896, + "epoch": 0.9877388080980896, + "grad_norm": 1.1171875, + "learning_rate": 4.968181041522416e-06, + "loss": 0.0322, + "mean_token_accuracy": 0.9914013743400574, + "num_tokens": 51693392.0, + "step": 433 + }, + { + "entropy": 0.6380757689476013, + "epoch": 0.9900199600798403, + "grad_norm": 0.65625, + "learning_rate": 4.967880800636599e-06, + "loss": 0.0246, + "mean_token_accuracy": 0.9931723475456238, + "num_tokens": 51812738.0, + "step": 434 + }, + { + "entropy": 0.6363464668393135, + "epoch": 0.9923011120615911, + "grad_norm": 0.5859375, + "learning_rate": 4.967579159029284e-06, + "loss": 0.0313, + "mean_token_accuracy": 0.9909289851784706, + "num_tokens": 51931831.0, + "step": 435 + }, + { + "entropy": 0.6351185590028763, + "epoch": 0.9945822640433419, + "grad_norm": 0.671875, + "learning_rate": 4.9672761168716766e-06, + "loss": 0.0292, + "mean_token_accuracy": 0.9910148307681084, + "num_tokens": 52051128.0, + "step": 436 + }, + { + "entropy": 0.6391474083065987, + "epoch": 0.9968634160250927, + "grad_norm": 0.66796875, + "learning_rate": 4.966971674335778e-06, + "loss": 0.0242, + "mean_token_accuracy": 0.9928521141409874, + "num_tokens": 52170136.0, + "step": 437 + }, + { + "entropy": 0.6394452452659607, + "epoch": 0.9991445680068435, + "grad_norm": 0.52734375, + "learning_rate": 4.966665831594383e-06, + "loss": 0.0248, + "mean_token_accuracy": 0.9921982362866402, + "num_tokens": 52289725.0, + "step": 438 + }, + { + "entropy": 0.6332884828249613, + "epoch": 1.0, + "grad_norm": 1.0, + "learning_rate": 4.966358588821084e-06, + "loss": 0.0231, + "mean_token_accuracy": 0.9954027334849039, + "num_tokens": 52333478.0, + "step": 439 + }, + { + "entropy": 0.6371900886297226, + "epoch": 1.0022811519817507, + "grad_norm": 0.427734375, + "learning_rate": 4.966049946190265e-06, + "loss": 0.0163, + "mean_token_accuracy": 0.995305560529232, + "num_tokens": 52452831.0, + "step": 440 + }, + { + "epoch": 1.0022811519817507, + "eval_entropy": 0.6384326545911143, + "eval_loss": 0.02715149149298668, + "eval_mean_token_accuracy": 0.9917137085711548, + "eval_num_tokens": 52452831.0, + "eval_runtime": 177.4898, + "eval_samples_per_second": 47.242, + "eval_steps_per_second": 1.482, + "step": 440 + }, + { + "entropy": 0.637593612074852, + "epoch": 1.0045623039635017, + "grad_norm": 0.60546875, + "learning_rate": 4.9657399038771045e-06, + "loss": 0.0184, + "mean_token_accuracy": 0.9933919906616211, + "num_tokens": 52571914.0, + "step": 441 + }, + { + "entropy": 0.640336737036705, + "epoch": 1.0068434559452524, + "grad_norm": 0.73046875, + "learning_rate": 4.965428462057578e-06, + "loss": 0.0313, + "mean_token_accuracy": 0.9905356541275978, + "num_tokens": 52691359.0, + "step": 442 + }, + { + "entropy": 0.6336232572793961, + "epoch": 1.009124607927003, + "grad_norm": 0.498046875, + "learning_rate": 4.965115620908453e-06, + "loss": 0.0242, + "mean_token_accuracy": 0.9934027120471001, + "num_tokens": 52810821.0, + "step": 443 + }, + { + "entropy": 0.6394954770803452, + "epoch": 1.011405759908754, + "grad_norm": 0.5234375, + "learning_rate": 4.964801380607293e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.9939763993024826, + "num_tokens": 52930206.0, + "step": 444 + }, + { + "entropy": 0.6353138089179993, + "epoch": 1.0136869118905047, + "grad_norm": 0.64453125, + "learning_rate": 4.964485741332453e-06, + "loss": 0.0309, + "mean_token_accuracy": 0.9919798970222473, + "num_tokens": 53050730.0, + "step": 445 + }, + { + "entropy": 0.6386236399412155, + "epoch": 1.0159680638722555, + "grad_norm": 0.66015625, + "learning_rate": 4.964168703263086e-06, + "loss": 0.0316, + "mean_token_accuracy": 0.9912258833646774, + "num_tokens": 53170642.0, + "step": 446 + }, + { + "entropy": 0.6409043520689011, + "epoch": 1.0182492158540062, + "grad_norm": 0.640625, + "learning_rate": 4.963850266579136e-06, + "loss": 0.032, + "mean_token_accuracy": 0.9897626861929893, + "num_tokens": 53290576.0, + "step": 447 + }, + { + "entropy": 0.6358297541737556, + "epoch": 1.0205303678357571, + "grad_norm": 0.52734375, + "learning_rate": 4.963530431461341e-06, + "loss": 0.0271, + "mean_token_accuracy": 0.9908745512366295, + "num_tokens": 53409662.0, + "step": 448 + }, + { + "entropy": 0.6352142915129662, + "epoch": 1.0228115198175078, + "grad_norm": 0.51953125, + "learning_rate": 4.963209198091232e-06, + "loss": 0.0269, + "mean_token_accuracy": 0.9935048967599869, + "num_tokens": 53529748.0, + "step": 449 + }, + { + "entropy": 0.6408175528049469, + "epoch": 1.0250926717992586, + "grad_norm": 0.423828125, + "learning_rate": 4.962886566651138e-06, + "loss": 0.016, + "mean_token_accuracy": 0.9952007383108139, + "num_tokens": 53649828.0, + "step": 450 + }, + { + "entropy": 0.6325621977448463, + "epoch": 1.0273738237810095, + "grad_norm": 0.6796875, + "learning_rate": 4.962562537324176e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.9922916814684868, + "num_tokens": 53769888.0, + "step": 451 + }, + { + "entropy": 0.6397062987089157, + "epoch": 1.0296549757627602, + "grad_norm": 0.65625, + "learning_rate": 4.96223711029426e-06, + "loss": 0.0289, + "mean_token_accuracy": 0.9925474300980568, + "num_tokens": 53888803.0, + "step": 452 + }, + { + "entropy": 0.6359882652759552, + "epoch": 1.031936127744511, + "grad_norm": 0.80859375, + "learning_rate": 4.961910285746094e-06, + "loss": 0.0209, + "mean_token_accuracy": 0.9939285293221474, + "num_tokens": 54007986.0, + "step": 453 + }, + { + "entropy": 0.6368442699313164, + "epoch": 1.0342172797262617, + "grad_norm": 0.52734375, + "learning_rate": 4.9615820638651805e-06, + "loss": 0.0271, + "mean_token_accuracy": 0.9924737140536308, + "num_tokens": 54126643.0, + "step": 454 + }, + { + "entropy": 0.639216922223568, + "epoch": 1.0364984317080126, + "grad_norm": 0.67578125, + "learning_rate": 4.961252444837809e-06, + "loss": 0.0231, + "mean_token_accuracy": 0.9934102594852448, + "num_tokens": 54245561.0, + "step": 455 + }, + { + "entropy": 0.6420523822307587, + "epoch": 1.0387795836897633, + "grad_norm": 0.62109375, + "learning_rate": 4.960921428851066e-06, + "loss": 0.023, + "mean_token_accuracy": 0.991694875061512, + "num_tokens": 54364383.0, + "step": 456 + }, + { + "entropy": 0.6417715027928352, + "epoch": 1.041060735671514, + "grad_norm": 0.890625, + "learning_rate": 4.960589016092832e-06, + "loss": 0.0343, + "mean_token_accuracy": 0.9897043704986572, + "num_tokens": 54484602.0, + "step": 457 + }, + { + "entropy": 0.6355913281440735, + "epoch": 1.043341887653265, + "grad_norm": 0.796875, + "learning_rate": 4.960255206751774e-06, + "loss": 0.0402, + "mean_token_accuracy": 0.9892151057720184, + "num_tokens": 54603861.0, + "step": 458 + }, + { + "entropy": 0.6418633982539177, + "epoch": 1.0456230396350157, + "grad_norm": 0.6171875, + "learning_rate": 4.959920001017358e-06, + "loss": 0.0286, + "mean_token_accuracy": 0.9913401082158089, + "num_tokens": 54723416.0, + "step": 459 + }, + { + "entropy": 0.6363398507237434, + "epoch": 1.0479041916167664, + "grad_norm": 0.60546875, + "learning_rate": 4.95958339907984e-06, + "loss": 0.0274, + "mean_token_accuracy": 0.9928280264139175, + "num_tokens": 54843030.0, + "step": 460 + }, + { + "entropy": 0.6355079263448715, + "epoch": 1.0501853435985173, + "grad_norm": 0.60546875, + "learning_rate": 4.959245401130269e-06, + "loss": 0.0313, + "mean_token_accuracy": 0.9894143790006638, + "num_tokens": 54961773.0, + "step": 461 + }, + { + "entropy": 0.6369510889053345, + "epoch": 1.052466495580268, + "grad_norm": 0.49609375, + "learning_rate": 4.958906007360487e-06, + "loss": 0.021, + "mean_token_accuracy": 0.9937010854482651, + "num_tokens": 55081037.0, + "step": 462 + }, + { + "entropy": 0.6401446759700775, + "epoch": 1.0547476475620188, + "grad_norm": 0.6171875, + "learning_rate": 4.958565217963125e-06, + "loss": 0.0267, + "mean_token_accuracy": 0.9913331121206284, + "num_tokens": 55200622.0, + "step": 463 + }, + { + "entropy": 0.6404896229505539, + "epoch": 1.0570287995437697, + "grad_norm": 0.6015625, + "learning_rate": 4.95822303313161e-06, + "loss": 0.027, + "mean_token_accuracy": 0.9930299893021584, + "num_tokens": 55319906.0, + "step": 464 + }, + { + "entropy": 0.6390992403030396, + "epoch": 1.0593099515255204, + "grad_norm": 0.64453125, + "learning_rate": 4.957879453060159e-06, + "loss": 0.026, + "mean_token_accuracy": 0.9934355244040489, + "num_tokens": 55439997.0, + "step": 465 + }, + { + "entropy": 0.6406941264867783, + "epoch": 1.0615911035072711, + "grad_norm": 0.6640625, + "learning_rate": 4.957534477943782e-06, + "loss": 0.0266, + "mean_token_accuracy": 0.9907289743423462, + "num_tokens": 55559864.0, + "step": 466 + }, + { + "entropy": 0.6463066563010216, + "epoch": 1.0638722554890219, + "grad_norm": 0.5234375, + "learning_rate": 4.957188107978279e-06, + "loss": 0.0201, + "mean_token_accuracy": 0.9945512562990189, + "num_tokens": 55679589.0, + "step": 467 + }, + { + "entropy": 0.6434222012758255, + "epoch": 1.0661534074707728, + "grad_norm": 0.72265625, + "learning_rate": 4.956840343360245e-06, + "loss": 0.0377, + "mean_token_accuracy": 0.987898476421833, + "num_tokens": 55799464.0, + "step": 468 + }, + { + "entropy": 0.6344132125377655, + "epoch": 1.0684345594525235, + "grad_norm": 0.7265625, + "learning_rate": 4.956491184287062e-06, + "loss": 0.0238, + "mean_token_accuracy": 0.9923848509788513, + "num_tokens": 55918989.0, + "step": 469 + }, + { + "entropy": 0.6383634433150291, + "epoch": 1.0707157114342742, + "grad_norm": 0.578125, + "learning_rate": 4.9561406309569084e-06, + "loss": 0.0236, + "mean_token_accuracy": 0.9922046512365341, + "num_tokens": 56038600.0, + "step": 470 + }, + { + "entropy": 0.638044461607933, + "epoch": 1.0729968634160252, + "grad_norm": 0.578125, + "learning_rate": 4.955788683568749e-06, + "loss": 0.025, + "mean_token_accuracy": 0.9924884513020515, + "num_tokens": 56157722.0, + "step": 471 + }, + { + "entropy": 0.641399659216404, + "epoch": 1.075278015397776, + "grad_norm": 0.5390625, + "learning_rate": 4.955435342322345e-06, + "loss": 0.019, + "mean_token_accuracy": 0.9941096231341362, + "num_tokens": 56276788.0, + "step": 472 + }, + { + "entropy": 0.6376670971512794, + "epoch": 1.0775591673795266, + "grad_norm": 0.59375, + "learning_rate": 4.955080607418244e-06, + "loss": 0.0266, + "mean_token_accuracy": 0.9924849793314934, + "num_tokens": 56396375.0, + "step": 473 + }, + { + "entropy": 0.6377646699547768, + "epoch": 1.0798403193612773, + "grad_norm": 0.59765625, + "learning_rate": 4.954724479057788e-06, + "loss": 0.0262, + "mean_token_accuracy": 0.991156093776226, + "num_tokens": 56514859.0, + "step": 474 + }, + { + "entropy": 0.6386540904641151, + "epoch": 1.0821214713430283, + "grad_norm": 0.5078125, + "learning_rate": 4.954366957443107e-06, + "loss": 0.0247, + "mean_token_accuracy": 0.9908105358481407, + "num_tokens": 56633577.0, + "step": 475 + }, + { + "entropy": 0.6367008835077286, + "epoch": 1.084402623324779, + "grad_norm": 0.51171875, + "learning_rate": 4.954008042777125e-06, + "loss": 0.0226, + "mean_token_accuracy": 0.9936517551541328, + "num_tokens": 56752983.0, + "step": 476 + }, + { + "entropy": 0.643406517803669, + "epoch": 1.0866837753065297, + "grad_norm": 0.49609375, + "learning_rate": 4.953647735263555e-06, + "loss": 0.0236, + "mean_token_accuracy": 0.9914189800620079, + "num_tokens": 56872267.0, + "step": 477 + }, + { + "entropy": 0.6361510753631592, + "epoch": 1.0889649272882806, + "grad_norm": 0.5, + "learning_rate": 4.953286035106898e-06, + "loss": 0.0198, + "mean_token_accuracy": 0.9946249052882195, + "num_tokens": 56991912.0, + "step": 478 + }, + { + "entropy": 0.6364426389336586, + "epoch": 1.0912460792700314, + "grad_norm": 0.6328125, + "learning_rate": 4.952922942512452e-06, + "loss": 0.0249, + "mean_token_accuracy": 0.992235966026783, + "num_tokens": 57111089.0, + "step": 479 + }, + { + "entropy": 0.6413366049528122, + "epoch": 1.093527231251782, + "grad_norm": 0.51171875, + "learning_rate": 4.9525584576862985e-06, + "loss": 0.0198, + "mean_token_accuracy": 0.9947041943669319, + "num_tokens": 57230150.0, + "step": 480 + }, + { + "entropy": 0.6437407806515694, + "epoch": 1.095808383233533, + "grad_norm": 0.62109375, + "learning_rate": 4.952192580835313e-06, + "loss": 0.0221, + "mean_token_accuracy": 0.9935091510415077, + "num_tokens": 57349725.0, + "step": 481 + }, + { + "entropy": 0.6372997835278511, + "epoch": 1.0980895352152837, + "grad_norm": 0.68359375, + "learning_rate": 4.9518253121671595e-06, + "loss": 0.0299, + "mean_token_accuracy": 0.9926347956061363, + "num_tokens": 57469610.0, + "step": 482 + }, + { + "entropy": 0.6435587182641029, + "epoch": 1.1003706871970345, + "grad_norm": 0.416015625, + "learning_rate": 4.951456651890294e-06, + "loss": 0.0197, + "mean_token_accuracy": 0.9947889745235443, + "num_tokens": 57589717.0, + "step": 483 + }, + { + "entropy": 0.6399586200714111, + "epoch": 1.1026518391787854, + "grad_norm": 0.53125, + "learning_rate": 4.951086600213959e-06, + "loss": 0.026, + "mean_token_accuracy": 0.9909392669796944, + "num_tokens": 57708926.0, + "step": 484 + }, + { + "entropy": 0.6329101994633675, + "epoch": 1.1049329911605361, + "grad_norm": 0.5078125, + "learning_rate": 4.950715157348191e-06, + "loss": 0.0204, + "mean_token_accuracy": 0.9937612786889076, + "num_tokens": 57828005.0, + "step": 485 + }, + { + "entropy": 0.633719839155674, + "epoch": 1.1072141431422868, + "grad_norm": 0.546875, + "learning_rate": 4.950342323503812e-06, + "loss": 0.0318, + "mean_token_accuracy": 0.9910379201173782, + "num_tokens": 57947004.0, + "step": 486 + }, + { + "entropy": 0.6388240680098534, + "epoch": 1.1094952951240376, + "grad_norm": 0.71875, + "learning_rate": 4.949968098892436e-06, + "loss": 0.0337, + "mean_token_accuracy": 0.9904163330793381, + "num_tokens": 58065476.0, + "step": 487 + }, + { + "entropy": 0.63455580919981, + "epoch": 1.1117764471057885, + "grad_norm": 0.59765625, + "learning_rate": 4.949592483726465e-06, + "loss": 0.0236, + "mean_token_accuracy": 0.9927629679441452, + "num_tokens": 58185175.0, + "step": 488 + }, + { + "entropy": 0.6306882128119469, + "epoch": 1.1140575990875392, + "grad_norm": 0.63671875, + "learning_rate": 4.949215478219092e-06, + "loss": 0.0323, + "mean_token_accuracy": 0.9907538890838623, + "num_tokens": 58303810.0, + "step": 489 + }, + { + "entropy": 0.6412769258022308, + "epoch": 1.11633875106929, + "grad_norm": 0.6171875, + "learning_rate": 4.948837082584298e-06, + "loss": 0.0334, + "mean_token_accuracy": 0.9919233545660973, + "num_tokens": 58423364.0, + "step": 490 + }, + { + "entropy": 0.6388815566897392, + "epoch": 1.1186199030510409, + "grad_norm": 0.6171875, + "learning_rate": 4.9484572970368516e-06, + "loss": 0.0288, + "mean_token_accuracy": 0.9914741218090057, + "num_tokens": 58542201.0, + "step": 491 + }, + { + "entropy": 0.6325994282960892, + "epoch": 1.1209010550327916, + "grad_norm": 0.63671875, + "learning_rate": 4.948076121792313e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.9939880445599556, + "num_tokens": 58660699.0, + "step": 492 + }, + { + "entropy": 0.639816515147686, + "epoch": 1.1231822070145423, + "grad_norm": 0.5390625, + "learning_rate": 4.9476935570670294e-06, + "loss": 0.0214, + "mean_token_accuracy": 0.9924480319023132, + "num_tokens": 58780440.0, + "step": 493 + }, + { + "entropy": 0.637451633810997, + "epoch": 1.125463358996293, + "grad_norm": 0.65625, + "learning_rate": 4.947309603078138e-06, + "loss": 0.0349, + "mean_token_accuracy": 0.9910256564617157, + "num_tokens": 58900059.0, + "step": 494 + }, + { + "entropy": 0.6351741328835487, + "epoch": 1.127744510978044, + "grad_norm": 0.64453125, + "learning_rate": 4.946924260043563e-06, + "loss": 0.0221, + "mean_token_accuracy": 0.9935736134648323, + "num_tokens": 59019187.0, + "step": 495 + }, + { + "entropy": 0.6440933719277382, + "epoch": 1.1300256629597947, + "grad_norm": 0.51171875, + "learning_rate": 4.946537528182017e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9944522529840469, + "num_tokens": 59138139.0, + "step": 496 + }, + { + "entropy": 0.6404507979750633, + "epoch": 1.1323068149415454, + "grad_norm": 0.75390625, + "learning_rate": 4.946149407713002e-06, + "loss": 0.0315, + "mean_token_accuracy": 0.991683341562748, + "num_tokens": 59257625.0, + "step": 497 + }, + { + "entropy": 0.6372439339756966, + "epoch": 1.1345879669232963, + "grad_norm": 0.609375, + "learning_rate": 4.945759898856809e-06, + "loss": 0.0273, + "mean_token_accuracy": 0.9924319311976433, + "num_tokens": 59376941.0, + "step": 498 + }, + { + "entropy": 0.6377488449215889, + "epoch": 1.136869118905047, + "grad_norm": 0.68359375, + "learning_rate": 4.9453690018345144e-06, + "loss": 0.0304, + "mean_token_accuracy": 0.9926893040537834, + "num_tokens": 59495733.0, + "step": 499 + }, + { + "entropy": 0.6397875472903252, + "epoch": 1.1391502708867978, + "grad_norm": 0.625, + "learning_rate": 4.944976716867984e-06, + "loss": 0.0298, + "mean_token_accuracy": 0.9928758442401886, + "num_tokens": 59614379.0, + "step": 500 + }, + { + "entropy": 0.6411793157458305, + "epoch": 1.1414314228685487, + "grad_norm": 0.59375, + "learning_rate": 4.944583044179871e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.993398629128933, + "num_tokens": 59733923.0, + "step": 501 + }, + { + "entropy": 0.6437911912798882, + "epoch": 1.1437125748502994, + "grad_norm": 0.50390625, + "learning_rate": 4.944187983993617e-06, + "loss": 0.0233, + "mean_token_accuracy": 0.994205430150032, + "num_tokens": 59853786.0, + "step": 502 + }, + { + "entropy": 0.6426619589328766, + "epoch": 1.1459937268320501, + "grad_norm": 0.703125, + "learning_rate": 4.94379153653345e-06, + "loss": 0.027, + "mean_token_accuracy": 0.9903893172740936, + "num_tokens": 59972375.0, + "step": 503 + }, + { + "entropy": 0.6379608660936356, + "epoch": 1.148274878813801, + "grad_norm": 0.81640625, + "learning_rate": 4.9433937020243854e-06, + "loss": 0.0313, + "mean_token_accuracy": 0.989438571035862, + "num_tokens": 60091599.0, + "step": 504 + }, + { + "entropy": 0.6394264996051788, + "epoch": 1.1505560307955518, + "grad_norm": 0.62109375, + "learning_rate": 4.942994480692228e-06, + "loss": 0.0256, + "mean_token_accuracy": 0.9913732185959816, + "num_tokens": 60210578.0, + "step": 505 + }, + { + "entropy": 0.634573221206665, + "epoch": 1.1528371827773025, + "grad_norm": 0.419921875, + "learning_rate": 4.942593872763566e-06, + "loss": 0.021, + "mean_token_accuracy": 0.9939863383769989, + "num_tokens": 60329564.0, + "step": 506 + }, + { + "entropy": 0.6426278650760651, + "epoch": 1.1551183347590532, + "grad_norm": 0.66796875, + "learning_rate": 4.9421918784657795e-06, + "loss": 0.0263, + "mean_token_accuracy": 0.993289977312088, + "num_tokens": 60448999.0, + "step": 507 + }, + { + "entropy": 0.6422138884663582, + "epoch": 1.1573994867408042, + "grad_norm": 0.55859375, + "learning_rate": 4.94178849802703e-06, + "loss": 0.0237, + "mean_token_accuracy": 0.9925984516739845, + "num_tokens": 60568063.0, + "step": 508 + }, + { + "entropy": 0.636064924299717, + "epoch": 1.159680638722555, + "grad_norm": 0.66796875, + "learning_rate": 4.9413837316762705e-06, + "loss": 0.0346, + "mean_token_accuracy": 0.9881532117724419, + "num_tokens": 60686887.0, + "step": 509 + }, + { + "entropy": 0.6412410289049149, + "epoch": 1.1619617907043056, + "grad_norm": 0.52734375, + "learning_rate": 4.940977579643237e-06, + "loss": 0.0182, + "mean_token_accuracy": 0.9946831986308098, + "num_tokens": 60806067.0, + "step": 510 + }, + { + "entropy": 0.6364694610238075, + "epoch": 1.1642429426860565, + "grad_norm": 0.5390625, + "learning_rate": 4.940570042158454e-06, + "loss": 0.0243, + "mean_token_accuracy": 0.9925849065184593, + "num_tokens": 60925520.0, + "step": 511 + }, + { + "entropy": 0.6342663615942001, + "epoch": 1.1665240946678073, + "grad_norm": 0.6484375, + "learning_rate": 4.940161119453232e-06, + "loss": 0.0365, + "mean_token_accuracy": 0.9902976974844933, + "num_tokens": 61045135.0, + "step": 512 + }, + { + "entropy": 0.6360703259706497, + "epoch": 1.168805246649558, + "grad_norm": 0.6875, + "learning_rate": 4.939750811759668e-06, + "loss": 0.033, + "mean_token_accuracy": 0.9906287267804146, + "num_tokens": 61164396.0, + "step": 513 + }, + { + "entropy": 0.6385461315512657, + "epoch": 1.1710863986313087, + "grad_norm": 0.734375, + "learning_rate": 4.939339119310645e-06, + "loss": 0.0354, + "mean_token_accuracy": 0.989618718624115, + "num_tokens": 61284041.0, + "step": 514 + }, + { + "entropy": 0.6429248005151749, + "epoch": 1.1733675506130596, + "grad_norm": 0.609375, + "learning_rate": 4.93892604233983e-06, + "loss": 0.0376, + "mean_token_accuracy": 0.9894492328166962, + "num_tokens": 61403299.0, + "step": 515 + }, + { + "entropy": 0.6422001421451569, + "epoch": 1.1756487025948104, + "grad_norm": 0.60546875, + "learning_rate": 4.93851158108168e-06, + "loss": 0.0316, + "mean_token_accuracy": 0.9923984929919243, + "num_tokens": 61523377.0, + "step": 516 + }, + { + "entropy": 0.6378173306584358, + "epoch": 1.177929854576561, + "grad_norm": 0.484375, + "learning_rate": 4.938095735771433e-06, + "loss": 0.0233, + "mean_token_accuracy": 0.9918532744050026, + "num_tokens": 61642772.0, + "step": 517 + }, + { + "entropy": 0.6349778920412064, + "epoch": 1.180211006558312, + "grad_norm": 0.47265625, + "learning_rate": 4.937678506645116e-06, + "loss": 0.0213, + "mean_token_accuracy": 0.9937770813703537, + "num_tokens": 61762393.0, + "step": 518 + }, + { + "entropy": 0.6417742446064949, + "epoch": 1.1824921585400627, + "grad_norm": 0.671875, + "learning_rate": 4.937259893939539e-06, + "loss": 0.0228, + "mean_token_accuracy": 0.9921541288495064, + "num_tokens": 61882229.0, + "step": 519 + }, + { + "entropy": 0.6391768679022789, + "epoch": 1.1847733105218134, + "grad_norm": 0.65625, + "learning_rate": 4.9368398978923e-06, + "loss": 0.0225, + "mean_token_accuracy": 0.992104634642601, + "num_tokens": 62001717.0, + "step": 520 + }, + { + "entropy": 0.6390099748969078, + "epoch": 1.1870544625035644, + "grad_norm": 0.63671875, + "learning_rate": 4.93641851874178e-06, + "loss": 0.0263, + "mean_token_accuracy": 0.9913691431283951, + "num_tokens": 62120862.0, + "step": 521 + }, + { + "entropy": 0.6361878290772438, + "epoch": 1.189335614485315, + "grad_norm": 0.546875, + "learning_rate": 4.935995756727146e-06, + "loss": 0.0251, + "mean_token_accuracy": 0.9929168447852135, + "num_tokens": 62240593.0, + "step": 522 + }, + { + "entropy": 0.6448950469493866, + "epoch": 1.1916167664670658, + "grad_norm": 0.48046875, + "learning_rate": 4.935571612088349e-06, + "loss": 0.0181, + "mean_token_accuracy": 0.994476929306984, + "num_tokens": 62360495.0, + "step": 523 + }, + { + "entropy": 0.6394361332058907, + "epoch": 1.1938979184488168, + "grad_norm": 0.57421875, + "learning_rate": 4.935146085066125e-06, + "loss": 0.0262, + "mean_token_accuracy": 0.9923677518963814, + "num_tokens": 62479502.0, + "step": 524 + }, + { + "entropy": 0.6435981467366219, + "epoch": 1.1961790704305675, + "grad_norm": 0.478515625, + "learning_rate": 4.934719175901996e-06, + "loss": 0.0237, + "mean_token_accuracy": 0.9929273575544357, + "num_tokens": 62598688.0, + "step": 525 + }, + { + "entropy": 0.6455018743872643, + "epoch": 1.1984602224123182, + "grad_norm": 0.51171875, + "learning_rate": 4.934290884838266e-06, + "loss": 0.0216, + "mean_token_accuracy": 0.9942518100142479, + "num_tokens": 62718206.0, + "step": 526 + }, + { + "entropy": 0.6450193151831627, + "epoch": 1.200741374394069, + "grad_norm": 0.57421875, + "learning_rate": 4.933861212118027e-06, + "loss": 0.0182, + "mean_token_accuracy": 0.9948318898677826, + "num_tokens": 62837786.0, + "step": 527 + }, + { + "entropy": 0.6405033245682716, + "epoch": 1.2030225263758199, + "grad_norm": 0.53515625, + "learning_rate": 4.933430157985151e-06, + "loss": 0.0215, + "mean_token_accuracy": 0.9952510446310043, + "num_tokens": 62956819.0, + "step": 528 + }, + { + "entropy": 0.6408663392066956, + "epoch": 1.2053036783575706, + "grad_norm": 0.53125, + "learning_rate": 4.932997722684296e-06, + "loss": 0.018, + "mean_token_accuracy": 0.9936221241950989, + "num_tokens": 63075967.0, + "step": 529 + }, + { + "entropy": 0.63957130163908, + "epoch": 1.2075848303393213, + "grad_norm": 0.494140625, + "learning_rate": 4.932563906460905e-06, + "loss": 0.0232, + "mean_token_accuracy": 0.9928315207362175, + "num_tokens": 63195009.0, + "step": 530 + }, + { + "entropy": 0.6380795687437057, + "epoch": 1.2098659823210722, + "grad_norm": 0.5546875, + "learning_rate": 4.932128709561202e-06, + "loss": 0.0272, + "mean_token_accuracy": 0.9928601831197739, + "num_tokens": 63314990.0, + "step": 531 + }, + { + "entropy": 0.6388819739222527, + "epoch": 1.212147134302823, + "grad_norm": 0.53515625, + "learning_rate": 4.931692132232198e-06, + "loss": 0.0255, + "mean_token_accuracy": 0.9925848692655563, + "num_tokens": 63434781.0, + "step": 532 + }, + { + "entropy": 0.6449277997016907, + "epoch": 1.2144282862845737, + "grad_norm": 0.8046875, + "learning_rate": 4.931254174721687e-06, + "loss": 0.0326, + "mean_token_accuracy": 0.9901185259222984, + "num_tokens": 63553859.0, + "step": 533 + }, + { + "entropy": 0.6424869075417519, + "epoch": 1.2167094382663244, + "grad_norm": 0.64453125, + "learning_rate": 4.930814837278242e-06, + "loss": 0.0301, + "mean_token_accuracy": 0.9905510172247887, + "num_tokens": 63674036.0, + "step": 534 + }, + { + "entropy": 0.6420894414186478, + "epoch": 1.2189905902480753, + "grad_norm": 0.5859375, + "learning_rate": 4.930374120151225e-06, + "loss": 0.0287, + "mean_token_accuracy": 0.9911350831389427, + "num_tokens": 63793457.0, + "step": 535 + }, + { + "entropy": 0.6375390514731407, + "epoch": 1.221271742229826, + "grad_norm": 0.71484375, + "learning_rate": 4.929932023590776e-06, + "loss": 0.0303, + "mean_token_accuracy": 0.9917834922671318, + "num_tokens": 63912034.0, + "step": 536 + }, + { + "entropy": 0.640324629843235, + "epoch": 1.2235528942115768, + "grad_norm": 0.51171875, + "learning_rate": 4.929488547847823e-06, + "loss": 0.0282, + "mean_token_accuracy": 0.9895867556333542, + "num_tokens": 64031019.0, + "step": 537 + }, + { + "entropy": 0.6401369869709015, + "epoch": 1.2258340461933277, + "grad_norm": 0.5390625, + "learning_rate": 4.9290436931740735e-06, + "loss": 0.0193, + "mean_token_accuracy": 0.9938716143369675, + "num_tokens": 64150757.0, + "step": 538 + }, + { + "entropy": 0.6316568776965141, + "epoch": 1.2281151981750784, + "grad_norm": 0.67578125, + "learning_rate": 4.928597459822018e-06, + "loss": 0.0285, + "mean_token_accuracy": 0.9896878302097321, + "num_tokens": 64269970.0, + "step": 539 + }, + { + "entropy": 0.6392024904489517, + "epoch": 1.2303963501568291, + "grad_norm": 0.5703125, + "learning_rate": 4.928149848044931e-06, + "loss": 0.0304, + "mean_token_accuracy": 0.9915527030825615, + "num_tokens": 64389192.0, + "step": 540 + }, + { + "entropy": 0.6411287263035774, + "epoch": 1.23267750213858, + "grad_norm": 0.54296875, + "learning_rate": 4.9277008580968665e-06, + "loss": 0.027, + "mean_token_accuracy": 0.9926761612296104, + "num_tokens": 64508682.0, + "step": 541 + }, + { + "entropy": 0.6376341134309769, + "epoch": 1.2349586541203308, + "grad_norm": 0.46875, + "learning_rate": 4.927250490232664e-06, + "loss": 0.0177, + "mean_token_accuracy": 0.9936070144176483, + "num_tokens": 64627844.0, + "step": 542 + }, + { + "entropy": 0.6399098783731461, + "epoch": 1.2372398061020815, + "grad_norm": 0.578125, + "learning_rate": 4.926798744707943e-06, + "loss": 0.0241, + "mean_token_accuracy": 0.9922181591391563, + "num_tokens": 64747713.0, + "step": 543 + }, + { + "entropy": 0.6397632658481598, + "epoch": 1.2395209580838324, + "grad_norm": 0.5390625, + "learning_rate": 4.926345621779106e-06, + "loss": 0.0225, + "mean_token_accuracy": 0.9946469962596893, + "num_tokens": 64867312.0, + "step": 544 + }, + { + "entropy": 0.6410968378186226, + "epoch": 1.2418021100655832, + "grad_norm": 0.640625, + "learning_rate": 4.9258911217033355e-06, + "loss": 0.0223, + "mean_token_accuracy": 0.9923366233706474, + "num_tokens": 64986361.0, + "step": 545 + }, + { + "entropy": 0.6418982148170471, + "epoch": 1.2440832620473339, + "grad_norm": 0.68359375, + "learning_rate": 4.925435244738599e-06, + "loss": 0.0249, + "mean_token_accuracy": 0.9919616281986237, + "num_tokens": 65105616.0, + "step": 546 + }, + { + "entropy": 0.6374618783593178, + "epoch": 1.2463644140290846, + "grad_norm": 0.4921875, + "learning_rate": 4.924977991143642e-06, + "loss": 0.0195, + "mean_token_accuracy": 0.994633175432682, + "num_tokens": 65226105.0, + "step": 547 + }, + { + "entropy": 0.6415504515171051, + "epoch": 1.2486455660108355, + "grad_norm": 0.466796875, + "learning_rate": 4.924519361177993e-06, + "loss": 0.0206, + "mean_token_accuracy": 0.9917597025632858, + "num_tokens": 65345969.0, + "step": 548 + }, + { + "entropy": 0.6404975727200508, + "epoch": 1.2509267179925863, + "grad_norm": 0.54296875, + "learning_rate": 4.9240593551019625e-06, + "loss": 0.0273, + "mean_token_accuracy": 0.9906909465789795, + "num_tokens": 65466003.0, + "step": 549 + }, + { + "entropy": 0.6436318904161453, + "epoch": 1.253207869974337, + "grad_norm": 0.56640625, + "learning_rate": 4.92359797317664e-06, + "loss": 0.0222, + "mean_token_accuracy": 0.9925400242209435, + "num_tokens": 65585394.0, + "step": 550 + }, + { + "entropy": 0.6309810355305672, + "epoch": 1.255489021956088, + "grad_norm": 0.55859375, + "learning_rate": 4.923135215663897e-06, + "loss": 0.0198, + "mean_token_accuracy": 0.994232639670372, + "num_tokens": 65704795.0, + "step": 551 + }, + { + "entropy": 0.6407207325100899, + "epoch": 1.2577701739378386, + "grad_norm": 0.490234375, + "learning_rate": 4.922671082826386e-06, + "loss": 0.0184, + "mean_token_accuracy": 0.9945311397314072, + "num_tokens": 65823632.0, + "step": 552 + }, + { + "entropy": 0.6333111599087715, + "epoch": 1.2600513259195893, + "grad_norm": 0.65234375, + "learning_rate": 4.92220557492754e-06, + "loss": 0.0253, + "mean_token_accuracy": 0.9924682453274727, + "num_tokens": 65943164.0, + "step": 553 + }, + { + "entropy": 0.6387123242020607, + "epoch": 1.26233247790134, + "grad_norm": 0.62890625, + "learning_rate": 4.921738692231572e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.9928248450160027, + "num_tokens": 66062267.0, + "step": 554 + }, + { + "entropy": 0.6341886296868324, + "epoch": 1.264613629883091, + "grad_norm": 0.5859375, + "learning_rate": 4.9212704350034764e-06, + "loss": 0.0239, + "mean_token_accuracy": 0.9921679720282555, + "num_tokens": 66181525.0, + "step": 555 + }, + { + "entropy": 0.6331890746951103, + "epoch": 1.2668947818648417, + "grad_norm": 0.51953125, + "learning_rate": 4.920800803509026e-06, + "loss": 0.021, + "mean_token_accuracy": 0.9921646639704704, + "num_tokens": 66300500.0, + "step": 556 + }, + { + "entropy": 0.6371569186449051, + "epoch": 1.2691759338465927, + "grad_norm": 0.609375, + "learning_rate": 4.920329798014775e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.9949021339416504, + "num_tokens": 66420160.0, + "step": 557 + }, + { + "entropy": 0.6365426406264305, + "epoch": 1.2714570858283434, + "grad_norm": 0.59375, + "learning_rate": 4.919857418788056e-06, + "loss": 0.026, + "mean_token_accuracy": 0.9898040294647217, + "num_tokens": 66539052.0, + "step": 558 + }, + { + "entropy": 0.6390432715415955, + "epoch": 1.273738237810094, + "grad_norm": 0.578125, + "learning_rate": 4.919383666096985e-06, + "loss": 0.0256, + "mean_token_accuracy": 0.993528351187706, + "num_tokens": 66658172.0, + "step": 559 + }, + { + "entropy": 0.6408077627420425, + "epoch": 1.2760193897918448, + "grad_norm": 0.5859375, + "learning_rate": 4.918908540210452e-06, + "loss": 0.0239, + "mean_token_accuracy": 0.9932637810707092, + "num_tokens": 66777812.0, + "step": 560 + }, + { + "entropy": 0.6319929510354996, + "epoch": 1.2783005417735955, + "grad_norm": 0.5859375, + "learning_rate": 4.91843204139813e-06, + "loss": 0.0226, + "mean_token_accuracy": 0.9931629449129105, + "num_tokens": 66897215.0, + "step": 561 + }, + { + "entropy": 0.6330813020467758, + "epoch": 1.2805816937553465, + "grad_norm": 0.765625, + "learning_rate": 4.917954169930472e-06, + "loss": 0.0262, + "mean_token_accuracy": 0.991722859442234, + "num_tokens": 67016798.0, + "step": 562 + }, + { + "entropy": 0.6401809006929398, + "epoch": 1.2828628457370972, + "grad_norm": 0.84375, + "learning_rate": 4.917474926078707e-06, + "loss": 0.0363, + "mean_token_accuracy": 0.9912207946181297, + "num_tokens": 67137045.0, + "step": 563 + }, + { + "entropy": 0.6339130699634552, + "epoch": 1.2851439977188481, + "grad_norm": 0.466796875, + "learning_rate": 4.916994310114845e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9939794316887856, + "num_tokens": 67257133.0, + "step": 564 + }, + { + "entropy": 0.6368776336312294, + "epoch": 1.2874251497005988, + "grad_norm": 0.58984375, + "learning_rate": 4.916512322311675e-06, + "loss": 0.0269, + "mean_token_accuracy": 0.9937662929296494, + "num_tokens": 67376478.0, + "step": 565 + }, + { + "entropy": 0.6407509595155716, + "epoch": 1.2897063016823496, + "grad_norm": 0.51953125, + "learning_rate": 4.916028962942763e-06, + "loss": 0.0155, + "mean_token_accuracy": 0.9949904605746269, + "num_tokens": 67496136.0, + "step": 566 + }, + { + "entropy": 0.6404800787568092, + "epoch": 1.2919874536641003, + "grad_norm": 0.578125, + "learning_rate": 4.915544232282455e-06, + "loss": 0.0203, + "mean_token_accuracy": 0.9928790852427483, + "num_tokens": 67615791.0, + "step": 567 + }, + { + "entropy": 0.6316845342516899, + "epoch": 1.2942686056458512, + "grad_norm": 0.546875, + "learning_rate": 4.915058130605874e-06, + "loss": 0.0224, + "mean_token_accuracy": 0.9925213232636452, + "num_tokens": 67735035.0, + "step": 568 + }, + { + "entropy": 0.6380633190274239, + "epoch": 1.296549757627602, + "grad_norm": 0.5, + "learning_rate": 4.9145706581889235e-06, + "loss": 0.0215, + "mean_token_accuracy": 0.9939476996660233, + "num_tokens": 67853772.0, + "step": 569 + }, + { + "entropy": 0.6372907161712646, + "epoch": 1.2988309096093527, + "grad_norm": 0.54296875, + "learning_rate": 4.914081815308283e-06, + "loss": 0.0165, + "mean_token_accuracy": 0.9948429092764854, + "num_tokens": 67972897.0, + "step": 570 + }, + { + "entropy": 0.6363494470715523, + "epoch": 1.3011120615911036, + "grad_norm": 0.546875, + "learning_rate": 4.913591602241409e-06, + "loss": 0.026, + "mean_token_accuracy": 0.9922598898410797, + "num_tokens": 68092244.0, + "step": 571 + }, + { + "entropy": 0.6394603103399277, + "epoch": 1.3033932135728543, + "grad_norm": 0.71875, + "learning_rate": 4.9131000192665365e-06, + "loss": 0.032, + "mean_token_accuracy": 0.989642046391964, + "num_tokens": 68211340.0, + "step": 572 + }, + { + "entropy": 0.6353276073932648, + "epoch": 1.305674365554605, + "grad_norm": 0.578125, + "learning_rate": 4.9126070666626815e-06, + "loss": 0.0225, + "mean_token_accuracy": 0.9922065734863281, + "num_tokens": 68330648.0, + "step": 573 + }, + { + "entropy": 0.6357185021042824, + "epoch": 1.3079555175363557, + "grad_norm": 0.62109375, + "learning_rate": 4.912112744709632e-06, + "loss": 0.0217, + "mean_token_accuracy": 0.9927729368209839, + "num_tokens": 68449557.0, + "step": 574 + }, + { + "entropy": 0.6390470042824745, + "epoch": 1.3102366695181067, + "grad_norm": 0.625, + "learning_rate": 4.911617053687957e-06, + "loss": 0.0298, + "mean_token_accuracy": 0.9899202138185501, + "num_tokens": 68568973.0, + "step": 575 + }, + { + "entropy": 0.6375419497489929, + "epoch": 1.3125178214998574, + "grad_norm": 0.69140625, + "learning_rate": 4.911119993878999e-06, + "loss": 0.0261, + "mean_token_accuracy": 0.9919667840003967, + "num_tokens": 68688315.0, + "step": 576 + }, + { + "entropy": 0.6382449418306351, + "epoch": 1.3147989734816083, + "grad_norm": 0.474609375, + "learning_rate": 4.910621565564882e-06, + "loss": 0.0225, + "mean_token_accuracy": 0.994639053940773, + "num_tokens": 68808080.0, + "step": 577 + }, + { + "entropy": 0.6335155367851257, + "epoch": 1.317080125463359, + "grad_norm": 0.6171875, + "learning_rate": 4.910121769028503e-06, + "loss": 0.0283, + "mean_token_accuracy": 0.9911932945251465, + "num_tokens": 68927369.0, + "step": 578 + }, + { + "entropy": 0.6349631696939468, + "epoch": 1.3193612774451098, + "grad_norm": 0.51953125, + "learning_rate": 4.909620604553537e-06, + "loss": 0.0199, + "mean_token_accuracy": 0.9928989186882973, + "num_tokens": 69046475.0, + "step": 579 + }, + { + "entropy": 0.6349516361951828, + "epoch": 1.3216424294268605, + "grad_norm": 0.515625, + "learning_rate": 4.909118072424436e-06, + "loss": 0.0203, + "mean_token_accuracy": 0.9946188405156136, + "num_tokens": 69165629.0, + "step": 580 + }, + { + "entropy": 0.6447226628661156, + "epoch": 1.3239235814086112, + "grad_norm": 0.5234375, + "learning_rate": 4.908614172926426e-06, + "loss": 0.0189, + "mean_token_accuracy": 0.9943891987204552, + "num_tokens": 69285129.0, + "step": 581 + }, + { + "entropy": 0.6397584080696106, + "epoch": 1.3262047333903622, + "grad_norm": 0.46484375, + "learning_rate": 4.908108906345512e-06, + "loss": 0.015, + "mean_token_accuracy": 0.994297556579113, + "num_tokens": 69405035.0, + "step": 582 + }, + { + "entropy": 0.6413049548864365, + "epoch": 1.3284858853721129, + "grad_norm": 0.66796875, + "learning_rate": 4.907602272968473e-06, + "loss": 0.019, + "mean_token_accuracy": 0.9935144558548927, + "num_tokens": 69523837.0, + "step": 583 + }, + { + "entropy": 0.6386598199605942, + "epoch": 1.3307670373538638, + "grad_norm": 0.68359375, + "learning_rate": 4.907094273082865e-06, + "loss": 0.0244, + "mean_token_accuracy": 0.9918141067028046, + "num_tokens": 69642850.0, + "step": 584 + }, + { + "entropy": 0.6370568573474884, + "epoch": 1.3330481893356145, + "grad_norm": 0.5390625, + "learning_rate": 4.906584906977018e-06, + "loss": 0.0175, + "mean_token_accuracy": 0.9939736574888229, + "num_tokens": 69762760.0, + "step": 585 + }, + { + "entropy": 0.637596108019352, + "epoch": 1.3353293413173652, + "grad_norm": 0.65625, + "learning_rate": 4.906074174940038e-06, + "loss": 0.0279, + "mean_token_accuracy": 0.9910685271024704, + "num_tokens": 69882104.0, + "step": 586 + }, + { + "entropy": 0.6364148259162903, + "epoch": 1.337610493299116, + "grad_norm": 0.54296875, + "learning_rate": 4.905562077261808e-06, + "loss": 0.0259, + "mean_token_accuracy": 0.9933517575263977, + "num_tokens": 70001009.0, + "step": 587 + }, + { + "entropy": 0.6384788900613785, + "epoch": 1.339891645280867, + "grad_norm": 0.66796875, + "learning_rate": 4.905048614232984e-06, + "loss": 0.0254, + "mean_token_accuracy": 0.9925908222794533, + "num_tokens": 70120392.0, + "step": 588 + }, + { + "entropy": 0.6350148990750313, + "epoch": 1.3421727972626176, + "grad_norm": 0.76953125, + "learning_rate": 4.904533786144998e-06, + "loss": 0.0294, + "mean_token_accuracy": 0.9933397248387337, + "num_tokens": 70240063.0, + "step": 589 + }, + { + "entropy": 0.6366907432675362, + "epoch": 1.3444539492443683, + "grad_norm": 0.423828125, + "learning_rate": 4.904017593290056e-06, + "loss": 0.0166, + "mean_token_accuracy": 0.9935855269432068, + "num_tokens": 70358975.0, + "step": 590 + }, + { + "entropy": 0.6384308785200119, + "epoch": 1.3467351012261193, + "grad_norm": 0.65234375, + "learning_rate": 4.903500035961139e-06, + "loss": 0.0274, + "mean_token_accuracy": 0.9914026483893394, + "num_tokens": 70478727.0, + "step": 591 + }, + { + "entropy": 0.6355723291635513, + "epoch": 1.34901625320787, + "grad_norm": 0.62890625, + "learning_rate": 4.902981114452005e-06, + "loss": 0.0232, + "mean_token_accuracy": 0.9936497807502747, + "num_tokens": 70598172.0, + "step": 592 + }, + { + "entropy": 0.633546881377697, + "epoch": 1.3512974051896207, + "grad_norm": 0.58203125, + "learning_rate": 4.90246082905718e-06, + "loss": 0.0252, + "mean_token_accuracy": 0.9920735284686089, + "num_tokens": 70717435.0, + "step": 593 + }, + { + "entropy": 0.6407830342650414, + "epoch": 1.3535785571713714, + "grad_norm": 0.8203125, + "learning_rate": 4.90193918007197e-06, + "loss": 0.0248, + "mean_token_accuracy": 0.9944556429982185, + "num_tokens": 70836715.0, + "step": 594 + }, + { + "entropy": 0.6371683105826378, + "epoch": 1.3558597091531224, + "grad_norm": 0.51171875, + "learning_rate": 4.901416167792452e-06, + "loss": 0.0245, + "mean_token_accuracy": 0.9919260293245316, + "num_tokens": 70956313.0, + "step": 595 + }, + { + "entropy": 0.6408360451459885, + "epoch": 1.358140861134873, + "grad_norm": 0.5546875, + "learning_rate": 4.9008917925154795e-06, + "loss": 0.0234, + "mean_token_accuracy": 0.9918401017785072, + "num_tokens": 71076080.0, + "step": 596 + }, + { + "entropy": 0.636874794960022, + "epoch": 1.360422013116624, + "grad_norm": 0.43359375, + "learning_rate": 4.900366054538675e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9958982914686203, + "num_tokens": 71194805.0, + "step": 597 + }, + { + "entropy": 0.6393484547734261, + "epoch": 1.3627031650983747, + "grad_norm": 0.58984375, + "learning_rate": 4.8998389541604405e-06, + "loss": 0.0205, + "mean_token_accuracy": 0.9940181598067284, + "num_tokens": 71314217.0, + "step": 598 + }, + { + "entropy": 0.6344383433461189, + "epoch": 1.3649843170801255, + "grad_norm": 0.5703125, + "learning_rate": 4.899310491679945e-06, + "loss": 0.0225, + "mean_token_accuracy": 0.9942355677485466, + "num_tokens": 71433674.0, + "step": 599 + }, + { + "entropy": 0.6375348791480064, + "epoch": 1.3672654690618762, + "grad_norm": 0.53515625, + "learning_rate": 4.898780667397136e-06, + "loss": 0.0257, + "mean_token_accuracy": 0.9926640763878822, + "num_tokens": 71553211.0, + "step": 600 + }, + { + "entropy": 0.6368649378418922, + "epoch": 1.369546621043627, + "grad_norm": 0.65625, + "learning_rate": 4.89824948161273e-06, + "loss": 0.0307, + "mean_token_accuracy": 0.9905905947089195, + "num_tokens": 71672490.0, + "step": 601 + }, + { + "entropy": 0.6439754143357277, + "epoch": 1.3718277730253778, + "grad_norm": 0.80859375, + "learning_rate": 4.8977169346282184e-06, + "loss": 0.0255, + "mean_token_accuracy": 0.9938629195094109, + "num_tokens": 71791509.0, + "step": 602 + }, + { + "entropy": 0.6334840282797813, + "epoch": 1.3741089250071286, + "grad_norm": 0.494140625, + "learning_rate": 4.8971830267458645e-06, + "loss": 0.0219, + "mean_token_accuracy": 0.9914547204971313, + "num_tokens": 71911340.0, + "step": 603 + }, + { + "entropy": 0.6375756561756134, + "epoch": 1.3763900769888795, + "grad_norm": 0.484375, + "learning_rate": 4.896647758268703e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.9938196837902069, + "num_tokens": 72030787.0, + "step": 604 + }, + { + "entropy": 0.6361744701862335, + "epoch": 1.3786712289706302, + "grad_norm": 0.53125, + "learning_rate": 4.8961111295005444e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9935421720147133, + "num_tokens": 72149657.0, + "step": 605 + }, + { + "entropy": 0.6384670361876488, + "epoch": 1.380952380952381, + "grad_norm": 0.578125, + "learning_rate": 4.895573140745967e-06, + "loss": 0.025, + "mean_token_accuracy": 0.9916210472583771, + "num_tokens": 72268841.0, + "step": 606 + }, + { + "entropy": 0.6357404738664627, + "epoch": 1.3832335329341316, + "grad_norm": 0.61328125, + "learning_rate": 4.895033792310323e-06, + "loss": 0.0281, + "mean_token_accuracy": 0.9907049462199211, + "num_tokens": 72388444.0, + "step": 607 + }, + { + "entropy": 0.6411366909742355, + "epoch": 1.3855146849158826, + "grad_norm": 0.51171875, + "learning_rate": 4.894493084499736e-06, + "loss": 0.0246, + "mean_token_accuracy": 0.9914375469088554, + "num_tokens": 72508079.0, + "step": 608 + }, + { + "entropy": 0.6436814814805984, + "epoch": 1.3877958368976333, + "grad_norm": 0.66015625, + "learning_rate": 4.893951017621103e-06, + "loss": 0.0291, + "mean_token_accuracy": 0.9912361949682236, + "num_tokens": 72628168.0, + "step": 609 + }, + { + "entropy": 0.6356795206665993, + "epoch": 1.390076988879384, + "grad_norm": 0.54296875, + "learning_rate": 4.893407591982088e-06, + "loss": 0.0226, + "mean_token_accuracy": 0.9928683713078499, + "num_tokens": 72746884.0, + "step": 610 + }, + { + "entropy": 0.639643982052803, + "epoch": 1.392358140861135, + "grad_norm": 0.80859375, + "learning_rate": 4.892862807891131e-06, + "loss": 0.0353, + "mean_token_accuracy": 0.9900860041379929, + "num_tokens": 72866450.0, + "step": 611 + }, + { + "entropy": 0.6365983560681343, + "epoch": 1.3946392928428857, + "grad_norm": 0.515625, + "learning_rate": 4.89231666565744e-06, + "loss": 0.0213, + "mean_token_accuracy": 0.9929895550012589, + "num_tokens": 72985902.0, + "step": 612 + }, + { + "entropy": 0.6381252780556679, + "epoch": 1.3969204448246364, + "grad_norm": 0.640625, + "learning_rate": 4.891769165590995e-06, + "loss": 0.03, + "mean_token_accuracy": 0.9907011538743973, + "num_tokens": 73105539.0, + "step": 613 + }, + { + "entropy": 0.6363380774855614, + "epoch": 1.3992015968063871, + "grad_norm": 0.5546875, + "learning_rate": 4.891220308002547e-06, + "loss": 0.0262, + "mean_token_accuracy": 0.9935641959309578, + "num_tokens": 73224388.0, + "step": 614 + }, + { + "entropy": 0.6396177113056183, + "epoch": 1.401482748788138, + "grad_norm": 0.48828125, + "learning_rate": 4.890670093203617e-06, + "loss": 0.0214, + "mean_token_accuracy": 0.9913594201207161, + "num_tokens": 73343132.0, + "step": 615 + }, + { + "entropy": 0.6406323164701462, + "epoch": 1.4037639007698888, + "grad_norm": 0.73828125, + "learning_rate": 4.890118521506494e-06, + "loss": 0.0283, + "mean_token_accuracy": 0.9925433993339539, + "num_tokens": 73461886.0, + "step": 616 + }, + { + "entropy": 0.6385371908545494, + "epoch": 1.4060450527516397, + "grad_norm": 0.462890625, + "learning_rate": 4.889565593224242e-06, + "loss": 0.0193, + "mean_token_accuracy": 0.9934752136468887, + "num_tokens": 73580794.0, + "step": 617 + }, + { + "entropy": 0.6414740979671478, + "epoch": 1.4083262047333904, + "grad_norm": 0.6796875, + "learning_rate": 4.889011308670693e-06, + "loss": 0.0199, + "mean_token_accuracy": 0.9936015233397484, + "num_tokens": 73699763.0, + "step": 618 + }, + { + "entropy": 0.6396740227937698, + "epoch": 1.4106073567151411, + "grad_norm": 0.4453125, + "learning_rate": 4.8884556681604445e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.9937741383910179, + "num_tokens": 73819638.0, + "step": 619 + }, + { + "entropy": 0.6450838297605515, + "epoch": 1.4128885086968919, + "grad_norm": 0.51171875, + "learning_rate": 4.8878986720088715e-06, + "loss": 0.0242, + "mean_token_accuracy": 0.993065632879734, + "num_tokens": 73938874.0, + "step": 620 + }, + { + "entropy": 0.643149197101593, + "epoch": 1.4151696606786426, + "grad_norm": 0.46875, + "learning_rate": 4.8873403205321115e-06, + "loss": 0.0195, + "mean_token_accuracy": 0.9951049983501434, + "num_tokens": 74058804.0, + "step": 621 + }, + { + "entropy": 0.6382714807987213, + "epoch": 1.4174508126603935, + "grad_norm": 0.58984375, + "learning_rate": 4.886780614047075e-06, + "loss": 0.0265, + "mean_token_accuracy": 0.9895059466362, + "num_tokens": 74178262.0, + "step": 622 + }, + { + "entropy": 0.642909586429596, + "epoch": 1.4197319646421442, + "grad_norm": 0.60546875, + "learning_rate": 4.886219552871441e-06, + "loss": 0.0313, + "mean_token_accuracy": 0.9903777465224266, + "num_tokens": 74298115.0, + "step": 623 + }, + { + "entropy": 0.6423443108797073, + "epoch": 1.4220131166238952, + "grad_norm": 0.5, + "learning_rate": 4.885657137323656e-06, + "loss": 0.0217, + "mean_token_accuracy": 0.9935277029871941, + "num_tokens": 74417796.0, + "step": 624 + }, + { + "entropy": 0.6386888101696968, + "epoch": 1.424294268605646, + "grad_norm": 0.625, + "learning_rate": 4.885093367722937e-06, + "loss": 0.0227, + "mean_token_accuracy": 0.9927251935005188, + "num_tokens": 74537435.0, + "step": 625 + }, + { + "entropy": 0.6416356191039085, + "epoch": 1.4265754205873966, + "grad_norm": 0.55078125, + "learning_rate": 4.884528244389269e-06, + "loss": 0.022, + "mean_token_accuracy": 0.9951774477958679, + "num_tokens": 74656324.0, + "step": 626 + }, + { + "entropy": 0.6440300196409225, + "epoch": 1.4288565725691473, + "grad_norm": 0.625, + "learning_rate": 4.883961767643404e-06, + "loss": 0.0278, + "mean_token_accuracy": 0.9908922240138054, + "num_tokens": 74775603.0, + "step": 627 + }, + { + "entropy": 0.6436743214726448, + "epoch": 1.4311377245508983, + "grad_norm": 0.51171875, + "learning_rate": 4.883393937806864e-06, + "loss": 0.021, + "mean_token_accuracy": 0.9945162758231163, + "num_tokens": 74895353.0, + "step": 628 + }, + { + "entropy": 0.6423166543245316, + "epoch": 1.433418876532649, + "grad_norm": 0.609375, + "learning_rate": 4.882824755201938e-06, + "loss": 0.0322, + "mean_token_accuracy": 0.9920506253838539, + "num_tokens": 75015295.0, + "step": 629 + }, + { + "entropy": 0.6419238671660423, + "epoch": 1.4357000285143997, + "grad_norm": 0.5234375, + "learning_rate": 4.8822542201516835e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9949490427970886, + "num_tokens": 75134949.0, + "step": 630 + }, + { + "entropy": 0.6373176276683807, + "epoch": 1.4379811804961506, + "grad_norm": 0.63671875, + "learning_rate": 4.881682332979925e-06, + "loss": 0.0298, + "mean_token_accuracy": 0.9903533756732941, + "num_tokens": 75255161.0, + "step": 631 + }, + { + "entropy": 0.6398174986243248, + "epoch": 1.4402623324779014, + "grad_norm": 0.53515625, + "learning_rate": 4.881109094011254e-06, + "loss": 0.0247, + "mean_token_accuracy": 0.9936423525214195, + "num_tokens": 75375247.0, + "step": 632 + }, + { + "entropy": 0.6463412344455719, + "epoch": 1.442543484459652, + "grad_norm": 0.51171875, + "learning_rate": 4.88053450357103e-06, + "loss": 0.0279, + "mean_token_accuracy": 0.9892997294664383, + "num_tokens": 75494951.0, + "step": 633 + }, + { + "entropy": 0.6400787681341171, + "epoch": 1.4448246364414028, + "grad_norm": 0.474609375, + "learning_rate": 4.87995856198538e-06, + "loss": 0.0205, + "mean_token_accuracy": 0.992844358086586, + "num_tokens": 75613778.0, + "step": 634 + }, + { + "entropy": 0.638479970395565, + "epoch": 1.4471057884231537, + "grad_norm": 0.51171875, + "learning_rate": 4.879381269581197e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.9938478916883469, + "num_tokens": 75732735.0, + "step": 635 + }, + { + "entropy": 0.6385963708162308, + "epoch": 1.4493869404049045, + "grad_norm": 0.53515625, + "learning_rate": 4.878802626686141e-06, + "loss": 0.019, + "mean_token_accuracy": 0.9936910644173622, + "num_tokens": 75852075.0, + "step": 636 + }, + { + "entropy": 0.6456236019730568, + "epoch": 1.4516680923866554, + "grad_norm": 0.58203125, + "learning_rate": 4.8782226336286395e-06, + "loss": 0.0249, + "mean_token_accuracy": 0.9926618710160255, + "num_tokens": 75971451.0, + "step": 637 + }, + { + "entropy": 0.6422450467944145, + "epoch": 1.4539492443684061, + "grad_norm": 0.439453125, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9952587783336639, + "num_tokens": 76090936.0, + "step": 638 + }, + { + "entropy": 0.6475710645318031, + "epoch": 1.4562303963501568, + "grad_norm": 0.498046875, + "learning_rate": 4.877058598343835e-06, + "loss": 0.0261, + "mean_token_accuracy": 0.9915279448032379, + "num_tokens": 76211121.0, + "step": 639 + }, + { + "entropy": 0.6412482485175133, + "epoch": 1.4585115483319075, + "grad_norm": 0.5390625, + "learning_rate": 4.876474556777216e-06, + "loss": 0.0205, + "mean_token_accuracy": 0.9929085969924927, + "num_tokens": 76330901.0, + "step": 640 + }, + { + "entropy": 0.6429166868329048, + "epoch": 1.4607927003136583, + "grad_norm": 0.53125, + "learning_rate": 4.8758891663695165e-06, + "loss": 0.0203, + "mean_token_accuracy": 0.9940745085477829, + "num_tokens": 76450204.0, + "step": 641 + }, + { + "entropy": 0.6402018293738365, + "epoch": 1.4630738522954092, + "grad_norm": 0.65234375, + "learning_rate": 4.875302427452996e-06, + "loss": 0.0268, + "mean_token_accuracy": 0.9928427115082741, + "num_tokens": 76569108.0, + "step": 642 + }, + { + "entropy": 0.6428958550095558, + "epoch": 1.46535500427716, + "grad_norm": 0.5, + "learning_rate": 4.874714340360674e-06, + "loss": 0.0205, + "mean_token_accuracy": 0.9931109771132469, + "num_tokens": 76689076.0, + "step": 643 + }, + { + "entropy": 0.645475760102272, + "epoch": 1.4676361562589109, + "grad_norm": 0.59375, + "learning_rate": 4.874124905426339e-06, + "loss": 0.0193, + "mean_token_accuracy": 0.9935041591525078, + "num_tokens": 76808666.0, + "step": 644 + }, + { + "entropy": 0.6363644152879715, + "epoch": 1.4699173082406616, + "grad_norm": 0.55859375, + "learning_rate": 4.873534122984541e-06, + "loss": 0.021, + "mean_token_accuracy": 0.9939695075154305, + "num_tokens": 76927506.0, + "step": 645 + }, + { + "entropy": 0.6382194384932518, + "epoch": 1.4721984602224123, + "grad_norm": 0.5, + "learning_rate": 4.872941993370598e-06, + "loss": 0.0195, + "mean_token_accuracy": 0.9941019862890244, + "num_tokens": 77046924.0, + "step": 646 + }, + { + "entropy": 0.6391751691699028, + "epoch": 1.474479612204163, + "grad_norm": 0.51953125, + "learning_rate": 4.872348516920591e-06, + "loss": 0.032, + "mean_token_accuracy": 0.9920966997742653, + "num_tokens": 77165907.0, + "step": 647 + }, + { + "entropy": 0.6427943706512451, + "epoch": 1.476760764185914, + "grad_norm": 0.474609375, + "learning_rate": 4.8717536939713665e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.9959972947835922, + "num_tokens": 77285621.0, + "step": 648 + }, + { + "entropy": 0.6401243954896927, + "epoch": 1.4790419161676647, + "grad_norm": 0.5234375, + "learning_rate": 4.871157524860533e-06, + "loss": 0.0196, + "mean_token_accuracy": 0.9928665906190872, + "num_tokens": 77405182.0, + "step": 649 + }, + { + "entropy": 0.6440375074744225, + "epoch": 1.4813230681494154, + "grad_norm": 0.578125, + "learning_rate": 4.870560009926465e-06, + "loss": 0.021, + "mean_token_accuracy": 0.9946481287479401, + "num_tokens": 77525118.0, + "step": 650 + }, + { + "entropy": 0.6403699517250061, + "epoch": 1.4836042201311663, + "grad_norm": 0.671875, + "learning_rate": 4.869961149508301e-06, + "loss": 0.0327, + "mean_token_accuracy": 0.9908455833792686, + "num_tokens": 77643999.0, + "step": 651 + }, + { + "entropy": 0.6359145045280457, + "epoch": 1.485885372112917, + "grad_norm": 0.59375, + "learning_rate": 4.869360943945943e-06, + "loss": 0.0192, + "mean_token_accuracy": 0.9931683987379074, + "num_tokens": 77763175.0, + "step": 652 + }, + { + "entropy": 0.637670211493969, + "epoch": 1.4881665240946678, + "grad_norm": 0.47265625, + "learning_rate": 4.868759393580054e-06, + "loss": 0.0228, + "mean_token_accuracy": 0.9928962513804436, + "num_tokens": 77882253.0, + "step": 653 + }, + { + "entropy": 0.6439796760678291, + "epoch": 1.4904476760764185, + "grad_norm": 0.6015625, + "learning_rate": 4.868156498752066e-06, + "loss": 0.0258, + "mean_token_accuracy": 0.9923202320933342, + "num_tokens": 78001673.0, + "step": 654 + }, + { + "entropy": 0.639705628156662, + "epoch": 1.4927288280581694, + "grad_norm": 0.51953125, + "learning_rate": 4.8675522598041675e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.993877187371254, + "num_tokens": 78120612.0, + "step": 655 + }, + { + "entropy": 0.6396002024412155, + "epoch": 1.4950099800399201, + "grad_norm": 0.515625, + "learning_rate": 4.866946677079314e-06, + "loss": 0.0241, + "mean_token_accuracy": 0.992053858935833, + "num_tokens": 78240226.0, + "step": 656 + }, + { + "entropy": 0.6397572904825211, + "epoch": 1.497291132021671, + "grad_norm": 0.5390625, + "learning_rate": 4.866339750921222e-06, + "loss": 0.0291, + "mean_token_accuracy": 0.9919948801398277, + "num_tokens": 78359379.0, + "step": 657 + }, + { + "entropy": 0.6388900429010391, + "epoch": 1.4995722840034218, + "grad_norm": 0.68359375, + "learning_rate": 4.86573148167437e-06, + "loss": 0.0286, + "mean_token_accuracy": 0.9919358566403389, + "num_tokens": 78479068.0, + "step": 658 + }, + { + "entropy": 0.6387871578335762, + "epoch": 1.5018534359851725, + "grad_norm": 0.58984375, + "learning_rate": 4.865121869684003e-06, + "loss": 0.0251, + "mean_token_accuracy": 0.9913514852523804, + "num_tokens": 78599084.0, + "step": 659 + }, + { + "entropy": 0.6343478634953499, + "epoch": 1.5041345879669232, + "grad_norm": 0.5234375, + "learning_rate": 4.864510915296122e-06, + "loss": 0.0188, + "mean_token_accuracy": 0.9936830848455429, + "num_tokens": 78719380.0, + "step": 660 + }, + { + "epoch": 1.5041345879669232, + "eval_entropy": 0.6381348969818521, + "eval_loss": 0.02479475550353527, + "eval_mean_token_accuracy": 0.9924331638296294, + "eval_num_tokens": 78719380.0, + "eval_runtime": 177.4523, + "eval_samples_per_second": 47.252, + "eval_steps_per_second": 1.482, + "step": 660 + }, + { + "entropy": 0.6377394422888756, + "epoch": 1.506415739948674, + "grad_norm": 0.61328125, + "learning_rate": 4.8638986188574955e-06, + "loss": 0.0259, + "mean_token_accuracy": 0.9935324192047119, + "num_tokens": 78838538.0, + "step": 661 + }, + { + "entropy": 0.6379623562097549, + "epoch": 1.5086968919304249, + "grad_norm": 0.79296875, + "learning_rate": 4.863284980715649e-06, + "loss": 0.0287, + "mean_token_accuracy": 0.9912637248635292, + "num_tokens": 78957648.0, + "step": 662 + }, + { + "entropy": 0.64057657122612, + "epoch": 1.5109780439121756, + "grad_norm": 0.53125, + "learning_rate": 4.8626700012188724e-06, + "loss": 0.0198, + "mean_token_accuracy": 0.992715984582901, + "num_tokens": 79076931.0, + "step": 663 + }, + { + "entropy": 0.6413486376404762, + "epoch": 1.5132591958939265, + "grad_norm": 0.51953125, + "learning_rate": 4.8620536807162164e-06, + "loss": 0.0204, + "mean_token_accuracy": 0.9940462857484818, + "num_tokens": 79195928.0, + "step": 664 + }, + { + "entropy": 0.6394598335027695, + "epoch": 1.5155403478756773, + "grad_norm": 0.74609375, + "learning_rate": 4.861436019557492e-06, + "loss": 0.0317, + "mean_token_accuracy": 0.9926827773451805, + "num_tokens": 79314894.0, + "step": 665 + }, + { + "entropy": 0.6347313150763512, + "epoch": 1.517821499857428, + "grad_norm": 0.5, + "learning_rate": 4.8608170180932725e-06, + "loss": 0.0142, + "mean_token_accuracy": 0.994302861392498, + "num_tokens": 79434066.0, + "step": 666 + }, + { + "entropy": 0.6383668258786201, + "epoch": 1.5201026518391787, + "grad_norm": 0.55078125, + "learning_rate": 4.860196676674891e-06, + "loss": 0.0266, + "mean_token_accuracy": 0.9923002645373344, + "num_tokens": 79553120.0, + "step": 667 + }, + { + "entropy": 0.636829674243927, + "epoch": 1.5223838038209294, + "grad_norm": 0.48828125, + "learning_rate": 4.8595749956544414e-06, + "loss": 0.0175, + "mean_token_accuracy": 0.9937475174665451, + "num_tokens": 79672378.0, + "step": 668 + }, + { + "entropy": 0.6420734003186226, + "epoch": 1.5246649558026804, + "grad_norm": 0.578125, + "learning_rate": 4.858951975384777e-06, + "loss": 0.023, + "mean_token_accuracy": 0.9932120516896248, + "num_tokens": 79791544.0, + "step": 669 + }, + { + "entropy": 0.638972744345665, + "epoch": 1.5269461077844313, + "grad_norm": 0.50390625, + "learning_rate": 4.858327616219513e-06, + "loss": 0.0276, + "mean_token_accuracy": 0.9924099519848824, + "num_tokens": 79910886.0, + "step": 670 + }, + { + "entropy": 0.6402309462428093, + "epoch": 1.529227259766182, + "grad_norm": 0.53125, + "learning_rate": 4.857701918513023e-06, + "loss": 0.0198, + "mean_token_accuracy": 0.994101881980896, + "num_tokens": 80029732.0, + "step": 671 + }, + { + "entropy": 0.6447106301784515, + "epoch": 1.5315084117479327, + "grad_norm": 0.52734375, + "learning_rate": 4.857074882620442e-06, + "loss": 0.0213, + "mean_token_accuracy": 0.9929288849234581, + "num_tokens": 80149696.0, + "step": 672 + }, + { + "entropy": 0.6395364180207253, + "epoch": 1.5337895637296834, + "grad_norm": 0.703125, + "learning_rate": 4.856446508897662e-06, + "loss": 0.0311, + "mean_token_accuracy": 0.989722453057766, + "num_tokens": 80269927.0, + "step": 673 + }, + { + "entropy": 0.6386376321315765, + "epoch": 1.5360707157114342, + "grad_norm": 0.3984375, + "learning_rate": 4.8558167977013365e-06, + "loss": 0.0192, + "mean_token_accuracy": 0.9947937279939651, + "num_tokens": 80390078.0, + "step": 674 + }, + { + "entropy": 0.6416147872805595, + "epoch": 1.538351867693185, + "grad_norm": 0.44921875, + "learning_rate": 4.8551857493888775e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.9943087846040726, + "num_tokens": 80508885.0, + "step": 675 + }, + { + "entropy": 0.6406270563602448, + "epoch": 1.5406330196749358, + "grad_norm": 0.4375, + "learning_rate": 4.854553364318456e-06, + "loss": 0.0206, + "mean_token_accuracy": 0.9929924458265305, + "num_tokens": 80628593.0, + "step": 676 + }, + { + "entropy": 0.6412413567304611, + "epoch": 1.5429141716566868, + "grad_norm": 0.59375, + "learning_rate": 4.8539196428490016e-06, + "loss": 0.0227, + "mean_token_accuracy": 0.991998627781868, + "num_tokens": 80747520.0, + "step": 677 + }, + { + "entropy": 0.6376785486936569, + "epoch": 1.5451953236384375, + "grad_norm": 0.466796875, + "learning_rate": 4.8532845853402015e-06, + "loss": 0.0188, + "mean_token_accuracy": 0.9945663809776306, + "num_tokens": 80866565.0, + "step": 678 + }, + { + "entropy": 0.6402008607983589, + "epoch": 1.5474764756201882, + "grad_norm": 0.58984375, + "learning_rate": 4.8526481921525035e-06, + "loss": 0.0315, + "mean_token_accuracy": 0.9918803349137306, + "num_tokens": 80985934.0, + "step": 679 + }, + { + "entropy": 0.6432082206010818, + "epoch": 1.549757627601939, + "grad_norm": 0.4765625, + "learning_rate": 4.85201046364711e-06, + "loss": 0.0165, + "mean_token_accuracy": 0.9942620396614075, + "num_tokens": 81105517.0, + "step": 680 + }, + { + "entropy": 0.6411319896578789, + "epoch": 1.5520387795836896, + "grad_norm": 0.671875, + "learning_rate": 4.851371400185986e-06, + "loss": 0.0211, + "mean_token_accuracy": 0.99457236379385, + "num_tokens": 81224786.0, + "step": 681 + }, + { + "entropy": 0.6366729065775871, + "epoch": 1.5543199315654406, + "grad_norm": 0.5546875, + "learning_rate": 4.85073100213185e-06, + "loss": 0.0248, + "mean_token_accuracy": 0.992289200425148, + "num_tokens": 81344004.0, + "step": 682 + }, + { + "entropy": 0.6411734521389008, + "epoch": 1.5566010835471913, + "grad_norm": 0.58203125, + "learning_rate": 4.8500892698481784e-06, + "loss": 0.0274, + "mean_token_accuracy": 0.9901652634143829, + "num_tokens": 81463361.0, + "step": 683 + }, + { + "entropy": 0.6400188356637955, + "epoch": 1.5588822355289422, + "grad_norm": 0.703125, + "learning_rate": 4.849446203699209e-06, + "loss": 0.0267, + "mean_token_accuracy": 0.9924582466483116, + "num_tokens": 81582768.0, + "step": 684 + }, + { + "entropy": 0.6370311826467514, + "epoch": 1.561163387510693, + "grad_norm": 0.7265625, + "learning_rate": 4.848801804049932e-06, + "loss": 0.0337, + "mean_token_accuracy": 0.9888993427157402, + "num_tokens": 81702240.0, + "step": 685 + }, + { + "entropy": 0.6475241258740425, + "epoch": 1.5634445394924437, + "grad_norm": 0.60546875, + "learning_rate": 4.848156071266095e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.9915978610515594, + "num_tokens": 81821246.0, + "step": 686 + }, + { + "entropy": 0.6375545486807823, + "epoch": 1.5657256914741944, + "grad_norm": 0.61328125, + "learning_rate": 4.847509005714207e-06, + "loss": 0.0276, + "mean_token_accuracy": 0.9920943826436996, + "num_tokens": 81940284.0, + "step": 687 + }, + { + "entropy": 0.6441953480243683, + "epoch": 1.568006843455945, + "grad_norm": 0.58203125, + "learning_rate": 4.846860607761527e-06, + "loss": 0.024, + "mean_token_accuracy": 0.9923097491264343, + "num_tokens": 82059464.0, + "step": 688 + }, + { + "entropy": 0.6410790383815765, + "epoch": 1.570287995437696, + "grad_norm": 0.5859375, + "learning_rate": 4.8462108777760734e-06, + "loss": 0.0268, + "mean_token_accuracy": 0.9932421669363976, + "num_tokens": 82179782.0, + "step": 689 + }, + { + "entropy": 0.6421094089746475, + "epoch": 1.572569147419447, + "grad_norm": 0.6328125, + "learning_rate": 4.845559816126622e-06, + "loss": 0.0281, + "mean_token_accuracy": 0.9899401143193245, + "num_tokens": 82299142.0, + "step": 690 + }, + { + "entropy": 0.6457755714654922, + "epoch": 1.5748502994011977, + "grad_norm": 0.5390625, + "learning_rate": 4.844907423182699e-06, + "loss": 0.0223, + "mean_token_accuracy": 0.9933750182390213, + "num_tokens": 82418173.0, + "step": 691 + }, + { + "entropy": 0.6443367749452591, + "epoch": 1.5771314513829484, + "grad_norm": 0.6484375, + "learning_rate": 4.844253699314596e-06, + "loss": 0.0284, + "mean_token_accuracy": 0.9917292520403862, + "num_tokens": 82538417.0, + "step": 692 + }, + { + "entropy": 0.6392989680171013, + "epoch": 1.5794126033646991, + "grad_norm": 0.4140625, + "learning_rate": 4.843598644893349e-06, + "loss": 0.0157, + "mean_token_accuracy": 0.9962929338216782, + "num_tokens": 82657588.0, + "step": 693 + }, + { + "entropy": 0.6474875435233116, + "epoch": 1.5816937553464498, + "grad_norm": 0.60546875, + "learning_rate": 4.842942260290757e-06, + "loss": 0.0207, + "mean_token_accuracy": 0.9924154877662659, + "num_tokens": 82777151.0, + "step": 694 + }, + { + "entropy": 0.6456004232168198, + "epoch": 1.5839749073282008, + "grad_norm": 0.67578125, + "learning_rate": 4.84228454587937e-06, + "loss": 0.0349, + "mean_token_accuracy": 0.988041840493679, + "num_tokens": 82896316.0, + "step": 695 + }, + { + "entropy": 0.6448253691196442, + "epoch": 1.5862560593099515, + "grad_norm": 0.478515625, + "learning_rate": 4.841625502032495e-06, + "loss": 0.02, + "mean_token_accuracy": 0.9936308860778809, + "num_tokens": 83015300.0, + "step": 696 + }, + { + "entropy": 0.6386808678507805, + "epoch": 1.5885372112917024, + "grad_norm": 0.51953125, + "learning_rate": 4.84096512912419e-06, + "loss": 0.0245, + "mean_token_accuracy": 0.99282156676054, + "num_tokens": 83134160.0, + "step": 697 + }, + { + "entropy": 0.6416999101638794, + "epoch": 1.5908183632734532, + "grad_norm": 0.6640625, + "learning_rate": 4.8403034275292735e-06, + "loss": 0.0258, + "mean_token_accuracy": 0.9938384965062141, + "num_tokens": 83253202.0, + "step": 698 + }, + { + "entropy": 0.6429546475410461, + "epoch": 1.5930995152552039, + "grad_norm": 0.578125, + "learning_rate": 4.839640397623312e-06, + "loss": 0.022, + "mean_token_accuracy": 0.9922254383563995, + "num_tokens": 83372245.0, + "step": 699 + }, + { + "entropy": 0.6452757865190506, + "epoch": 1.5953806672369546, + "grad_norm": 0.64453125, + "learning_rate": 4.83897603978263e-06, + "loss": 0.0256, + "mean_token_accuracy": 0.9925415217876434, + "num_tokens": 83491590.0, + "step": 700 + }, + { + "entropy": 0.6501963883638382, + "epoch": 1.5976618192187053, + "grad_norm": 0.75390625, + "learning_rate": 4.838310354384304e-06, + "loss": 0.0359, + "mean_token_accuracy": 0.9888075441122055, + "num_tokens": 83610687.0, + "step": 701 + }, + { + "entropy": 0.6465002149343491, + "epoch": 1.5999429712004563, + "grad_norm": 0.58984375, + "learning_rate": 4.8376433418061615e-06, + "loss": 0.0195, + "mean_token_accuracy": 0.9923240467905998, + "num_tokens": 83729913.0, + "step": 702 + }, + { + "entropy": 0.649569533765316, + "epoch": 1.602224123182207, + "grad_norm": 0.470703125, + "learning_rate": 4.8369750024267904e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9955791011452675, + "num_tokens": 83849772.0, + "step": 703 + }, + { + "entropy": 0.6420313119888306, + "epoch": 1.604505275163958, + "grad_norm": 0.62890625, + "learning_rate": 4.836305336625523e-06, + "loss": 0.0249, + "mean_token_accuracy": 0.9915095865726471, + "num_tokens": 83968827.0, + "step": 704 + }, + { + "entropy": 0.6453241929411888, + "epoch": 1.6067864271457086, + "grad_norm": 0.62890625, + "learning_rate": 4.835634344782453e-06, + "loss": 0.0273, + "mean_token_accuracy": 0.990353912115097, + "num_tokens": 84088311.0, + "step": 705 + }, + { + "entropy": 0.6403872668743134, + "epoch": 1.6090675791274593, + "grad_norm": 0.62109375, + "learning_rate": 4.834962027278418e-06, + "loss": 0.023, + "mean_token_accuracy": 0.9910108894109726, + "num_tokens": 84207805.0, + "step": 706 + }, + { + "entropy": 0.6525982320308685, + "epoch": 1.61134873110921, + "grad_norm": 0.640625, + "learning_rate": 4.834288384495015e-06, + "loss": 0.0275, + "mean_token_accuracy": 0.9921730011701584, + "num_tokens": 84326865.0, + "step": 707 + }, + { + "entropy": 0.6430354192852974, + "epoch": 1.6136298830909608, + "grad_norm": 0.55859375, + "learning_rate": 4.833613416814591e-06, + "loss": 0.0321, + "mean_token_accuracy": 0.9916104897856712, + "num_tokens": 84446329.0, + "step": 708 + }, + { + "entropy": 0.6450579389929771, + "epoch": 1.6159110350727117, + "grad_norm": 0.490234375, + "learning_rate": 4.832937124620243e-06, + "loss": 0.022, + "mean_token_accuracy": 0.9937263205647469, + "num_tokens": 84566331.0, + "step": 709 + }, + { + "entropy": 0.639849528670311, + "epoch": 1.6181921870544627, + "grad_norm": 0.49609375, + "learning_rate": 4.832259508295822e-06, + "loss": 0.0193, + "mean_token_accuracy": 0.9934990629553795, + "num_tokens": 84685657.0, + "step": 710 + }, + { + "entropy": 0.6464897245168686, + "epoch": 1.6204733390362134, + "grad_norm": 0.54296875, + "learning_rate": 4.831580568225931e-06, + "loss": 0.0209, + "mean_token_accuracy": 0.9930057525634766, + "num_tokens": 84805617.0, + "step": 711 + }, + { + "entropy": 0.647191122174263, + "epoch": 1.622754491017964, + "grad_norm": 0.51171875, + "learning_rate": 4.830900304795921e-06, + "loss": 0.0148, + "mean_token_accuracy": 0.9951288104057312, + "num_tokens": 84925461.0, + "step": 712 + }, + { + "entropy": 0.6494113206863403, + "epoch": 1.6250356429997148, + "grad_norm": 0.71875, + "learning_rate": 4.8302187183918996e-06, + "loss": 0.0266, + "mean_token_accuracy": 0.9903081133961678, + "num_tokens": 85044929.0, + "step": 713 + }, + { + "entropy": 0.6448704525828362, + "epoch": 1.6273167949814655, + "grad_norm": 0.6171875, + "learning_rate": 4.8295358094007184e-06, + "loss": 0.0248, + "mean_token_accuracy": 0.9902205616235733, + "num_tokens": 85163146.0, + "step": 714 + }, + { + "entropy": 0.6447743400931358, + "epoch": 1.6295979469632165, + "grad_norm": 0.48046875, + "learning_rate": 4.828851578209986e-06, + "loss": 0.0237, + "mean_token_accuracy": 0.9925466477870941, + "num_tokens": 85283039.0, + "step": 715 + }, + { + "entropy": 0.6432678773999214, + "epoch": 1.6318790989449672, + "grad_norm": 0.6015625, + "learning_rate": 4.828166025208059e-06, + "loss": 0.0211, + "mean_token_accuracy": 0.9925786927342415, + "num_tokens": 85402503.0, + "step": 716 + }, + { + "entropy": 0.648583211004734, + "epoch": 1.6341602509267181, + "grad_norm": 1.25, + "learning_rate": 4.8274791507840416e-06, + "loss": 0.0299, + "mean_token_accuracy": 0.9898280948400497, + "num_tokens": 85521862.0, + "step": 717 + }, + { + "entropy": 0.6471402868628502, + "epoch": 1.6364414029084688, + "grad_norm": 0.5859375, + "learning_rate": 4.826790955327793e-06, + "loss": 0.0293, + "mean_token_accuracy": 0.9910049140453339, + "num_tokens": 85642104.0, + "step": 718 + }, + { + "entropy": 0.6465304270386696, + "epoch": 1.6387225548902196, + "grad_norm": 0.546875, + "learning_rate": 4.826101439229918e-06, + "loss": 0.0199, + "mean_token_accuracy": 0.9924459382891655, + "num_tokens": 85760970.0, + "step": 719 + }, + { + "entropy": 0.6472652330994606, + "epoch": 1.6410037068719703, + "grad_norm": 0.703125, + "learning_rate": 4.825410602881774e-06, + "loss": 0.0251, + "mean_token_accuracy": 0.9920404553413391, + "num_tokens": 85880131.0, + "step": 720 + }, + { + "entropy": 0.6468649879097939, + "epoch": 1.643284858853721, + "grad_norm": 0.408203125, + "learning_rate": 4.824718446675465e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.9937390685081482, + "num_tokens": 85999162.0, + "step": 721 + }, + { + "entropy": 0.642945408821106, + "epoch": 1.645566010835472, + "grad_norm": 0.494140625, + "learning_rate": 4.8240249710038455e-06, + "loss": 0.0221, + "mean_token_accuracy": 0.9925430938601494, + "num_tokens": 86118494.0, + "step": 722 + }, + { + "entropy": 0.642628438770771, + "epoch": 1.6478471628172227, + "grad_norm": 0.6015625, + "learning_rate": 4.82333017626052e-06, + "loss": 0.0189, + "mean_token_accuracy": 0.9940452724695206, + "num_tokens": 86237327.0, + "step": 723 + }, + { + "entropy": 0.6475147977471352, + "epoch": 1.6501283147989736, + "grad_norm": 0.59765625, + "learning_rate": 4.82263406283984e-06, + "loss": 0.0277, + "mean_token_accuracy": 0.9918419197201729, + "num_tokens": 86356370.0, + "step": 724 + }, + { + "entropy": 0.6456489786505699, + "epoch": 1.6524094667807243, + "grad_norm": 0.5546875, + "learning_rate": 4.821936631136907e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9956123381853104, + "num_tokens": 86475729.0, + "step": 725 + }, + { + "entropy": 0.6449040770530701, + "epoch": 1.654690618762475, + "grad_norm": 0.62890625, + "learning_rate": 4.821237881547567e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.9915178045630455, + "num_tokens": 86596221.0, + "step": 726 + }, + { + "entropy": 0.6486258506774902, + "epoch": 1.6569717707442257, + "grad_norm": 0.6640625, + "learning_rate": 4.82053781446842e-06, + "loss": 0.0219, + "mean_token_accuracy": 0.9919554218649864, + "num_tokens": 86715440.0, + "step": 727 + }, + { + "entropy": 0.6412919238209724, + "epoch": 1.6592529227259765, + "grad_norm": 0.54296875, + "learning_rate": 4.819836430296809e-06, + "loss": 0.0159, + "mean_token_accuracy": 0.9952799156308174, + "num_tokens": 86834476.0, + "step": 728 + }, + { + "entropy": 0.6438492015004158, + "epoch": 1.6615340747077274, + "grad_norm": 0.62109375, + "learning_rate": 4.819133729430826e-06, + "loss": 0.0307, + "mean_token_accuracy": 0.9909729808568954, + "num_tokens": 86953834.0, + "step": 729 + }, + { + "entropy": 0.645331121981144, + "epoch": 1.6638152266894783, + "grad_norm": 0.462890625, + "learning_rate": 4.818429712269312e-06, + "loss": 0.0184, + "mean_token_accuracy": 0.9940946102142334, + "num_tokens": 87073880.0, + "step": 730 + }, + { + "entropy": 0.6398236230015755, + "epoch": 1.666096378671229, + "grad_norm": 0.51953125, + "learning_rate": 4.8177243792118515e-06, + "loss": 0.0234, + "mean_token_accuracy": 0.9916470944881439, + "num_tokens": 87193111.0, + "step": 731 + }, + { + "entropy": 0.644985631108284, + "epoch": 1.6683775306529798, + "grad_norm": 0.7890625, + "learning_rate": 4.8170177306587785e-06, + "loss": 0.0353, + "mean_token_accuracy": 0.9895706847310066, + "num_tokens": 87313899.0, + "step": 732 + }, + { + "entropy": 0.6445146352052689, + "epoch": 1.6706586826347305, + "grad_norm": 0.52734375, + "learning_rate": 4.8163097670111735e-06, + "loss": 0.0285, + "mean_token_accuracy": 0.9920459166169167, + "num_tokens": 87434197.0, + "step": 733 + }, + { + "entropy": 0.6375463679432869, + "epoch": 1.6729398346164812, + "grad_norm": 0.58984375, + "learning_rate": 4.815600488670863e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.9939498156309128, + "num_tokens": 87553502.0, + "step": 734 + }, + { + "entropy": 0.6356865987181664, + "epoch": 1.6752209865982322, + "grad_norm": 0.5703125, + "learning_rate": 4.81488989604042e-06, + "loss": 0.0221, + "mean_token_accuracy": 0.99378552287817, + "num_tokens": 87672419.0, + "step": 735 + }, + { + "entropy": 0.6418371573090553, + "epoch": 1.6775021385799829, + "grad_norm": 0.5703125, + "learning_rate": 4.814177989523162e-06, + "loss": 0.0256, + "mean_token_accuracy": 0.9928050935268402, + "num_tokens": 87792599.0, + "step": 736 + }, + { + "entropy": 0.6433526650071144, + "epoch": 1.6797832905617338, + "grad_norm": 0.484375, + "learning_rate": 4.813464769523154e-06, + "loss": 0.0182, + "mean_token_accuracy": 0.9939191564917564, + "num_tokens": 87912312.0, + "step": 737 + }, + { + "entropy": 0.641547754406929, + "epoch": 1.6820644425434845, + "grad_norm": 0.671875, + "learning_rate": 4.812750236445206e-06, + "loss": 0.0276, + "mean_token_accuracy": 0.9914326816797256, + "num_tokens": 88032196.0, + "step": 738 + }, + { + "entropy": 0.6388159841299057, + "epoch": 1.6843455945252352, + "grad_norm": 0.62890625, + "learning_rate": 4.812034390694874e-06, + "loss": 0.0255, + "mean_token_accuracy": 0.9934413284063339, + "num_tokens": 88151955.0, + "step": 739 + }, + { + "entropy": 0.6444572880864143, + "epoch": 1.686626746506986, + "grad_norm": 0.45703125, + "learning_rate": 4.811317232678456e-06, + "loss": 0.0155, + "mean_token_accuracy": 0.9960844963788986, + "num_tokens": 88271341.0, + "step": 740 + }, + { + "entropy": 0.6428451836109161, + "epoch": 1.6889078984887367, + "grad_norm": 0.48046875, + "learning_rate": 4.810598762803e-06, + "loss": 0.0234, + "mean_token_accuracy": 0.9925880208611488, + "num_tokens": 88391210.0, + "step": 741 + }, + { + "entropy": 0.6456586122512817, + "epoch": 1.6911890504704876, + "grad_norm": 0.546875, + "learning_rate": 4.809878981476293e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.9933931156992912, + "num_tokens": 88510162.0, + "step": 742 + }, + { + "entropy": 0.6434487029910088, + "epoch": 1.6934702024522383, + "grad_norm": 0.63671875, + "learning_rate": 4.80915788910687e-06, + "loss": 0.025, + "mean_token_accuracy": 0.9920298904180527, + "num_tokens": 88628899.0, + "step": 743 + }, + { + "entropy": 0.6430339813232422, + "epoch": 1.6957513544339893, + "grad_norm": 0.5546875, + "learning_rate": 4.80843548610401e-06, + "loss": 0.0215, + "mean_token_accuracy": 0.9934555143117905, + "num_tokens": 88748204.0, + "step": 744 + }, + { + "entropy": 0.6433944404125214, + "epoch": 1.69803250641574, + "grad_norm": 0.53125, + "learning_rate": 4.807711772877733e-06, + "loss": 0.0144, + "mean_token_accuracy": 0.995145857334137, + "num_tokens": 88867007.0, + "step": 745 + }, + { + "entropy": 0.6437710970640182, + "epoch": 1.7003136583974907, + "grad_norm": 0.609375, + "learning_rate": 4.8069867498388066e-06, + "loss": 0.026, + "mean_token_accuracy": 0.991196408867836, + "num_tokens": 88985711.0, + "step": 746 + }, + { + "entropy": 0.6424547433853149, + "epoch": 1.7025948103792414, + "grad_norm": 0.5546875, + "learning_rate": 4.806260417398739e-06, + "loss": 0.0231, + "mean_token_accuracy": 0.9943418875336647, + "num_tokens": 89104558.0, + "step": 747 + }, + { + "entropy": 0.6448725759983063, + "epoch": 1.7048759623609921, + "grad_norm": 0.4609375, + "learning_rate": 4.805532775969783e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9950062036514282, + "num_tokens": 89223715.0, + "step": 748 + }, + { + "entropy": 0.6417075172066689, + "epoch": 1.707157114342743, + "grad_norm": 0.54296875, + "learning_rate": 4.804803825964933e-06, + "loss": 0.0269, + "mean_token_accuracy": 0.9934018403291702, + "num_tokens": 89342858.0, + "step": 749 + }, + { + "entropy": 0.6375532150268555, + "epoch": 1.709438266324494, + "grad_norm": 0.6015625, + "learning_rate": 4.804073567797928e-06, + "loss": 0.0272, + "mean_token_accuracy": 0.9930486381053925, + "num_tokens": 89462361.0, + "step": 750 + }, + { + "entropy": 0.6451791375875473, + "epoch": 1.7117194183062447, + "grad_norm": 0.5703125, + "learning_rate": 4.803342001883247e-06, + "loss": 0.0201, + "mean_token_accuracy": 0.9944713041186333, + "num_tokens": 89581946.0, + "step": 751 + }, + { + "entropy": 0.6429461389780045, + "epoch": 1.7140005702879955, + "grad_norm": 0.51953125, + "learning_rate": 4.802609128636113e-06, + "loss": 0.0205, + "mean_token_accuracy": 0.9939661398530006, + "num_tokens": 89701045.0, + "step": 752 + }, + { + "entropy": 0.6452252045273781, + "epoch": 1.7162817222697462, + "grad_norm": 0.63671875, + "learning_rate": 4.801874948472492e-06, + "loss": 0.0333, + "mean_token_accuracy": 0.9915708974003792, + "num_tokens": 89820257.0, + "step": 753 + }, + { + "entropy": 0.6444855108857155, + "epoch": 1.718562874251497, + "grad_norm": 0.59375, + "learning_rate": 4.801139461809089e-06, + "loss": 0.0264, + "mean_token_accuracy": 0.9913069009780884, + "num_tokens": 89940014.0, + "step": 754 + }, + { + "entropy": 0.6475725024938583, + "epoch": 1.7208440262332478, + "grad_norm": 0.640625, + "learning_rate": 4.800402669063353e-06, + "loss": 0.0239, + "mean_token_accuracy": 0.9946991875767708, + "num_tokens": 90058707.0, + "step": 755 + }, + { + "entropy": 0.6459760963916779, + "epoch": 1.7231251782149986, + "grad_norm": 0.76171875, + "learning_rate": 4.799664570653473e-06, + "loss": 0.0267, + "mean_token_accuracy": 0.9923143461346626, + "num_tokens": 90178106.0, + "step": 756 + }, + { + "entropy": 0.6410199627280235, + "epoch": 1.7254063301967495, + "grad_norm": 0.5546875, + "learning_rate": 4.79892516699838e-06, + "loss": 0.0244, + "mean_token_accuracy": 0.9934609830379486, + "num_tokens": 90297495.0, + "step": 757 + }, + { + "entropy": 0.6406372636556625, + "epoch": 1.7276874821785002, + "grad_norm": 0.5625, + "learning_rate": 4.798184458517745e-06, + "loss": 0.0227, + "mean_token_accuracy": 0.9927756786346436, + "num_tokens": 90417538.0, + "step": 758 + }, + { + "entropy": 0.6397253796458244, + "epoch": 1.729968634160251, + "grad_norm": 0.498046875, + "learning_rate": 4.797442445631978e-06, + "loss": 0.024, + "mean_token_accuracy": 0.9907316192984581, + "num_tokens": 90536831.0, + "step": 759 + }, + { + "entropy": 0.642189547419548, + "epoch": 1.7322497861420016, + "grad_norm": 0.5390625, + "learning_rate": 4.7966991287622335e-06, + "loss": 0.0199, + "mean_token_accuracy": 0.9940938726067543, + "num_tokens": 90656134.0, + "step": 760 + }, + { + "entropy": 0.6436172500252724, + "epoch": 1.7345309381237524, + "grad_norm": 0.52734375, + "learning_rate": 4.795954508330403e-06, + "loss": 0.0217, + "mean_token_accuracy": 0.9926697164773941, + "num_tokens": 90774919.0, + "step": 761 + }, + { + "entropy": 0.6461043506860733, + "epoch": 1.7368120901055033, + "grad_norm": 0.53125, + "learning_rate": 4.795208584759119e-06, + "loss": 0.0159, + "mean_token_accuracy": 0.9951663315296173, + "num_tokens": 90894563.0, + "step": 762 + }, + { + "entropy": 0.6419747397303581, + "epoch": 1.739093242087254, + "grad_norm": 0.47265625, + "learning_rate": 4.794461358471753e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9945511594414711, + "num_tokens": 91014251.0, + "step": 763 + }, + { + "entropy": 0.6447116956114769, + "epoch": 1.741374394069005, + "grad_norm": 0.58984375, + "learning_rate": 4.7937128298924155e-06, + "loss": 0.0169, + "mean_token_accuracy": 0.9946996867656708, + "num_tokens": 91133891.0, + "step": 764 + }, + { + "entropy": 0.6402135342359543, + "epoch": 1.7436555460507557, + "grad_norm": 0.416015625, + "learning_rate": 4.7929629994459584e-06, + "loss": 0.0205, + "mean_token_accuracy": 0.9939092546701431, + "num_tokens": 91253766.0, + "step": 765 + }, + { + "entropy": 0.6470304876565933, + "epoch": 1.7459366980325064, + "grad_norm": 0.40625, + "learning_rate": 4.792211867557969e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9952184781432152, + "num_tokens": 91373430.0, + "step": 766 + }, + { + "entropy": 0.6442730501294136, + "epoch": 1.7482178500142571, + "grad_norm": 0.54296875, + "learning_rate": 4.7914594346547774e-06, + "loss": 0.0141, + "mean_token_accuracy": 0.9957994818687439, + "num_tokens": 91492710.0, + "step": 767 + }, + { + "entropy": 0.6423919945955276, + "epoch": 1.7504990019960078, + "grad_norm": 0.4609375, + "learning_rate": 4.790705701163449e-06, + "loss": 0.0143, + "mean_token_accuracy": 0.9950983375310898, + "num_tokens": 91611844.0, + "step": 768 + }, + { + "entropy": 0.641578771173954, + "epoch": 1.7527801539777588, + "grad_norm": 0.73046875, + "learning_rate": 4.789950667511789e-06, + "loss": 0.031, + "mean_token_accuracy": 0.9906304553151131, + "num_tokens": 91731264.0, + "step": 769 + }, + { + "entropy": 0.6476150676608086, + "epoch": 1.7550613059595097, + "grad_norm": 0.55859375, + "learning_rate": 4.789194334128338e-06, + "loss": 0.0206, + "mean_token_accuracy": 0.9941876083612442, + "num_tokens": 91850625.0, + "step": 770 + }, + { + "entropy": 0.6472403407096863, + "epoch": 1.7573424579412604, + "grad_norm": 0.55078125, + "learning_rate": 4.788436701442378e-06, + "loss": 0.0235, + "mean_token_accuracy": 0.9938217476010323, + "num_tokens": 91970558.0, + "step": 771 + }, + { + "entropy": 0.6410912722349167, + "epoch": 1.7596236099230111, + "grad_norm": 0.6328125, + "learning_rate": 4.787677769883926e-06, + "loss": 0.0298, + "mean_token_accuracy": 0.9912939816713333, + "num_tokens": 92089902.0, + "step": 772 + }, + { + "entropy": 0.6399624273180962, + "epoch": 1.7619047619047619, + "grad_norm": 0.427734375, + "learning_rate": 4.786917539883738e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.9939317777752876, + "num_tokens": 92209221.0, + "step": 773 + }, + { + "entropy": 0.644411139190197, + "epoch": 1.7641859138865126, + "grad_norm": 0.7109375, + "learning_rate": 4.786156011873304e-06, + "loss": 0.0274, + "mean_token_accuracy": 0.9922387152910233, + "num_tokens": 92328360.0, + "step": 774 + }, + { + "entropy": 0.6378368809819221, + "epoch": 1.7664670658682635, + "grad_norm": 0.51171875, + "learning_rate": 4.785393186284854e-06, + "loss": 0.0157, + "mean_token_accuracy": 0.9956673979759216, + "num_tokens": 92447145.0, + "step": 775 + }, + { + "entropy": 0.6459790617227554, + "epoch": 1.7687482178500142, + "grad_norm": 0.453125, + "learning_rate": 4.784629063551354e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.9945438653230667, + "num_tokens": 92566614.0, + "step": 776 + }, + { + "entropy": 0.6455347687005997, + "epoch": 1.7710293698317652, + "grad_norm": 0.74609375, + "learning_rate": 4.783863644106502e-06, + "loss": 0.0267, + "mean_token_accuracy": 0.990486703813076, + "num_tokens": 92686247.0, + "step": 777 + }, + { + "entropy": 0.6386021599173546, + "epoch": 1.773310521813516, + "grad_norm": 0.48828125, + "learning_rate": 4.783096928384739e-06, + "loss": 0.0209, + "mean_token_accuracy": 0.9936746880412102, + "num_tokens": 92806007.0, + "step": 778 + }, + { + "entropy": 0.6443333253264427, + "epoch": 1.7755916737952666, + "grad_norm": 0.5625, + "learning_rate": 4.782328916821235e-06, + "loss": 0.0244, + "mean_token_accuracy": 0.991473525762558, + "num_tokens": 92924911.0, + "step": 779 + }, + { + "entropy": 0.6462390795350075, + "epoch": 1.7778728257770173, + "grad_norm": 0.50390625, + "learning_rate": 4.7815596098519004e-06, + "loss": 0.0224, + "mean_token_accuracy": 0.9928196892142296, + "num_tokens": 93044231.0, + "step": 780 + }, + { + "entropy": 0.6348602473735809, + "epoch": 1.780153977758768, + "grad_norm": 0.6796875, + "learning_rate": 4.780789007913379e-06, + "loss": 0.0312, + "mean_token_accuracy": 0.9920860454440117, + "num_tokens": 93163925.0, + "step": 781 + }, + { + "entropy": 0.6416789814829826, + "epoch": 1.782435129740519, + "grad_norm": 0.5234375, + "learning_rate": 4.780017111443048e-06, + "loss": 0.0294, + "mean_token_accuracy": 0.9920956119894981, + "num_tokens": 93283101.0, + "step": 782 + }, + { + "entropy": 0.6394111216068268, + "epoch": 1.7847162817222697, + "grad_norm": 0.5703125, + "learning_rate": 4.779243920879023e-06, + "loss": 0.0237, + "mean_token_accuracy": 0.9937139302492142, + "num_tokens": 93401912.0, + "step": 783 + }, + { + "entropy": 0.6419713646173477, + "epoch": 1.7869974337040206, + "grad_norm": 0.486328125, + "learning_rate": 4.77846943666015e-06, + "loss": 0.0247, + "mean_token_accuracy": 0.9914099872112274, + "num_tokens": 93521742.0, + "step": 784 + }, + { + "entropy": 0.6428253501653671, + "epoch": 1.7892785856857714, + "grad_norm": 0.5078125, + "learning_rate": 4.777693659226013e-06, + "loss": 0.0258, + "mean_token_accuracy": 0.9932371824979782, + "num_tokens": 93641280.0, + "step": 785 + }, + { + "entropy": 0.6445315256714821, + "epoch": 1.791559737667522, + "grad_norm": 0.625, + "learning_rate": 4.776916589016928e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.9938997402787209, + "num_tokens": 93760448.0, + "step": 786 + }, + { + "entropy": 0.6364223510026932, + "epoch": 1.7938408896492728, + "grad_norm": 0.65625, + "learning_rate": 4.776138226473944e-06, + "loss": 0.0267, + "mean_token_accuracy": 0.9916330128908157, + "num_tokens": 93879209.0, + "step": 787 + }, + { + "entropy": 0.6394477114081383, + "epoch": 1.7961220416310235, + "grad_norm": 0.58984375, + "learning_rate": 4.775358572038845e-06, + "loss": 0.0287, + "mean_token_accuracy": 0.9902980551123619, + "num_tokens": 93998889.0, + "step": 788 + }, + { + "entropy": 0.6437738537788391, + "epoch": 1.7984031936127745, + "grad_norm": 0.640625, + "learning_rate": 4.774577626154148e-06, + "loss": 0.029, + "mean_token_accuracy": 0.9909818917512894, + "num_tokens": 94118345.0, + "step": 789 + }, + { + "entropy": 0.6425467059016228, + "epoch": 1.8006843455945254, + "grad_norm": 0.65625, + "learning_rate": 4.773795389263104e-06, + "loss": 0.0327, + "mean_token_accuracy": 0.9902200251817703, + "num_tokens": 94237821.0, + "step": 790 + }, + { + "entropy": 0.6421486660838127, + "epoch": 1.802965497576276, + "grad_norm": 0.5625, + "learning_rate": 4.773011861809694e-06, + "loss": 0.0274, + "mean_token_accuracy": 0.9930723309516907, + "num_tokens": 94357042.0, + "step": 791 + }, + { + "entropy": 0.6406670808792114, + "epoch": 1.8052466495580268, + "grad_norm": 0.486328125, + "learning_rate": 4.772227044238632e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9940053522586823, + "num_tokens": 94476681.0, + "step": 792 + }, + { + "entropy": 0.6385804414749146, + "epoch": 1.8075278015397775, + "grad_norm": 0.57421875, + "learning_rate": 4.771440936995367e-06, + "loss": 0.027, + "mean_token_accuracy": 0.9900851920247078, + "num_tokens": 94595803.0, + "step": 793 + }, + { + "entropy": 0.6465981230139732, + "epoch": 1.8098089535215283, + "grad_norm": 0.515625, + "learning_rate": 4.770653540526079e-06, + "loss": 0.0228, + "mean_token_accuracy": 0.9938011020421982, + "num_tokens": 94715192.0, + "step": 794 + }, + { + "entropy": 0.6406853869557381, + "epoch": 1.8120901055032792, + "grad_norm": 0.51953125, + "learning_rate": 4.7698648552776785e-06, + "loss": 0.022, + "mean_token_accuracy": 0.9935202449560165, + "num_tokens": 94834134.0, + "step": 795 + }, + { + "entropy": 0.6431820392608643, + "epoch": 1.81437125748503, + "grad_norm": 0.51953125, + "learning_rate": 4.769074881697806e-06, + "loss": 0.0184, + "mean_token_accuracy": 0.9935549795627594, + "num_tokens": 94953633.0, + "step": 796 + }, + { + "entropy": 0.6455485746264458, + "epoch": 1.8166524094667809, + "grad_norm": 0.55078125, + "learning_rate": 4.768283620234838e-06, + "loss": 0.0221, + "mean_token_accuracy": 0.9932443499565125, + "num_tokens": 95072972.0, + "step": 797 + }, + { + "entropy": 0.64368986338377, + "epoch": 1.8189335614485316, + "grad_norm": 0.61328125, + "learning_rate": 4.767491071337877e-06, + "loss": 0.0192, + "mean_token_accuracy": 0.9942041859030724, + "num_tokens": 95192568.0, + "step": 798 + }, + { + "entropy": 0.6459890231490135, + "epoch": 1.8212147134302823, + "grad_norm": 0.53515625, + "learning_rate": 4.766697235456761e-06, + "loss": 0.0268, + "mean_token_accuracy": 0.9925054162740707, + "num_tokens": 95313029.0, + "step": 799 + }, + { + "entropy": 0.6480426266789436, + "epoch": 1.823495865412033, + "grad_norm": 0.5546875, + "learning_rate": 4.765902113042053e-06, + "loss": 0.0277, + "mean_token_accuracy": 0.9920576885342598, + "num_tokens": 95431615.0, + "step": 800 + }, + { + "entropy": 0.645766444504261, + "epoch": 1.8257770173937837, + "grad_norm": 0.578125, + "learning_rate": 4.765105704545052e-06, + "loss": 0.0251, + "mean_token_accuracy": 0.9921344146132469, + "num_tokens": 95550996.0, + "step": 801 + }, + { + "entropy": 0.6435322239995003, + "epoch": 1.8280581693755347, + "grad_norm": 0.470703125, + "learning_rate": 4.7643080104177815e-06, + "loss": 0.0238, + "mean_token_accuracy": 0.9941036626696587, + "num_tokens": 95670230.0, + "step": 802 + }, + { + "entropy": 0.6485661417245865, + "epoch": 1.8303393213572854, + "grad_norm": 0.671875, + "learning_rate": 4.763509031113e-06, + "loss": 0.0319, + "mean_token_accuracy": 0.9913212060928345, + "num_tokens": 95789931.0, + "step": 803 + }, + { + "entropy": 0.6465274691581726, + "epoch": 1.8326204733390363, + "grad_norm": 0.56640625, + "learning_rate": 4.7627087670841894e-06, + "loss": 0.0159, + "mean_token_accuracy": 0.9945723861455917, + "num_tokens": 95909078.0, + "step": 804 + }, + { + "entropy": 0.6458447054028511, + "epoch": 1.834901625320787, + "grad_norm": 0.490234375, + "learning_rate": 4.761907218785566e-06, + "loss": 0.0165, + "mean_token_accuracy": 0.9941222965717316, + "num_tokens": 96028017.0, + "step": 805 + }, + { + "entropy": 0.6486319601535797, + "epoch": 1.8371827773025378, + "grad_norm": 0.490234375, + "learning_rate": 4.761104386672074e-06, + "loss": 0.019, + "mean_token_accuracy": 0.9932251274585724, + "num_tokens": 96147910.0, + "step": 806 + }, + { + "entropy": 0.6437408924102783, + "epoch": 1.8394639292842885, + "grad_norm": 0.62890625, + "learning_rate": 4.760300271199384e-06, + "loss": 0.0263, + "mean_token_accuracy": 0.9925180226564407, + "num_tokens": 96267539.0, + "step": 807 + }, + { + "entropy": 0.6437537223100662, + "epoch": 1.8417450812660392, + "grad_norm": 0.6640625, + "learning_rate": 4.759494872823896e-06, + "loss": 0.0298, + "mean_token_accuracy": 0.9930199906229973, + "num_tokens": 96386615.0, + "step": 808 + }, + { + "entropy": 0.6514753326773643, + "epoch": 1.8440262332477901, + "grad_norm": 0.4765625, + "learning_rate": 4.758688192002741e-06, + "loss": 0.0137, + "mean_token_accuracy": 0.9950203001499176, + "num_tokens": 96506243.0, + "step": 809 + }, + { + "entropy": 0.6473363041877747, + "epoch": 1.846307385229541, + "grad_norm": 0.6015625, + "learning_rate": 4.757880229193773e-06, + "loss": 0.024, + "mean_token_accuracy": 0.9927156046032906, + "num_tokens": 96625280.0, + "step": 810 + }, + { + "entropy": 0.6453452110290527, + "epoch": 1.8485885372112918, + "grad_norm": 0.48046875, + "learning_rate": 4.757070984855577e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.9941175803542137, + "num_tokens": 96744523.0, + "step": 811 + }, + { + "entropy": 0.6455847397446632, + "epoch": 1.8508696891930425, + "grad_norm": 0.58984375, + "learning_rate": 4.756260459447465e-06, + "loss": 0.0248, + "mean_token_accuracy": 0.9921280443668365, + "num_tokens": 96863514.0, + "step": 812 + }, + { + "entropy": 0.6499064117670059, + "epoch": 1.8531508411747932, + "grad_norm": 0.578125, + "learning_rate": 4.755448653429475e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9947948977351189, + "num_tokens": 96983431.0, + "step": 813 + }, + { + "entropy": 0.6463795080780983, + "epoch": 1.855431993156544, + "grad_norm": 0.46875, + "learning_rate": 4.754635567262372e-06, + "loss": 0.0199, + "mean_token_accuracy": 0.9926830604672432, + "num_tokens": 97102825.0, + "step": 814 + }, + { + "entropy": 0.6454407125711441, + "epoch": 1.8577131451382949, + "grad_norm": 0.5234375, + "learning_rate": 4.753821201407648e-06, + "loss": 0.0246, + "mean_token_accuracy": 0.9924419522285461, + "num_tokens": 97222151.0, + "step": 815 + }, + { + "entropy": 0.6511183977127075, + "epoch": 1.8599942971200456, + "grad_norm": 0.5625, + "learning_rate": 4.7530055563275225e-06, + "loss": 0.0212, + "mean_token_accuracy": 0.9937237948179245, + "num_tokens": 97342006.0, + "step": 816 + }, + { + "entropy": 0.6488251462578773, + "epoch": 1.8622754491017965, + "grad_norm": 0.458984375, + "learning_rate": 4.7521886324849385e-06, + "loss": 0.0246, + "mean_token_accuracy": 0.9938654005527496, + "num_tokens": 97461062.0, + "step": 817 + }, + { + "entropy": 0.6496801525354385, + "epoch": 1.8645566010835473, + "grad_norm": 0.486328125, + "learning_rate": 4.751370430343568e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.9921344369649887, + "num_tokens": 97580469.0, + "step": 818 + }, + { + "entropy": 0.6465569362044334, + "epoch": 1.866837753065298, + "grad_norm": 0.515625, + "learning_rate": 4.750550950367805e-06, + "loss": 0.023, + "mean_token_accuracy": 0.992755688726902, + "num_tokens": 97699733.0, + "step": 819 + }, + { + "entropy": 0.645734615623951, + "epoch": 1.8691189050470487, + "grad_norm": 0.5859375, + "learning_rate": 4.749730193022771e-06, + "loss": 0.0267, + "mean_token_accuracy": 0.992108017206192, + "num_tokens": 97818680.0, + "step": 820 + }, + { + "entropy": 0.6470183655619621, + "epoch": 1.8714000570287994, + "grad_norm": 0.5234375, + "learning_rate": 4.748908158774312e-06, + "loss": 0.0243, + "mean_token_accuracy": 0.9943297728896141, + "num_tokens": 97938367.0, + "step": 821 + }, + { + "entropy": 0.6481959521770477, + "epoch": 1.8736812090105504, + "grad_norm": 0.4609375, + "learning_rate": 4.748084848089e-06, + "loss": 0.0216, + "mean_token_accuracy": 0.9936198145151138, + "num_tokens": 98057507.0, + "step": 822 + }, + { + "entropy": 0.6462680324912071, + "epoch": 1.875962360992301, + "grad_norm": 0.5625, + "learning_rate": 4.747260261434128e-06, + "loss": 0.0178, + "mean_token_accuracy": 0.9932012036442757, + "num_tokens": 98177247.0, + "step": 823 + }, + { + "entropy": 0.6449310258030891, + "epoch": 1.878243512974052, + "grad_norm": 0.45703125, + "learning_rate": 4.7464343992777175e-06, + "loss": 0.0196, + "mean_token_accuracy": 0.9932101219892502, + "num_tokens": 98296413.0, + "step": 824 + }, + { + "entropy": 0.647075243294239, + "epoch": 1.8805246649558027, + "grad_norm": 0.5, + "learning_rate": 4.74560726208851e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9931443184614182, + "num_tokens": 98415548.0, + "step": 825 + }, + { + "entropy": 0.6472158432006836, + "epoch": 1.8828058169375534, + "grad_norm": 0.4921875, + "learning_rate": 4.744778850335974e-06, + "loss": 0.0202, + "mean_token_accuracy": 0.9931735768914223, + "num_tokens": 98534853.0, + "step": 826 + }, + { + "entropy": 0.6420326679944992, + "epoch": 1.8850869689193042, + "grad_norm": 0.51953125, + "learning_rate": 4.7439491644903e-06, + "loss": 0.0251, + "mean_token_accuracy": 0.9914205819368362, + "num_tokens": 98654280.0, + "step": 827 + }, + { + "entropy": 0.6473598927259445, + "epoch": 1.8873681209010549, + "grad_norm": 0.6015625, + "learning_rate": 4.743118205022402e-06, + "loss": 0.023, + "mean_token_accuracy": 0.9928292632102966, + "num_tokens": 98773686.0, + "step": 828 + }, + { + "entropy": 0.651142030954361, + "epoch": 1.8896492728828058, + "grad_norm": 0.66796875, + "learning_rate": 4.742285972403915e-06, + "loss": 0.0236, + "mean_token_accuracy": 0.9926173761487007, + "num_tokens": 98892991.0, + "step": 829 + }, + { + "entropy": 0.6447849199175835, + "epoch": 1.8919304248645568, + "grad_norm": 0.51171875, + "learning_rate": 4.7414524671071995e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9930430054664612, + "num_tokens": 99012680.0, + "step": 830 + }, + { + "entropy": 0.6449818909168243, + "epoch": 1.8942115768463075, + "grad_norm": 0.50390625, + "learning_rate": 4.7406176896053356e-06, + "loss": 0.0142, + "mean_token_accuracy": 0.9966456145048141, + "num_tokens": 99131903.0, + "step": 831 + }, + { + "entropy": 0.6415169015526772, + "epoch": 1.8964927288280582, + "grad_norm": 0.458984375, + "learning_rate": 4.739781640372129e-06, + "loss": 0.0227, + "mean_token_accuracy": 0.9925014302134514, + "num_tokens": 99251685.0, + "step": 832 + }, + { + "entropy": 0.6500499993562698, + "epoch": 1.898773880809809, + "grad_norm": 0.498046875, + "learning_rate": 4.7389443198821035e-06, + "loss": 0.0222, + "mean_token_accuracy": 0.995226688683033, + "num_tokens": 99371119.0, + "step": 833 + }, + { + "entropy": 0.6449698656797409, + "epoch": 1.9010550327915596, + "grad_norm": 0.59765625, + "learning_rate": 4.738105728610507e-06, + "loss": 0.0273, + "mean_token_accuracy": 0.991396114230156, + "num_tokens": 99490163.0, + "step": 834 + }, + { + "entropy": 0.6477948948740959, + "epoch": 1.9033361847733106, + "grad_norm": 0.5234375, + "learning_rate": 4.737265867033307e-06, + "loss": 0.0247, + "mean_token_accuracy": 0.9942711815237999, + "num_tokens": 99609184.0, + "step": 835 + }, + { + "entropy": 0.6478389501571655, + "epoch": 1.9056173367550613, + "grad_norm": 0.46875, + "learning_rate": 4.736424735627193e-06, + "loss": 0.0221, + "mean_token_accuracy": 0.9919007867574692, + "num_tokens": 99727760.0, + "step": 836 + }, + { + "entropy": 0.6491989493370056, + "epoch": 1.9078984887368122, + "grad_norm": 0.70703125, + "learning_rate": 4.735582334869575e-06, + "loss": 0.0264, + "mean_token_accuracy": 0.9929561242461205, + "num_tokens": 99847190.0, + "step": 837 + }, + { + "entropy": 0.6482290402054787, + "epoch": 1.910179640718563, + "grad_norm": 0.546875, + "learning_rate": 4.734738665238583e-06, + "loss": 0.0193, + "mean_token_accuracy": 0.9928398430347443, + "num_tokens": 99966965.0, + "step": 838 + }, + { + "entropy": 0.6453288793563843, + "epoch": 1.9124607927003137, + "grad_norm": 0.73046875, + "learning_rate": 4.733893727213068e-06, + "loss": 0.0265, + "mean_token_accuracy": 0.9917667359113693, + "num_tokens": 100086436.0, + "step": 839 + }, + { + "entropy": 0.6439173147082329, + "epoch": 1.9147419446820644, + "grad_norm": 0.412109375, + "learning_rate": 4.7330475212726e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9930537641048431, + "num_tokens": 100206661.0, + "step": 840 + }, + { + "entropy": 0.644884780049324, + "epoch": 1.917023096663815, + "grad_norm": 0.4921875, + "learning_rate": 4.73220004789747e-06, + "loss": 0.0213, + "mean_token_accuracy": 0.9926646873354912, + "num_tokens": 100325497.0, + "step": 841 + }, + { + "entropy": 0.6490446180105209, + "epoch": 1.919304248645566, + "grad_norm": 0.625, + "learning_rate": 4.7313513075686875e-06, + "loss": 0.0299, + "mean_token_accuracy": 0.9914592728018761, + "num_tokens": 100444927.0, + "step": 842 + }, + { + "entropy": 0.6507568657398224, + "epoch": 1.9215854006273168, + "grad_norm": 0.578125, + "learning_rate": 4.73050130076798e-06, + "loss": 0.0182, + "mean_token_accuracy": 0.9938268288969994, + "num_tokens": 100564592.0, + "step": 843 + }, + { + "entropy": 0.6458798348903656, + "epoch": 1.9238665526090677, + "grad_norm": 0.498046875, + "learning_rate": 4.729650027977797e-06, + "loss": 0.0178, + "mean_token_accuracy": 0.994054526090622, + "num_tokens": 100684331.0, + "step": 844 + }, + { + "entropy": 0.6414771378040314, + "epoch": 1.9261477045908184, + "grad_norm": 0.58203125, + "learning_rate": 4.728797489681302e-06, + "loss": 0.0277, + "mean_token_accuracy": 0.9916529059410095, + "num_tokens": 100803311.0, + "step": 845 + }, + { + "entropy": 0.652600072324276, + "epoch": 1.9284288565725691, + "grad_norm": 0.490234375, + "learning_rate": 4.7279436863623805e-06, + "loss": 0.0195, + "mean_token_accuracy": 0.9921798408031464, + "num_tokens": 100923431.0, + "step": 846 + }, + { + "entropy": 0.6486269980669022, + "epoch": 1.9307100085543198, + "grad_norm": 0.44921875, + "learning_rate": 4.7270886185056355e-06, + "loss": 0.0139, + "mean_token_accuracy": 0.9961706921458244, + "num_tokens": 101043149.0, + "step": 847 + }, + { + "entropy": 0.6528254449367523, + "epoch": 1.9329911605360706, + "grad_norm": 0.546875, + "learning_rate": 4.726232286596385e-06, + "loss": 0.0193, + "mean_token_accuracy": 0.9930520802736282, + "num_tokens": 101162709.0, + "step": 848 + }, + { + "entropy": 0.6509424895048141, + "epoch": 1.9352723125178215, + "grad_norm": 0.6328125, + "learning_rate": 4.725374691120669e-06, + "loss": 0.036, + "mean_token_accuracy": 0.9887207448482513, + "num_tokens": 101281837.0, + "step": 849 + }, + { + "entropy": 0.6540134474635124, + "epoch": 1.9375534644995724, + "grad_norm": 0.55859375, + "learning_rate": 4.7245158325652396e-06, + "loss": 0.0269, + "mean_token_accuracy": 0.9897925108671188, + "num_tokens": 101401240.0, + "step": 850 + }, + { + "entropy": 0.645803265273571, + "epoch": 1.9398346164813232, + "grad_norm": 0.67578125, + "learning_rate": 4.7236557114175705e-06, + "loss": 0.0307, + "mean_token_accuracy": 0.9897523894906044, + "num_tokens": 101520522.0, + "step": 851 + }, + { + "entropy": 0.6508874669671059, + "epoch": 1.9421157684630739, + "grad_norm": 0.53515625, + "learning_rate": 4.722794328165849e-06, + "loss": 0.0212, + "mean_token_accuracy": 0.9944771155714989, + "num_tokens": 101639999.0, + "step": 852 + }, + { + "entropy": 0.6456273198127747, + "epoch": 1.9443969204448246, + "grad_norm": 0.482421875, + "learning_rate": 4.721931683298979e-06, + "loss": 0.0144, + "mean_token_accuracy": 0.994625024497509, + "num_tokens": 101759034.0, + "step": 853 + }, + { + "entropy": 0.6472189649939537, + "epoch": 1.9466780724265753, + "grad_norm": 0.55078125, + "learning_rate": 4.721067777306582e-06, + "loss": 0.0192, + "mean_token_accuracy": 0.9929979145526886, + "num_tokens": 101878043.0, + "step": 854 + }, + { + "entropy": 0.6490361243486404, + "epoch": 1.9489592244083263, + "grad_norm": 0.6015625, + "learning_rate": 4.7202026106789935e-06, + "loss": 0.0235, + "mean_token_accuracy": 0.9918460920453072, + "num_tokens": 101997159.0, + "step": 855 + }, + { + "entropy": 0.6446860209107399, + "epoch": 1.951240376390077, + "grad_norm": 0.60546875, + "learning_rate": 4.719336183907266e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.9934374913573265, + "num_tokens": 102116634.0, + "step": 856 + }, + { + "entropy": 0.6481933146715164, + "epoch": 1.953521528371828, + "grad_norm": 0.50390625, + "learning_rate": 4.718468497483166e-06, + "loss": 0.0252, + "mean_token_accuracy": 0.9923881366848946, + "num_tokens": 102236053.0, + "step": 857 + }, + { + "entropy": 0.6508172526955605, + "epoch": 1.9558026803535786, + "grad_norm": 0.796875, + "learning_rate": 4.717599551899177e-06, + "loss": 0.0307, + "mean_token_accuracy": 0.9911124482750893, + "num_tokens": 102355353.0, + "step": 858 + }, + { + "entropy": 0.6490844413638115, + "epoch": 1.9580838323353293, + "grad_norm": 0.54296875, + "learning_rate": 4.716729347648494e-06, + "loss": 0.0238, + "mean_token_accuracy": 0.9940649941563606, + "num_tokens": 102474819.0, + "step": 859 + }, + { + "entropy": 0.6474167257547379, + "epoch": 1.96036498431708, + "grad_norm": 0.4375, + "learning_rate": 4.71585788522503e-06, + "loss": 0.0197, + "mean_token_accuracy": 0.9935140982270241, + "num_tokens": 102594557.0, + "step": 860 + }, + { + "entropy": 0.6482702493667603, + "epoch": 1.9626461362988308, + "grad_norm": 0.515625, + "learning_rate": 4.7149851651234085e-06, + "loss": 0.0165, + "mean_token_accuracy": 0.9935211166739464, + "num_tokens": 102713959.0, + "step": 861 + }, + { + "entropy": 0.6439765021204948, + "epoch": 1.9649272882805817, + "grad_norm": 0.53515625, + "learning_rate": 4.714111187838969e-06, + "loss": 0.0262, + "mean_token_accuracy": 0.992180660367012, + "num_tokens": 102834437.0, + "step": 862 + }, + { + "entropy": 0.6448732241988182, + "epoch": 1.9672084402623324, + "grad_norm": 0.5390625, + "learning_rate": 4.713235953867764e-06, + "loss": 0.0219, + "mean_token_accuracy": 0.9929385632276535, + "num_tokens": 102952700.0, + "step": 863 + }, + { + "entropy": 0.6465591117739677, + "epoch": 1.9694895922440834, + "grad_norm": 0.53515625, + "learning_rate": 4.712359463706561e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.992632269859314, + "num_tokens": 103071836.0, + "step": 864 + }, + { + "entropy": 0.6469573378562927, + "epoch": 1.971770744225834, + "grad_norm": 0.69140625, + "learning_rate": 4.711481717852837e-06, + "loss": 0.0278, + "mean_token_accuracy": 0.9926692023873329, + "num_tokens": 103191553.0, + "step": 865 + }, + { + "entropy": 0.6502205356955528, + "epoch": 1.9740518962075848, + "grad_norm": 0.640625, + "learning_rate": 4.710602716804784e-06, + "loss": 0.0256, + "mean_token_accuracy": 0.9913962483406067, + "num_tokens": 103310866.0, + "step": 866 + }, + { + "entropy": 0.6472933739423752, + "epoch": 1.9763330481893355, + "grad_norm": 0.65625, + "learning_rate": 4.709722461061307e-06, + "loss": 0.0239, + "mean_token_accuracy": 0.9935525730252266, + "num_tokens": 103430005.0, + "step": 867 + }, + { + "entropy": 0.6507910043001175, + "epoch": 1.9786142001710862, + "grad_norm": 0.54296875, + "learning_rate": 4.70884095112202e-06, + "loss": 0.0263, + "mean_token_accuracy": 0.9920111671090126, + "num_tokens": 103549481.0, + "step": 868 + }, + { + "entropy": 0.6505469605326653, + "epoch": 1.9808953521528372, + "grad_norm": 0.60546875, + "learning_rate": 4.707958187487254e-06, + "loss": 0.0211, + "mean_token_accuracy": 0.993021622300148, + "num_tokens": 103669173.0, + "step": 869 + }, + { + "entropy": 0.6497827470302582, + "epoch": 1.9831765041345881, + "grad_norm": 0.69140625, + "learning_rate": 4.707074170658046e-06, + "loss": 0.0259, + "mean_token_accuracy": 0.9922433644533157, + "num_tokens": 103787546.0, + "step": 870 + }, + { + "entropy": 0.6540321186184883, + "epoch": 1.9854576561163388, + "grad_norm": 0.431640625, + "learning_rate": 4.706188901136148e-06, + "loss": 0.0149, + "mean_token_accuracy": 0.9939952939748764, + "num_tokens": 103907284.0, + "step": 871 + }, + { + "entropy": 0.6436341404914856, + "epoch": 1.9877388080980896, + "grad_norm": 0.57421875, + "learning_rate": 4.705302379424023e-06, + "loss": 0.0275, + "mean_token_accuracy": 0.9909894242882729, + "num_tokens": 104026914.0, + "step": 872 + }, + { + "entropy": 0.6429296284914017, + "epoch": 1.9900199600798403, + "grad_norm": 0.60546875, + "learning_rate": 4.704414606024842e-06, + "loss": 0.0255, + "mean_token_accuracy": 0.9916453287005424, + "num_tokens": 104145727.0, + "step": 873 + }, + { + "entropy": 0.6463484317064285, + "epoch": 1.992301112061591, + "grad_norm": 0.474609375, + "learning_rate": 4.703525581442488e-06, + "loss": 0.0181, + "mean_token_accuracy": 0.9949651807546616, + "num_tokens": 104264940.0, + "step": 874 + }, + { + "entropy": 0.6462028920650482, + "epoch": 1.994582264043342, + "grad_norm": 0.5234375, + "learning_rate": 4.702635306181554e-06, + "loss": 0.023, + "mean_token_accuracy": 0.9926431402564049, + "num_tokens": 104384132.0, + "step": 875 + }, + { + "entropy": 0.6507808119058609, + "epoch": 1.9968634160250927, + "grad_norm": 0.609375, + "learning_rate": 4.701743780747345e-06, + "loss": 0.0277, + "mean_token_accuracy": 0.9900699034333229, + "num_tokens": 104504048.0, + "step": 876 + }, + { + "entropy": 0.6507648527622223, + "epoch": 1.9991445680068436, + "grad_norm": 0.58984375, + "learning_rate": 4.700851005645872e-06, + "loss": 0.0284, + "mean_token_accuracy": 0.9923951774835587, + "num_tokens": 104623367.0, + "step": 877 + }, + { + "entropy": 0.6425047914187113, + "epoch": 2.0, + "grad_norm": 0.7578125, + "learning_rate": 4.699956981383857e-06, + "loss": 0.014, + "mean_token_accuracy": 0.9948599338531494, + "num_tokens": 104666956.0, + "step": 878 + }, + { + "entropy": 0.6480984389781952, + "epoch": 2.0022811519817507, + "grad_norm": 0.44140625, + "learning_rate": 4.699061708468732e-06, + "loss": 0.021, + "mean_token_accuracy": 0.9948044419288635, + "num_tokens": 104785944.0, + "step": 879 + }, + { + "entropy": 0.6471492201089859, + "epoch": 2.0045623039635014, + "grad_norm": 0.50390625, + "learning_rate": 4.698165187408635e-06, + "loss": 0.0234, + "mean_token_accuracy": 0.9948283657431602, + "num_tokens": 104906071.0, + "step": 880 + }, + { + "epoch": 2.0045623039635014, + "eval_entropy": 0.6471576593221368, + "eval_loss": 0.023211119696497917, + "eval_mean_token_accuracy": 0.9928208351588521, + "eval_num_tokens": 104906071.0, + "eval_runtime": 177.4825, + "eval_samples_per_second": 47.244, + "eval_steps_per_second": 1.482, + "step": 880 + }, + { + "entropy": 0.6432726085186005, + "epoch": 2.006843455945252, + "grad_norm": 0.51171875, + "learning_rate": 4.697267418712415e-06, + "loss": 0.0201, + "mean_token_accuracy": 0.9959918409585953, + "num_tokens": 105025428.0, + "step": 881 + }, + { + "entropy": 0.6465716883540154, + "epoch": 2.0091246079270033, + "grad_norm": 0.52734375, + "learning_rate": 4.6963684028896285e-06, + "loss": 0.0213, + "mean_token_accuracy": 0.9946444183588028, + "num_tokens": 105144957.0, + "step": 882 + }, + { + "entropy": 0.6429931372404099, + "epoch": 2.011405759908754, + "grad_norm": 0.53125, + "learning_rate": 4.695468140450539e-06, + "loss": 0.0226, + "mean_token_accuracy": 0.9938395991921425, + "num_tokens": 105264471.0, + "step": 883 + }, + { + "entropy": 0.6493684574961662, + "epoch": 2.0136869118905047, + "grad_norm": 0.48046875, + "learning_rate": 4.6945666319061166e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.9929615259170532, + "num_tokens": 105383968.0, + "step": 884 + }, + { + "entropy": 0.6434665843844414, + "epoch": 2.0159680638722555, + "grad_norm": 0.59765625, + "learning_rate": 4.6936638777680435e-06, + "loss": 0.0268, + "mean_token_accuracy": 0.9910896494984627, + "num_tokens": 105503153.0, + "step": 885 + }, + { + "entropy": 0.642222136259079, + "epoch": 2.018249215854006, + "grad_norm": 0.46484375, + "learning_rate": 4.6927598785487026e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.9944727867841721, + "num_tokens": 105621984.0, + "step": 886 + }, + { + "entropy": 0.6534808203577995, + "epoch": 2.020530367835757, + "grad_norm": 0.490234375, + "learning_rate": 4.691854634761188e-06, + "loss": 0.0252, + "mean_token_accuracy": 0.9926039427518845, + "num_tokens": 105742072.0, + "step": 887 + }, + { + "entropy": 0.6416345685720444, + "epoch": 2.022811519817508, + "grad_norm": 0.61328125, + "learning_rate": 4.690948146919299e-06, + "loss": 0.0194, + "mean_token_accuracy": 0.9937000274658203, + "num_tokens": 105861553.0, + "step": 888 + }, + { + "entropy": 0.6439993605017662, + "epoch": 2.025092671799259, + "grad_norm": 0.4296875, + "learning_rate": 4.690040415537538e-06, + "loss": 0.0158, + "mean_token_accuracy": 0.9935351312160492, + "num_tokens": 105981041.0, + "step": 889 + }, + { + "entropy": 0.6440286561846733, + "epoch": 2.0273738237810095, + "grad_norm": 0.55859375, + "learning_rate": 4.689131441131119e-06, + "loss": 0.027, + "mean_token_accuracy": 0.9925558716058731, + "num_tokens": 106101086.0, + "step": 890 + }, + { + "entropy": 0.6408254206180573, + "epoch": 2.02965497576276, + "grad_norm": 0.494140625, + "learning_rate": 4.6882212242159555e-06, + "loss": 0.0182, + "mean_token_accuracy": 0.9949453473091125, + "num_tokens": 106220491.0, + "step": 891 + }, + { + "entropy": 0.648041382431984, + "epoch": 2.031936127744511, + "grad_norm": 0.4296875, + "learning_rate": 4.687309765308671e-06, + "loss": 0.0171, + "mean_token_accuracy": 0.9956628829240799, + "num_tokens": 106340406.0, + "step": 892 + }, + { + "entropy": 0.6462336108088493, + "epoch": 2.0342172797262617, + "grad_norm": 0.6171875, + "learning_rate": 4.6863970649265914e-06, + "loss": 0.0244, + "mean_token_accuracy": 0.9906064867973328, + "num_tokens": 106460313.0, + "step": 893 + }, + { + "entropy": 0.641668863594532, + "epoch": 2.0364984317080124, + "grad_norm": 0.427734375, + "learning_rate": 4.685483123587748e-06, + "loss": 0.0155, + "mean_token_accuracy": 0.995160348713398, + "num_tokens": 106579664.0, + "step": 894 + }, + { + "entropy": 0.6438464671373367, + "epoch": 2.0387795836897635, + "grad_norm": 0.421875, + "learning_rate": 4.684567941810876e-06, + "loss": 0.022, + "mean_token_accuracy": 0.9946065247058868, + "num_tokens": 106699383.0, + "step": 895 + }, + { + "entropy": 0.6466414108872414, + "epoch": 2.0410607356715142, + "grad_norm": 0.5859375, + "learning_rate": 4.683651520115414e-06, + "loss": 0.0205, + "mean_token_accuracy": 0.9930070415139198, + "num_tokens": 106819364.0, + "step": 896 + }, + { + "entropy": 0.6451754197478294, + "epoch": 2.043341887653265, + "grad_norm": 0.462890625, + "learning_rate": 4.682733859021508e-06, + "loss": 0.0175, + "mean_token_accuracy": 0.9947843924164772, + "num_tokens": 106938841.0, + "step": 897 + }, + { + "entropy": 0.6451068595051765, + "epoch": 2.0456230396350157, + "grad_norm": 0.41015625, + "learning_rate": 4.681814959050002e-06, + "loss": 0.0142, + "mean_token_accuracy": 0.9948955625295639, + "num_tokens": 107058901.0, + "step": 898 + }, + { + "entropy": 0.6457319334149361, + "epoch": 2.0479041916167664, + "grad_norm": 0.625, + "learning_rate": 4.680894820722446e-06, + "loss": 0.0297, + "mean_token_accuracy": 0.9901825115084648, + "num_tokens": 107178666.0, + "step": 899 + }, + { + "entropy": 0.646767258644104, + "epoch": 2.050185343598517, + "grad_norm": 0.494140625, + "learning_rate": 4.679973444561095e-06, + "loss": 0.0239, + "mean_token_accuracy": 0.9907189160585403, + "num_tokens": 107297832.0, + "step": 900 + }, + { + "entropy": 0.649049885571003, + "epoch": 2.052466495580268, + "grad_norm": 0.69921875, + "learning_rate": 4.679050831088902e-06, + "loss": 0.0275, + "mean_token_accuracy": 0.9922243729233742, + "num_tokens": 107416791.0, + "step": 901 + }, + { + "entropy": 0.6446406692266464, + "epoch": 2.054747647562019, + "grad_norm": 0.51171875, + "learning_rate": 4.678126980829525e-06, + "loss": 0.0188, + "mean_token_accuracy": 0.9951703920960426, + "num_tokens": 107536738.0, + "step": 902 + }, + { + "entropy": 0.6480982527136803, + "epoch": 2.0570287995437697, + "grad_norm": 0.671875, + "learning_rate": 4.677201894307325e-06, + "loss": 0.0215, + "mean_token_accuracy": 0.9922808334231377, + "num_tokens": 107655947.0, + "step": 903 + }, + { + "entropy": 0.6428212672472, + "epoch": 2.0593099515255204, + "grad_norm": 0.59375, + "learning_rate": 4.676275572047362e-06, + "loss": 0.0294, + "mean_token_accuracy": 0.9906750172376633, + "num_tokens": 107775274.0, + "step": 904 + }, + { + "entropy": 0.6478404179215431, + "epoch": 2.061591103507271, + "grad_norm": 0.56640625, + "learning_rate": 4.675348014575399e-06, + "loss": 0.0256, + "mean_token_accuracy": 0.991539441049099, + "num_tokens": 107894701.0, + "step": 905 + }, + { + "entropy": 0.6416878253221512, + "epoch": 2.063872255489022, + "grad_norm": 0.52734375, + "learning_rate": 4.674419222417899e-06, + "loss": 0.0254, + "mean_token_accuracy": 0.9943337291479111, + "num_tokens": 108013950.0, + "step": 906 + }, + { + "entropy": 0.6459713727235794, + "epoch": 2.0661534074707726, + "grad_norm": 0.4921875, + "learning_rate": 4.673489196102028e-06, + "loss": 0.0175, + "mean_token_accuracy": 0.9949918165802956, + "num_tokens": 108133894.0, + "step": 907 + }, + { + "entropy": 0.6449541002511978, + "epoch": 2.0684345594525233, + "grad_norm": 0.59375, + "learning_rate": 4.67255793615565e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.9954577386379242, + "num_tokens": 108253091.0, + "step": 908 + }, + { + "entropy": 0.6431689187884331, + "epoch": 2.0707157114342745, + "grad_norm": 0.5390625, + "learning_rate": 4.67162544310733e-06, + "loss": 0.0199, + "mean_token_accuracy": 0.9931415542960167, + "num_tokens": 108372675.0, + "step": 909 + }, + { + "entropy": 0.6422490254044533, + "epoch": 2.072996863416025, + "grad_norm": 0.5546875, + "learning_rate": 4.670691717486333e-06, + "loss": 0.019, + "mean_token_accuracy": 0.994296982884407, + "num_tokens": 108492532.0, + "step": 910 + }, + { + "entropy": 0.6493171378970146, + "epoch": 2.075278015397776, + "grad_norm": 0.515625, + "learning_rate": 4.669756759822625e-06, + "loss": 0.0247, + "mean_token_accuracy": 0.9914119392633438, + "num_tokens": 108612063.0, + "step": 911 + }, + { + "entropy": 0.6455090418457985, + "epoch": 2.0775591673795266, + "grad_norm": 0.41796875, + "learning_rate": 4.668820570646868e-06, + "loss": 0.0159, + "mean_token_accuracy": 0.9947303459048271, + "num_tokens": 108731151.0, + "step": 912 + }, + { + "entropy": 0.6441130638122559, + "epoch": 2.0798403193612773, + "grad_norm": 0.51171875, + "learning_rate": 4.667883150490427e-06, + "loss": 0.0179, + "mean_token_accuracy": 0.9962335973978043, + "num_tokens": 108849954.0, + "step": 913 + }, + { + "entropy": 0.644192561507225, + "epoch": 2.082121471343028, + "grad_norm": 0.5234375, + "learning_rate": 4.666944499885361e-06, + "loss": 0.0214, + "mean_token_accuracy": 0.9930669814348221, + "num_tokens": 108969323.0, + "step": 914 + }, + { + "entropy": 0.6402963250875473, + "epoch": 2.084402623324779, + "grad_norm": 0.439453125, + "learning_rate": 4.6660046193644315e-06, + "loss": 0.016, + "mean_token_accuracy": 0.9942471534013748, + "num_tokens": 109088632.0, + "step": 915 + }, + { + "entropy": 0.6394580826163292, + "epoch": 2.08668377530653, + "grad_norm": 0.46875, + "learning_rate": 4.665063509461098e-06, + "loss": 0.0235, + "mean_token_accuracy": 0.9925375580787659, + "num_tokens": 109208366.0, + "step": 916 + }, + { + "entropy": 0.643595464527607, + "epoch": 2.0889649272882806, + "grad_norm": 0.546875, + "learning_rate": 4.664121170709512e-06, + "loss": 0.0251, + "mean_token_accuracy": 0.9923162907361984, + "num_tokens": 109328035.0, + "step": 917 + }, + { + "entropy": 0.6456233933568001, + "epoch": 2.0912460792700314, + "grad_norm": 0.65625, + "learning_rate": 4.663177603644532e-06, + "loss": 0.0243, + "mean_token_accuracy": 0.9925654530525208, + "num_tokens": 109447325.0, + "step": 918 + }, + { + "entropy": 0.6403044015169144, + "epoch": 2.093527231251782, + "grad_norm": 0.55078125, + "learning_rate": 4.662232808801704e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9937315508723259, + "num_tokens": 109566831.0, + "step": 919 + }, + { + "entropy": 0.6412009596824646, + "epoch": 2.095808383233533, + "grad_norm": 0.498046875, + "learning_rate": 4.661286786717278e-06, + "loss": 0.0226, + "mean_token_accuracy": 0.9943097680807114, + "num_tokens": 109686593.0, + "step": 920 + }, + { + "entropy": 0.643731489777565, + "epoch": 2.0980895352152835, + "grad_norm": 0.482421875, + "learning_rate": 4.660339537928198e-06, + "loss": 0.0237, + "mean_token_accuracy": 0.9920297861099243, + "num_tokens": 109805860.0, + "step": 921 + }, + { + "entropy": 0.6496521756052971, + "epoch": 2.1003706871970347, + "grad_norm": 0.4765625, + "learning_rate": 4.659391062972102e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.9930063635110855, + "num_tokens": 109925667.0, + "step": 922 + }, + { + "entropy": 0.6407992318272591, + "epoch": 2.1026518391787854, + "grad_norm": 0.62890625, + "learning_rate": 4.658441362387328e-06, + "loss": 0.0194, + "mean_token_accuracy": 0.9938400238752365, + "num_tokens": 110044804.0, + "step": 923 + }, + { + "entropy": 0.6424065679311752, + "epoch": 2.104932991160536, + "grad_norm": 0.54296875, + "learning_rate": 4.657490436712907e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.9926265701651573, + "num_tokens": 110163449.0, + "step": 924 + }, + { + "entropy": 0.6422346532344818, + "epoch": 2.107214143142287, + "grad_norm": 0.421875, + "learning_rate": 4.6565382864885665e-06, + "loss": 0.0158, + "mean_token_accuracy": 0.9934867322444916, + "num_tokens": 110283057.0, + "step": 925 + }, + { + "entropy": 0.6409388557076454, + "epoch": 2.1094952951240376, + "grad_norm": 0.5546875, + "learning_rate": 4.655584912254727e-06, + "loss": 0.0209, + "mean_token_accuracy": 0.9939386770129204, + "num_tokens": 110402640.0, + "step": 926 + }, + { + "entropy": 0.6449766755104065, + "epoch": 2.1117764471057883, + "grad_norm": 0.57421875, + "learning_rate": 4.654630314552508e-06, + "loss": 0.0232, + "mean_token_accuracy": 0.9933101981878281, + "num_tokens": 110522076.0, + "step": 927 + }, + { + "entropy": 0.6416251882910728, + "epoch": 2.1140575990875394, + "grad_norm": 0.5078125, + "learning_rate": 4.653674493923718e-06, + "loss": 0.0217, + "mean_token_accuracy": 0.9934192821383476, + "num_tokens": 110641944.0, + "step": 928 + }, + { + "entropy": 0.6425573453307152, + "epoch": 2.11633875106929, + "grad_norm": 0.5859375, + "learning_rate": 4.652717450910864e-06, + "loss": 0.0281, + "mean_token_accuracy": 0.9915198236703873, + "num_tokens": 110761583.0, + "step": 929 + }, + { + "entropy": 0.6445116102695465, + "epoch": 2.118619903051041, + "grad_norm": 0.486328125, + "learning_rate": 4.651759186057144e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9957027435302734, + "num_tokens": 110880611.0, + "step": 930 + }, + { + "entropy": 0.6493979915976524, + "epoch": 2.1209010550327916, + "grad_norm": 0.59765625, + "learning_rate": 4.650799699906452e-06, + "loss": 0.0178, + "mean_token_accuracy": 0.9945766106247902, + "num_tokens": 110999905.0, + "step": 931 + }, + { + "entropy": 0.6437303498387337, + "epoch": 2.1231822070145423, + "grad_norm": 0.439453125, + "learning_rate": 4.649838993003373e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.9935996830463409, + "num_tokens": 111119601.0, + "step": 932 + }, + { + "entropy": 0.6459619253873825, + "epoch": 2.125463358996293, + "grad_norm": 0.412109375, + "learning_rate": 4.648877065893186e-06, + "loss": 0.0115, + "mean_token_accuracy": 0.9956796616315842, + "num_tokens": 111239135.0, + "step": 933 + }, + { + "entropy": 0.6436245292425156, + "epoch": 2.1277445109780437, + "grad_norm": 0.470703125, + "learning_rate": 4.647913919121861e-06, + "loss": 0.0173, + "mean_token_accuracy": 0.9951276779174805, + "num_tokens": 111358844.0, + "step": 934 + }, + { + "entropy": 0.6365725472569466, + "epoch": 2.130025662959795, + "grad_norm": 0.5625, + "learning_rate": 4.646949553236064e-06, + "loss": 0.0227, + "mean_token_accuracy": 0.9926934540271759, + "num_tokens": 111478602.0, + "step": 935 + }, + { + "entropy": 0.640450119972229, + "epoch": 2.1323068149415456, + "grad_norm": 0.77734375, + "learning_rate": 4.645983968783148e-06, + "loss": 0.0285, + "mean_token_accuracy": 0.9895846769213676, + "num_tokens": 111597960.0, + "step": 936 + }, + { + "entropy": 0.6457747742533684, + "epoch": 2.1345879669232963, + "grad_norm": 0.734375, + "learning_rate": 4.645017166311163e-06, + "loss": 0.0267, + "mean_token_accuracy": 0.9923356175422668, + "num_tokens": 111718200.0, + "step": 937 + }, + { + "entropy": 0.6372727677226067, + "epoch": 2.136869118905047, + "grad_norm": 0.56640625, + "learning_rate": 4.644049146368844e-06, + "loss": 0.0232, + "mean_token_accuracy": 0.9937806203961372, + "num_tokens": 111836664.0, + "step": 938 + }, + { + "entropy": 0.6364338994026184, + "epoch": 2.1391502708867978, + "grad_norm": 0.416015625, + "learning_rate": 4.643079909505622e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.9947284236550331, + "num_tokens": 111956294.0, + "step": 939 + }, + { + "entropy": 0.64455895870924, + "epoch": 2.1414314228685485, + "grad_norm": 0.56640625, + "learning_rate": 4.642109456271618e-06, + "loss": 0.0189, + "mean_token_accuracy": 0.9929002970457077, + "num_tokens": 112075881.0, + "step": 940 + }, + { + "entropy": 0.6412783563137054, + "epoch": 2.143712574850299, + "grad_norm": 0.484375, + "learning_rate": 4.64113778721764e-06, + "loss": 0.0145, + "mean_token_accuracy": 0.9957423731684685, + "num_tokens": 112195013.0, + "step": 941 + }, + { + "entropy": 0.6389534920454025, + "epoch": 2.1459937268320504, + "grad_norm": 0.5390625, + "learning_rate": 4.640164902895192e-06, + "loss": 0.0225, + "mean_token_accuracy": 0.9929999709129333, + "num_tokens": 112315228.0, + "step": 942 + }, + { + "entropy": 0.636613018810749, + "epoch": 2.148274878813801, + "grad_norm": 0.5, + "learning_rate": 4.6391908038564615e-06, + "loss": 0.022, + "mean_token_accuracy": 0.9951353371143341, + "num_tokens": 112434637.0, + "step": 943 + }, + { + "entropy": 0.6374993845820427, + "epoch": 2.150556030795552, + "grad_norm": 0.4921875, + "learning_rate": 4.6382154906543295e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.9943896681070328, + "num_tokens": 112554175.0, + "step": 944 + }, + { + "entropy": 0.6377143263816833, + "epoch": 2.1528371827773025, + "grad_norm": 0.5078125, + "learning_rate": 4.637238963842365e-06, + "loss": 0.0217, + "mean_token_accuracy": 0.9928826317191124, + "num_tokens": 112672757.0, + "step": 945 + }, + { + "entropy": 0.6390852555632591, + "epoch": 2.1551183347590532, + "grad_norm": 0.54296875, + "learning_rate": 4.636261223974826e-06, + "loss": 0.0169, + "mean_token_accuracy": 0.994330644607544, + "num_tokens": 112791782.0, + "step": 946 + }, + { + "entropy": 0.6392199769616127, + "epoch": 2.157399486740804, + "grad_norm": 0.51953125, + "learning_rate": 4.635282271606658e-06, + "loss": 0.0177, + "mean_token_accuracy": 0.995146818459034, + "num_tokens": 112911160.0, + "step": 947 + }, + { + "entropy": 0.6351307034492493, + "epoch": 2.1596806387225547, + "grad_norm": 0.6015625, + "learning_rate": 4.634302107293497e-06, + "loss": 0.0272, + "mean_token_accuracy": 0.9938187301158905, + "num_tokens": 113029878.0, + "step": 948 + }, + { + "entropy": 0.6375966891646385, + "epoch": 2.161961790704306, + "grad_norm": 0.4765625, + "learning_rate": 4.633320731591663e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9950644001364708, + "num_tokens": 113149776.0, + "step": 949 + }, + { + "entropy": 0.6451000273227692, + "epoch": 2.1642429426860565, + "grad_norm": 0.62109375, + "learning_rate": 4.632338145058167e-06, + "loss": 0.0172, + "mean_token_accuracy": 0.9943834915757179, + "num_tokens": 113269298.0, + "step": 950 + }, + { + "entropy": 0.6390797570347786, + "epoch": 2.1665240946678073, + "grad_norm": 0.6171875, + "learning_rate": 4.631354348250706e-06, + "loss": 0.0242, + "mean_token_accuracy": 0.9931740760803223, + "num_tokens": 113389086.0, + "step": 951 + }, + { + "entropy": 0.638025663793087, + "epoch": 2.168805246649558, + "grad_norm": 0.453125, + "learning_rate": 4.630369341727665e-06, + "loss": 0.0155, + "mean_token_accuracy": 0.9946513175964355, + "num_tokens": 113508017.0, + "step": 952 + }, + { + "entropy": 0.6372638195753098, + "epoch": 2.1710863986313087, + "grad_norm": 0.51171875, + "learning_rate": 4.629383126048114e-06, + "loss": 0.023, + "mean_token_accuracy": 0.9923384934663773, + "num_tokens": 113627652.0, + "step": 953 + }, + { + "entropy": 0.6320433840155602, + "epoch": 2.1733675506130594, + "grad_norm": 0.57421875, + "learning_rate": 4.6283957017718105e-06, + "loss": 0.0234, + "mean_token_accuracy": 0.9917952120304108, + "num_tokens": 113747503.0, + "step": 954 + }, + { + "entropy": 0.6354814246296883, + "epoch": 2.1756487025948106, + "grad_norm": 0.53515625, + "learning_rate": 4.627407069459196e-06, + "loss": 0.0222, + "mean_token_accuracy": 0.9935519024729729, + "num_tokens": 113866505.0, + "step": 955 + }, + { + "entropy": 0.6381952837109566, + "epoch": 2.1779298545765613, + "grad_norm": 0.51171875, + "learning_rate": 4.626417229671401e-06, + "loss": 0.0181, + "mean_token_accuracy": 0.9929353296756744, + "num_tokens": 113986787.0, + "step": 956 + }, + { + "entropy": 0.6394190043210983, + "epoch": 2.180211006558312, + "grad_norm": 0.7109375, + "learning_rate": 4.625426182970237e-06, + "loss": 0.0282, + "mean_token_accuracy": 0.9920107275247574, + "num_tokens": 114105786.0, + "step": 957 + }, + { + "entropy": 0.6390003263950348, + "epoch": 2.1824921585400627, + "grad_norm": 0.49609375, + "learning_rate": 4.6244339299182065e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9924478381872177, + "num_tokens": 114225013.0, + "step": 958 + }, + { + "entropy": 0.640265554189682, + "epoch": 2.1847733105218134, + "grad_norm": 0.6328125, + "learning_rate": 4.62344047107849e-06, + "loss": 0.0236, + "mean_token_accuracy": 0.9927495270967484, + "num_tokens": 114344370.0, + "step": 959 + }, + { + "entropy": 0.6394573673605919, + "epoch": 2.187054462503564, + "grad_norm": 0.55078125, + "learning_rate": 4.622445807014956e-06, + "loss": 0.0242, + "mean_token_accuracy": 0.9939690008759499, + "num_tokens": 114464418.0, + "step": 960 + }, + { + "entropy": 0.6417965888977051, + "epoch": 2.189335614485315, + "grad_norm": 0.6953125, + "learning_rate": 4.621449938292159e-06, + "loss": 0.0361, + "mean_token_accuracy": 0.990020640194416, + "num_tokens": 114584724.0, + "step": 961 + }, + { + "entropy": 0.636952742934227, + "epoch": 2.191616766467066, + "grad_norm": 0.482421875, + "learning_rate": 4.620452865475331e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.9956431835889816, + "num_tokens": 114704750.0, + "step": 962 + }, + { + "entropy": 0.6370786800980568, + "epoch": 2.1938979184488168, + "grad_norm": 0.412109375, + "learning_rate": 4.6194545891303955e-06, + "loss": 0.0172, + "mean_token_accuracy": 0.9938665181398392, + "num_tokens": 114823985.0, + "step": 963 + }, + { + "entropy": 0.6380989030003548, + "epoch": 2.1961790704305675, + "grad_norm": 0.69921875, + "learning_rate": 4.618455109823952e-06, + "loss": 0.0317, + "mean_token_accuracy": 0.9911800473928452, + "num_tokens": 114943105.0, + "step": 964 + }, + { + "entropy": 0.6383858099579811, + "epoch": 2.198460222412318, + "grad_norm": 0.65234375, + "learning_rate": 4.617454428123287e-06, + "loss": 0.0194, + "mean_token_accuracy": 0.9935815408825874, + "num_tokens": 115061834.0, + "step": 965 + }, + { + "entropy": 0.6421424150466919, + "epoch": 2.200741374394069, + "grad_norm": 0.53125, + "learning_rate": 4.616452544596367e-06, + "loss": 0.0199, + "mean_token_accuracy": 0.993191160261631, + "num_tokens": 115181565.0, + "step": 966 + }, + { + "entropy": 0.6379749476909637, + "epoch": 2.2030225263758196, + "grad_norm": 0.515625, + "learning_rate": 4.615449459811843e-06, + "loss": 0.0196, + "mean_token_accuracy": 0.9930060654878616, + "num_tokens": 115300856.0, + "step": 967 + }, + { + "entropy": 0.6391274034976959, + "epoch": 2.205303678357571, + "grad_norm": 0.59375, + "learning_rate": 4.614445174339045e-06, + "loss": 0.0194, + "mean_token_accuracy": 0.9938290119171143, + "num_tokens": 115420128.0, + "step": 968 + }, + { + "entropy": 0.6390098109841347, + "epoch": 2.2075848303393215, + "grad_norm": 0.62109375, + "learning_rate": 4.613439688747988e-06, + "loss": 0.0244, + "mean_token_accuracy": 0.9930730164051056, + "num_tokens": 115539999.0, + "step": 969 + }, + { + "entropy": 0.6390435919165611, + "epoch": 2.2098659823210722, + "grad_norm": 0.462890625, + "learning_rate": 4.612433003609365e-06, + "loss": 0.0198, + "mean_token_accuracy": 0.993230365216732, + "num_tokens": 115659040.0, + "step": 970 + }, + { + "entropy": 0.639408066868782, + "epoch": 2.212147134302823, + "grad_norm": 0.53515625, + "learning_rate": 4.611425119494552e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.9945095106959343, + "num_tokens": 115778859.0, + "step": 971 + }, + { + "entropy": 0.6427251994609833, + "epoch": 2.2144282862845737, + "grad_norm": 0.61328125, + "learning_rate": 4.6104160369756025e-06, + "loss": 0.0263, + "mean_token_accuracy": 0.9926045313477516, + "num_tokens": 115898802.0, + "step": 972 + }, + { + "entropy": 0.6425415053963661, + "epoch": 2.2167094382663244, + "grad_norm": 0.4765625, + "learning_rate": 4.609405756625254e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.9953538551926613, + "num_tokens": 116018393.0, + "step": 973 + }, + { + "entropy": 0.6378829032182693, + "epoch": 2.218990590248075, + "grad_norm": 0.51953125, + "learning_rate": 4.608394279016921e-06, + "loss": 0.0234, + "mean_token_accuracy": 0.9941507950425148, + "num_tokens": 116137933.0, + "step": 974 + }, + { + "entropy": 0.6434884816408157, + "epoch": 2.2212717422298263, + "grad_norm": 0.6015625, + "learning_rate": 4.6073816047247e-06, + "loss": 0.0279, + "mean_token_accuracy": 0.9909072071313858, + "num_tokens": 116257762.0, + "step": 975 + }, + { + "entropy": 0.64150620251894, + "epoch": 2.223552894211577, + "grad_norm": 0.49609375, + "learning_rate": 4.606367734323365e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9931074306368828, + "num_tokens": 116377383.0, + "step": 976 + }, + { + "entropy": 0.6393812969326973, + "epoch": 2.2258340461933277, + "grad_norm": 0.54296875, + "learning_rate": 4.605352668388369e-06, + "loss": 0.0211, + "mean_token_accuracy": 0.9941477030515671, + "num_tokens": 116497569.0, + "step": 977 + }, + { + "entropy": 0.6368449404835701, + "epoch": 2.2281151981750784, + "grad_norm": 0.5859375, + "learning_rate": 4.6043364074958435e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9945333376526833, + "num_tokens": 116617417.0, + "step": 978 + }, + { + "entropy": 0.6419454589486122, + "epoch": 2.230396350156829, + "grad_norm": 0.515625, + "learning_rate": 4.6033189522226e-06, + "loss": 0.0235, + "mean_token_accuracy": 0.992912545800209, + "num_tokens": 116736440.0, + "step": 979 + }, + { + "entropy": 0.6460841000080109, + "epoch": 2.23267750213858, + "grad_norm": 0.447265625, + "learning_rate": 4.602300303146123e-06, + "loss": 0.0181, + "mean_token_accuracy": 0.9932864680886269, + "num_tokens": 116856410.0, + "step": 980 + }, + { + "entropy": 0.6422955989837646, + "epoch": 2.2349586541203306, + "grad_norm": 0.50390625, + "learning_rate": 4.601280460844583e-06, + "loss": 0.0184, + "mean_token_accuracy": 0.995804563164711, + "num_tokens": 116975372.0, + "step": 981 + }, + { + "entropy": 0.6418846771121025, + "epoch": 2.2372398061020817, + "grad_norm": 0.455078125, + "learning_rate": 4.6002594258968185e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.9932572320103645, + "num_tokens": 117094505.0, + "step": 982 + }, + { + "entropy": 0.6411086469888687, + "epoch": 2.2395209580838324, + "grad_norm": 0.484375, + "learning_rate": 4.599237198882351e-06, + "loss": 0.0218, + "mean_token_accuracy": 0.9925042390823364, + "num_tokens": 117213527.0, + "step": 983 + }, + { + "entropy": 0.6400792449712753, + "epoch": 2.241802110065583, + "grad_norm": 0.47265625, + "learning_rate": 4.598213780381377e-06, + "loss": 0.0199, + "mean_token_accuracy": 0.9948024079203606, + "num_tokens": 117332499.0, + "step": 984 + }, + { + "entropy": 0.6408540830016136, + "epoch": 2.244083262047334, + "grad_norm": 0.48046875, + "learning_rate": 4.59718917097477e-06, + "loss": 0.015, + "mean_token_accuracy": 0.9945340007543564, + "num_tokens": 117451870.0, + "step": 985 + }, + { + "entropy": 0.6416864395141602, + "epoch": 2.2463644140290846, + "grad_norm": 0.55078125, + "learning_rate": 4.596163371244076e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.9940404444932938, + "num_tokens": 117570995.0, + "step": 986 + }, + { + "entropy": 0.6379028409719467, + "epoch": 2.2486455660108353, + "grad_norm": 0.4140625, + "learning_rate": 4.595136381771521e-06, + "loss": 0.0103, + "mean_token_accuracy": 0.9977867305278778, + "num_tokens": 117689670.0, + "step": 987 + }, + { + "entropy": 0.6407862976193428, + "epoch": 2.250926717992586, + "grad_norm": 0.5703125, + "learning_rate": 4.594108203140004e-06, + "loss": 0.021, + "mean_token_accuracy": 0.9921007230877876, + "num_tokens": 117809465.0, + "step": 988 + }, + { + "entropy": 0.6438969820737839, + "epoch": 2.253207869974337, + "grad_norm": 0.494140625, + "learning_rate": 4.593078835933099e-06, + "loss": 0.0158, + "mean_token_accuracy": 0.9947152808308601, + "num_tokens": 117929171.0, + "step": 989 + }, + { + "entropy": 0.6416413187980652, + "epoch": 2.255489021956088, + "grad_norm": 0.51171875, + "learning_rate": 4.592048280735055e-06, + "loss": 0.0227, + "mean_token_accuracy": 0.9931126683950424, + "num_tokens": 118048842.0, + "step": 990 + }, + { + "entropy": 0.6384021937847137, + "epoch": 2.2577701739378386, + "grad_norm": 0.57421875, + "learning_rate": 4.591016538130796e-06, + "loss": 0.0196, + "mean_token_accuracy": 0.9950251430273056, + "num_tokens": 118167502.0, + "step": 991 + }, + { + "entropy": 0.6365034356713295, + "epoch": 2.2600513259195893, + "grad_norm": 0.5234375, + "learning_rate": 4.589983608705918e-06, + "loss": 0.0204, + "mean_token_accuracy": 0.9924382641911507, + "num_tokens": 118286291.0, + "step": 992 + }, + { + "entropy": 0.6411382034420967, + "epoch": 2.26233247790134, + "grad_norm": 0.7421875, + "learning_rate": 4.588949493046693e-06, + "loss": 0.0339, + "mean_token_accuracy": 0.9912251979112625, + "num_tokens": 118405718.0, + "step": 993 + }, + { + "entropy": 0.6407843381166458, + "epoch": 2.264613629883091, + "grad_norm": 0.59375, + "learning_rate": 4.587914191740064e-06, + "loss": 0.0234, + "mean_token_accuracy": 0.9935315698385239, + "num_tokens": 118524534.0, + "step": 994 + }, + { + "entropy": 0.6387510299682617, + "epoch": 2.266894781864842, + "grad_norm": 0.5078125, + "learning_rate": 4.586877705373648e-06, + "loss": 0.0217, + "mean_token_accuracy": 0.9924075081944466, + "num_tokens": 118643941.0, + "step": 995 + }, + { + "entropy": 0.6496892869472504, + "epoch": 2.2691759338465927, + "grad_norm": 0.578125, + "learning_rate": 4.585840034535736e-06, + "loss": 0.0161, + "mean_token_accuracy": 0.9940337613224983, + "num_tokens": 118764368.0, + "step": 996 + }, + { + "entropy": 0.6363160461187363, + "epoch": 2.2714570858283434, + "grad_norm": 0.546875, + "learning_rate": 4.584801179815289e-06, + "loss": 0.0236, + "mean_token_accuracy": 0.9926983565092087, + "num_tokens": 118883828.0, + "step": 997 + }, + { + "entropy": 0.6430164501070976, + "epoch": 2.273738237810094, + "grad_norm": 0.56640625, + "learning_rate": 4.583761141801941e-06, + "loss": 0.0196, + "mean_token_accuracy": 0.9929351732134819, + "num_tokens": 119002935.0, + "step": 998 + }, + { + "entropy": 0.6377171948552132, + "epoch": 2.276019389791845, + "grad_norm": 0.451171875, + "learning_rate": 4.5827199210859975e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9952782839536667, + "num_tokens": 119122493.0, + "step": 999 + }, + { + "entropy": 0.6448248848319054, + "epoch": 2.2783005417735955, + "grad_norm": 0.65625, + "learning_rate": 4.581677518258435e-06, + "loss": 0.0332, + "mean_token_accuracy": 0.9907676801085472, + "num_tokens": 119241750.0, + "step": 1000 + }, + { + "entropy": 0.6413828358054161, + "epoch": 2.2805816937553463, + "grad_norm": 0.466796875, + "learning_rate": 4.580633933910901e-06, + "loss": 0.014, + "mean_token_accuracy": 0.9953452795743942, + "num_tokens": 119361097.0, + "step": 1001 + }, + { + "entropy": 0.637774296104908, + "epoch": 2.2828628457370974, + "grad_norm": 0.64453125, + "learning_rate": 4.579589168635715e-06, + "loss": 0.0284, + "mean_token_accuracy": 0.9896834120154381, + "num_tokens": 119479615.0, + "step": 1002 + }, + { + "entropy": 0.6366004198789597, + "epoch": 2.285143997718848, + "grad_norm": 0.431640625, + "learning_rate": 4.578543223025865e-06, + "loss": 0.0173, + "mean_token_accuracy": 0.994237057864666, + "num_tokens": 119598631.0, + "step": 1003 + }, + { + "entropy": 0.6397456601262093, + "epoch": 2.287425149700599, + "grad_norm": 0.5703125, + "learning_rate": 4.577496097675009e-06, + "loss": 0.0254, + "mean_token_accuracy": 0.9918204694986343, + "num_tokens": 119717548.0, + "step": 1004 + }, + { + "entropy": 0.635946162045002, + "epoch": 2.2897063016823496, + "grad_norm": 0.5078125, + "learning_rate": 4.576447793177476e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9934871196746826, + "num_tokens": 119836517.0, + "step": 1005 + }, + { + "entropy": 0.6352904215455055, + "epoch": 2.2919874536641003, + "grad_norm": 0.51171875, + "learning_rate": 4.575398310128263e-06, + "loss": 0.021, + "mean_token_accuracy": 0.9939360097050667, + "num_tokens": 119956195.0, + "step": 1006 + }, + { + "entropy": 0.6363174766302109, + "epoch": 2.294268605645851, + "grad_norm": 0.78125, + "learning_rate": 4.574347649123036e-06, + "loss": 0.0364, + "mean_token_accuracy": 0.9912518411874771, + "num_tokens": 120075276.0, + "step": 1007 + }, + { + "entropy": 0.6375698447227478, + "epoch": 2.296549757627602, + "grad_norm": 0.54296875, + "learning_rate": 4.57329581075813e-06, + "loss": 0.0215, + "mean_token_accuracy": 0.9928524792194366, + "num_tokens": 120194678.0, + "step": 1008 + }, + { + "entropy": 0.6373138949275017, + "epoch": 2.298830909609353, + "grad_norm": 0.51171875, + "learning_rate": 4.572242795630549e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9939661398530006, + "num_tokens": 120314352.0, + "step": 1009 + }, + { + "entropy": 0.6368154585361481, + "epoch": 2.3011120615911036, + "grad_norm": 0.427734375, + "learning_rate": 4.571188604337963e-06, + "loss": 0.02, + "mean_token_accuracy": 0.9929381012916565, + "num_tokens": 120433992.0, + "step": 1010 + }, + { + "entropy": 0.6388794705271721, + "epoch": 2.3033932135728543, + "grad_norm": 0.5546875, + "learning_rate": 4.570133237478711e-06, + "loss": 0.0236, + "mean_token_accuracy": 0.9924069792032242, + "num_tokens": 120552947.0, + "step": 1011 + }, + { + "entropy": 0.6416718289256096, + "epoch": 2.305674365554605, + "grad_norm": 0.4609375, + "learning_rate": 4.5690766956517985e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9950264915823936, + "num_tokens": 120672064.0, + "step": 1012 + }, + { + "entropy": 0.6423712819814682, + "epoch": 2.3079555175363557, + "grad_norm": 0.458984375, + "learning_rate": 4.568018979456899e-06, + "loss": 0.0234, + "mean_token_accuracy": 0.9938359335064888, + "num_tokens": 120791725.0, + "step": 1013 + }, + { + "entropy": 0.6429428607225418, + "epoch": 2.3102366695181065, + "grad_norm": 0.58203125, + "learning_rate": 4.566960089494351e-06, + "loss": 0.0211, + "mean_token_accuracy": 0.9926755055785179, + "num_tokens": 120911303.0, + "step": 1014 + }, + { + "entropy": 0.637117512524128, + "epoch": 2.312517821499857, + "grad_norm": 0.404296875, + "learning_rate": 4.5659000263651615e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.9950164332985878, + "num_tokens": 121030806.0, + "step": 1015 + }, + { + "entropy": 0.6454900354146957, + "epoch": 2.3147989734816083, + "grad_norm": 0.640625, + "learning_rate": 4.564838790671e-06, + "loss": 0.0181, + "mean_token_accuracy": 0.9935033991932869, + "num_tokens": 121150440.0, + "step": 1016 + }, + { + "entropy": 0.6460636109113693, + "epoch": 2.317080125463359, + "grad_norm": 0.4453125, + "learning_rate": 4.5637763830142046e-06, + "loss": 0.0189, + "mean_token_accuracy": 0.9938573986291885, + "num_tokens": 121269915.0, + "step": 1017 + }, + { + "entropy": 0.6405893340706825, + "epoch": 2.31936127744511, + "grad_norm": 0.44921875, + "learning_rate": 4.562712803997776e-06, + "loss": 0.0195, + "mean_token_accuracy": 0.9936657100915909, + "num_tokens": 121389135.0, + "step": 1018 + }, + { + "entropy": 0.6329614296555519, + "epoch": 2.3216424294268605, + "grad_norm": 0.578125, + "learning_rate": 4.5616480542253825e-06, + "loss": 0.0113, + "mean_token_accuracy": 0.996358685195446, + "num_tokens": 121508007.0, + "step": 1019 + }, + { + "entropy": 0.6437092646956444, + "epoch": 2.323923581408611, + "grad_norm": 0.55859375, + "learning_rate": 4.5605821343013555e-06, + "loss": 0.0257, + "mean_token_accuracy": 0.9926828742027283, + "num_tokens": 121627709.0, + "step": 1020 + }, + { + "entropy": 0.6415265500545502, + "epoch": 2.3262047333903624, + "grad_norm": 0.50390625, + "learning_rate": 4.55951504483069e-06, + "loss": 0.0173, + "mean_token_accuracy": 0.9926813468337059, + "num_tokens": 121747298.0, + "step": 1021 + }, + { + "entropy": 0.6384659707546234, + "epoch": 2.328485885372113, + "grad_norm": 0.66015625, + "learning_rate": 4.558446786419045e-06, + "loss": 0.026, + "mean_token_accuracy": 0.9914373680949211, + "num_tokens": 121865962.0, + "step": 1022 + }, + { + "entropy": 0.6400692760944366, + "epoch": 2.330767037353864, + "grad_norm": 0.474609375, + "learning_rate": 4.557377359672745e-06, + "loss": 0.0147, + "mean_token_accuracy": 0.9949802756309509, + "num_tokens": 121984715.0, + "step": 1023 + }, + { + "entropy": 0.6462912261486053, + "epoch": 2.3330481893356145, + "grad_norm": 0.447265625, + "learning_rate": 4.556306765198775e-06, + "loss": 0.0199, + "mean_token_accuracy": 0.9933438301086426, + "num_tokens": 122104382.0, + "step": 1024 + }, + { + "entropy": 0.6369228959083557, + "epoch": 2.3353293413173652, + "grad_norm": 0.48828125, + "learning_rate": 4.555235003604782e-06, + "loss": 0.0249, + "mean_token_accuracy": 0.9924382045865059, + "num_tokens": 122223645.0, + "step": 1025 + }, + { + "entropy": 0.6446040645241737, + "epoch": 2.337610493299116, + "grad_norm": 0.4375, + "learning_rate": 4.55416207549908e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.9947635307908058, + "num_tokens": 122342779.0, + "step": 1026 + }, + { + "entropy": 0.6431852951645851, + "epoch": 2.3398916452808667, + "grad_norm": 0.6328125, + "learning_rate": 4.5530879814906404e-06, + "loss": 0.0307, + "mean_token_accuracy": 0.9891186356544495, + "num_tokens": 122461908.0, + "step": 1027 + }, + { + "entropy": 0.6399667784571648, + "epoch": 2.3421727972626174, + "grad_norm": 0.59375, + "learning_rate": 4.5520127221891e-06, + "loss": 0.0201, + "mean_token_accuracy": 0.9939208254218102, + "num_tokens": 122581691.0, + "step": 1028 + }, + { + "entropy": 0.6368868127465248, + "epoch": 2.3444539492443686, + "grad_norm": 0.53125, + "learning_rate": 4.5509362982047525e-06, + "loss": 0.0219, + "mean_token_accuracy": 0.9936526194214821, + "num_tokens": 122700758.0, + "step": 1029 + }, + { + "entropy": 0.6371084377169609, + "epoch": 2.3467351012261193, + "grad_norm": 0.6015625, + "learning_rate": 4.549858710148558e-06, + "loss": 0.0293, + "mean_token_accuracy": 0.9917342811822891, + "num_tokens": 122820854.0, + "step": 1030 + }, + { + "entropy": 0.6375982835888863, + "epoch": 2.34901625320787, + "grad_norm": 0.455078125, + "learning_rate": 4.548779958632134e-06, + "loss": 0.0162, + "mean_token_accuracy": 0.9948002994060516, + "num_tokens": 122940521.0, + "step": 1031 + }, + { + "entropy": 0.642249770462513, + "epoch": 2.3512974051896207, + "grad_norm": 0.51953125, + "learning_rate": 4.5477000442677575e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9942295700311661, + "num_tokens": 123059742.0, + "step": 1032 + }, + { + "entropy": 0.6370107382535934, + "epoch": 2.3535785571713714, + "grad_norm": 0.46875, + "learning_rate": 4.546618967668369e-06, + "loss": 0.0137, + "mean_token_accuracy": 0.9959599748253822, + "num_tokens": 123179047.0, + "step": 1033 + }, + { + "entropy": 0.6463580578565598, + "epoch": 2.355859709153122, + "grad_norm": 0.52734375, + "learning_rate": 4.545536729447566e-06, + "loss": 0.0224, + "mean_token_accuracy": 0.9929211959242821, + "num_tokens": 123298856.0, + "step": 1034 + }, + { + "entropy": 0.640204168856144, + "epoch": 2.3581408611348733, + "grad_norm": 0.5390625, + "learning_rate": 4.544453330219606e-06, + "loss": 0.0177, + "mean_token_accuracy": 0.994728147983551, + "num_tokens": 123418320.0, + "step": 1035 + }, + { + "entropy": 0.6407118961215019, + "epoch": 2.360422013116624, + "grad_norm": 0.51171875, + "learning_rate": 4.543368770599406e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.9948677569627762, + "num_tokens": 123538021.0, + "step": 1036 + }, + { + "entropy": 0.6387612298130989, + "epoch": 2.3627031650983747, + "grad_norm": 0.5625, + "learning_rate": 4.542283051202539e-06, + "loss": 0.022, + "mean_token_accuracy": 0.9915593713521957, + "num_tokens": 123657120.0, + "step": 1037 + }, + { + "entropy": 0.6396182850003242, + "epoch": 2.3649843170801255, + "grad_norm": 0.53515625, + "learning_rate": 4.541196172645242e-06, + "loss": 0.0166, + "mean_token_accuracy": 0.9968800619244576, + "num_tokens": 123776818.0, + "step": 1038 + }, + { + "entropy": 0.6372858732938766, + "epoch": 2.367265469061876, + "grad_norm": 0.640625, + "learning_rate": 4.540108135544403e-06, + "loss": 0.0283, + "mean_token_accuracy": 0.991791658103466, + "num_tokens": 123896357.0, + "step": 1039 + }, + { + "entropy": 0.6443905532360077, + "epoch": 2.369546621043627, + "grad_norm": 0.494140625, + "learning_rate": 4.5390189405175725e-06, + "loss": 0.0205, + "mean_token_accuracy": 0.9933754205703735, + "num_tokens": 124015270.0, + "step": 1040 + }, + { + "entropy": 0.6363228932023048, + "epoch": 2.3718277730253776, + "grad_norm": 0.58984375, + "learning_rate": 4.537928588182955e-06, + "loss": 0.0272, + "mean_token_accuracy": 0.9924478679895401, + "num_tokens": 124134640.0, + "step": 1041 + }, + { + "entropy": 0.6408818066120148, + "epoch": 2.374108925007129, + "grad_norm": 0.57421875, + "learning_rate": 4.536837079159416e-06, + "loss": 0.0245, + "mean_token_accuracy": 0.9934322088956833, + "num_tokens": 124253663.0, + "step": 1042 + }, + { + "entropy": 0.639022134244442, + "epoch": 2.3763900769888795, + "grad_norm": 0.55078125, + "learning_rate": 4.535744414066473e-06, + "loss": 0.0219, + "mean_token_accuracy": 0.9934694990515709, + "num_tokens": 124372940.0, + "step": 1043 + }, + { + "entropy": 0.6362015008926392, + "epoch": 2.37867122897063, + "grad_norm": 0.5, + "learning_rate": 4.534650593524302e-06, + "loss": 0.0222, + "mean_token_accuracy": 0.9920793771743774, + "num_tokens": 124491995.0, + "step": 1044 + }, + { + "entropy": 0.6412335559725761, + "epoch": 2.380952380952381, + "grad_norm": 0.58203125, + "learning_rate": 4.533555618153735e-06, + "loss": 0.0204, + "mean_token_accuracy": 0.9930414110422134, + "num_tokens": 124611230.0, + "step": 1045 + }, + { + "entropy": 0.6477549821138382, + "epoch": 2.3832335329341316, + "grad_norm": 0.66015625, + "learning_rate": 4.532459488576258e-06, + "loss": 0.0206, + "mean_token_accuracy": 0.9910938516259193, + "num_tokens": 124730119.0, + "step": 1046 + }, + { + "entropy": 0.6424364298582077, + "epoch": 2.3855146849158824, + "grad_norm": 0.515625, + "learning_rate": 4.531362205414013e-06, + "loss": 0.0201, + "mean_token_accuracy": 0.9929376170039177, + "num_tokens": 124848979.0, + "step": 1047 + }, + { + "entropy": 0.6455400288105011, + "epoch": 2.3877958368976335, + "grad_norm": 0.578125, + "learning_rate": 4.530263769289798e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.992613285779953, + "num_tokens": 124968986.0, + "step": 1048 + }, + { + "entropy": 0.6424897760152817, + "epoch": 2.3900769888793842, + "grad_norm": 0.58203125, + "learning_rate": 4.529164180827063e-06, + "loss": 0.0242, + "mean_token_accuracy": 0.9913064017891884, + "num_tokens": 125087654.0, + "step": 1049 + }, + { + "entropy": 0.6453209295868874, + "epoch": 2.392358140861135, + "grad_norm": 0.50390625, + "learning_rate": 4.528063440649913e-06, + "loss": 0.0178, + "mean_token_accuracy": 0.9941554665565491, + "num_tokens": 125207040.0, + "step": 1050 + }, + { + "entropy": 0.6461070850491524, + "epoch": 2.3946392928428857, + "grad_norm": 0.458984375, + "learning_rate": 4.526961549383109e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.9941353425383568, + "num_tokens": 125326284.0, + "step": 1051 + }, + { + "entropy": 0.6403311416506767, + "epoch": 2.3969204448246364, + "grad_norm": 0.515625, + "learning_rate": 4.52585850765206e-06, + "loss": 0.0222, + "mean_token_accuracy": 0.9929804056882858, + "num_tokens": 125445516.0, + "step": 1052 + }, + { + "entropy": 0.6430934742093086, + "epoch": 2.399201596806387, + "grad_norm": 0.48046875, + "learning_rate": 4.524754316082833e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.9947503581643105, + "num_tokens": 125564687.0, + "step": 1053 + }, + { + "entropy": 0.6435483247041702, + "epoch": 2.401482748788138, + "grad_norm": 0.6953125, + "learning_rate": 4.5236489753021465e-06, + "loss": 0.0211, + "mean_token_accuracy": 0.9927422255277634, + "num_tokens": 125683754.0, + "step": 1054 + }, + { + "entropy": 0.6493874192237854, + "epoch": 2.4037639007698886, + "grad_norm": 0.51171875, + "learning_rate": 4.522542485937369e-06, + "loss": 0.0162, + "mean_token_accuracy": 0.9946058169007301, + "num_tokens": 125802952.0, + "step": 1055 + }, + { + "entropy": 0.6451443582773209, + "epoch": 2.4060450527516397, + "grad_norm": 0.50390625, + "learning_rate": 4.521434848616523e-06, + "loss": 0.0159, + "mean_token_accuracy": 0.9955688044428825, + "num_tokens": 125922444.0, + "step": 1056 + }, + { + "entropy": 0.637938529253006, + "epoch": 2.4083262047333904, + "grad_norm": 0.474609375, + "learning_rate": 4.520326063968283e-06, + "loss": 0.0203, + "mean_token_accuracy": 0.9944659918546677, + "num_tokens": 126041310.0, + "step": 1057 + }, + { + "entropy": 0.6378754079341888, + "epoch": 2.410607356715141, + "grad_norm": 0.4453125, + "learning_rate": 4.5192161326219716e-06, + "loss": 0.0146, + "mean_token_accuracy": 0.9957025051116943, + "num_tokens": 126160878.0, + "step": 1058 + }, + { + "entropy": 0.6441041454672813, + "epoch": 2.412888508696892, + "grad_norm": 0.58203125, + "learning_rate": 4.5181050552075665e-06, + "loss": 0.0205, + "mean_token_accuracy": 0.9937355890870094, + "num_tokens": 126280165.0, + "step": 1059 + }, + { + "entropy": 0.6401591822504997, + "epoch": 2.4151696606786426, + "grad_norm": 0.640625, + "learning_rate": 4.516992832355694e-06, + "loss": 0.018, + "mean_token_accuracy": 0.9927733615040779, + "num_tokens": 126398710.0, + "step": 1060 + }, + { + "entropy": 0.6446267366409302, + "epoch": 2.4174508126603937, + "grad_norm": 0.53515625, + "learning_rate": 4.515879464697629e-06, + "loss": 0.0196, + "mean_token_accuracy": 0.9934676885604858, + "num_tokens": 126518534.0, + "step": 1061 + }, + { + "entropy": 0.640055276453495, + "epoch": 2.4197319646421445, + "grad_norm": 0.62109375, + "learning_rate": 4.514764952865297e-06, + "loss": 0.0293, + "mean_token_accuracy": 0.9923874512314796, + "num_tokens": 126637643.0, + "step": 1062 + }, + { + "entropy": 0.6427299380302429, + "epoch": 2.422013116623895, + "grad_norm": 0.59765625, + "learning_rate": 4.513649297491275e-06, + "loss": 0.034, + "mean_token_accuracy": 0.9906466975808144, + "num_tokens": 126756422.0, + "step": 1063 + }, + { + "entropy": 0.6408056318759918, + "epoch": 2.424294268605646, + "grad_norm": 0.55859375, + "learning_rate": 4.512532499208787e-06, + "loss": 0.0238, + "mean_token_accuracy": 0.9936962649226189, + "num_tokens": 126876100.0, + "step": 1064 + }, + { + "entropy": 0.642096146941185, + "epoch": 2.4265754205873966, + "grad_norm": 0.51953125, + "learning_rate": 4.511414558651706e-06, + "loss": 0.0165, + "mean_token_accuracy": 0.9951143711805344, + "num_tokens": 126996391.0, + "step": 1065 + }, + { + "entropy": 0.6348882392048836, + "epoch": 2.4288565725691473, + "grad_norm": 0.466796875, + "learning_rate": 4.5102954764545525e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9943396523594856, + "num_tokens": 127115637.0, + "step": 1066 + }, + { + "entropy": 0.6420768350362778, + "epoch": 2.431137724550898, + "grad_norm": 0.53125, + "learning_rate": 4.509175253252497e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.9934955164790154, + "num_tokens": 127234579.0, + "step": 1067 + }, + { + "entropy": 0.6362888365983963, + "epoch": 2.4334188765326488, + "grad_norm": 0.427734375, + "learning_rate": 4.508053889681357e-06, + "loss": 0.0175, + "mean_token_accuracy": 0.9943517744541168, + "num_tokens": 127353742.0, + "step": 1068 + }, + { + "entropy": 0.6351320222020149, + "epoch": 2.4357000285144, + "grad_norm": 0.423828125, + "learning_rate": 4.5069313863775956e-06, + "loss": 0.0148, + "mean_token_accuracy": 0.9939227029681206, + "num_tokens": 127472724.0, + "step": 1069 + }, + { + "entropy": 0.6404151245951653, + "epoch": 2.4379811804961506, + "grad_norm": 0.51953125, + "learning_rate": 4.505807743978325e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.9935252889990807, + "num_tokens": 127592550.0, + "step": 1070 + }, + { + "entropy": 0.6342273280024529, + "epoch": 2.4402623324779014, + "grad_norm": 0.64453125, + "learning_rate": 4.5046829631213014e-06, + "loss": 0.0251, + "mean_token_accuracy": 0.9921693131327629, + "num_tokens": 127711617.0, + "step": 1071 + }, + { + "entropy": 0.6379176080226898, + "epoch": 2.442543484459652, + "grad_norm": 0.490234375, + "learning_rate": 4.503557044444931e-06, + "loss": 0.0212, + "mean_token_accuracy": 0.9936751276254654, + "num_tokens": 127830447.0, + "step": 1072 + }, + { + "entropy": 0.6387809440493584, + "epoch": 2.444824636441403, + "grad_norm": 0.53515625, + "learning_rate": 4.502429988588263e-06, + "loss": 0.0226, + "mean_token_accuracy": 0.9931355193257332, + "num_tokens": 127950503.0, + "step": 1073 + }, + { + "entropy": 0.6355446577072144, + "epoch": 2.4471057884231535, + "grad_norm": 0.5, + "learning_rate": 4.50130179619099e-06, + "loss": 0.0199, + "mean_token_accuracy": 0.9937892854213715, + "num_tokens": 128069777.0, + "step": 1074 + }, + { + "entropy": 0.640824094414711, + "epoch": 2.4493869404049047, + "grad_norm": 0.48046875, + "learning_rate": 4.500172467893455e-06, + "loss": 0.0161, + "mean_token_accuracy": 0.9951151385903358, + "num_tokens": 128188792.0, + "step": 1075 + }, + { + "entropy": 0.6394430324435234, + "epoch": 2.4516680923866554, + "grad_norm": 0.515625, + "learning_rate": 4.499042004336642e-06, + "loss": 0.0222, + "mean_token_accuracy": 0.9930424913764, + "num_tokens": 128307945.0, + "step": 1076 + }, + { + "entropy": 0.6372370645403862, + "epoch": 2.453949244368406, + "grad_norm": 0.671875, + "learning_rate": 4.497910406162182e-06, + "loss": 0.0262, + "mean_token_accuracy": 0.9946125149726868, + "num_tokens": 128427041.0, + "step": 1077 + }, + { + "entropy": 0.6328971609473228, + "epoch": 2.456230396350157, + "grad_norm": 0.7890625, + "learning_rate": 4.496777674012345e-06, + "loss": 0.0288, + "mean_token_accuracy": 0.9911246970295906, + "num_tokens": 128546085.0, + "step": 1078 + }, + { + "entropy": 0.6379141956567764, + "epoch": 2.4585115483319075, + "grad_norm": 0.5390625, + "learning_rate": 4.495643808530049e-06, + "loss": 0.0203, + "mean_token_accuracy": 0.9929580464959145, + "num_tokens": 128665339.0, + "step": 1079 + }, + { + "entropy": 0.6434521600604057, + "epoch": 2.4607927003136583, + "grad_norm": 0.65234375, + "learning_rate": 4.494508810358855e-06, + "loss": 0.0149, + "mean_token_accuracy": 0.9946243315935135, + "num_tokens": 128784808.0, + "step": 1080 + }, + { + "entropy": 0.6429071575403214, + "epoch": 2.463073852295409, + "grad_norm": 0.53515625, + "learning_rate": 4.4933726801429665e-06, + "loss": 0.022, + "mean_token_accuracy": 0.9926569759845734, + "num_tokens": 128904437.0, + "step": 1081 + }, + { + "entropy": 0.646098680794239, + "epoch": 2.46535500427716, + "grad_norm": 0.5234375, + "learning_rate": 4.4922354185272275e-06, + "loss": 0.0198, + "mean_token_accuracy": 0.9950884208083153, + "num_tokens": 129023352.0, + "step": 1082 + }, + { + "entropy": 0.6437766700983047, + "epoch": 2.467636156258911, + "grad_norm": 0.51953125, + "learning_rate": 4.491097026157127e-06, + "loss": 0.022, + "mean_token_accuracy": 0.9918876066803932, + "num_tokens": 129142687.0, + "step": 1083 + }, + { + "entropy": 0.6448287442326546, + "epoch": 2.4699173082406616, + "grad_norm": 0.71875, + "learning_rate": 4.489957503678794e-06, + "loss": 0.0216, + "mean_token_accuracy": 0.9924912080168724, + "num_tokens": 129262546.0, + "step": 1084 + }, + { + "entropy": 0.6411218494176865, + "epoch": 2.4721984602224123, + "grad_norm": 0.35546875, + "learning_rate": 4.488816851738999e-06, + "loss": 0.0098, + "mean_token_accuracy": 0.9972352385520935, + "num_tokens": 129381357.0, + "step": 1085 + }, + { + "entropy": 0.6421251818537712, + "epoch": 2.474479612204163, + "grad_norm": 0.58984375, + "learning_rate": 4.487675070985156e-06, + "loss": 0.0259, + "mean_token_accuracy": 0.992126815021038, + "num_tokens": 129500540.0, + "step": 1086 + }, + { + "entropy": 0.6451724767684937, + "epoch": 2.4767607641859137, + "grad_norm": 0.53515625, + "learning_rate": 4.4865321620653144e-06, + "loss": 0.0216, + "mean_token_accuracy": 0.9929729700088501, + "num_tokens": 129619739.0, + "step": 1087 + }, + { + "entropy": 0.6416263580322266, + "epoch": 2.479041916167665, + "grad_norm": 0.515625, + "learning_rate": 4.485388125628171e-06, + "loss": 0.0254, + "mean_token_accuracy": 0.9918057024478912, + "num_tokens": 129739003.0, + "step": 1088 + }, + { + "entropy": 0.6400829777121544, + "epoch": 2.4813230681494156, + "grad_norm": 0.482421875, + "learning_rate": 4.484242962323056e-06, + "loss": 0.0165, + "mean_token_accuracy": 0.995732493698597, + "num_tokens": 129858074.0, + "step": 1089 + }, + { + "entropy": 0.6463603004813194, + "epoch": 2.4836042201311663, + "grad_norm": 0.59765625, + "learning_rate": 4.483096672799942e-06, + "loss": 0.0204, + "mean_token_accuracy": 0.9936800301074982, + "num_tokens": 129977309.0, + "step": 1090 + }, + { + "entropy": 0.6474750265479088, + "epoch": 2.485885372112917, + "grad_norm": 0.392578125, + "learning_rate": 4.481949257709442e-06, + "loss": 0.0147, + "mean_token_accuracy": 0.9942535758018494, + "num_tokens": 130096949.0, + "step": 1091 + }, + { + "entropy": 0.6468400731682777, + "epoch": 2.4881665240946678, + "grad_norm": 0.5234375, + "learning_rate": 4.480800717702807e-06, + "loss": 0.0245, + "mean_token_accuracy": 0.9925299286842346, + "num_tokens": 130217186.0, + "step": 1092 + }, + { + "entropy": 0.6471474915742874, + "epoch": 2.4904476760764185, + "grad_norm": 0.41796875, + "learning_rate": 4.479651053431926e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9939058795571327, + "num_tokens": 130336309.0, + "step": 1093 + }, + { + "entropy": 0.6470202282071114, + "epoch": 2.492728828058169, + "grad_norm": 0.474609375, + "learning_rate": 4.4785002655493246e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.9950836151838303, + "num_tokens": 130455828.0, + "step": 1094 + }, + { + "entropy": 0.6430611312389374, + "epoch": 2.49500998003992, + "grad_norm": 0.470703125, + "learning_rate": 4.477348354708169e-06, + "loss": 0.0202, + "mean_token_accuracy": 0.9933904558420181, + "num_tokens": 130575425.0, + "step": 1095 + }, + { + "entropy": 0.6436156332492828, + "epoch": 2.497291132021671, + "grad_norm": 0.498046875, + "learning_rate": 4.476195321562262e-06, + "loss": 0.0192, + "mean_token_accuracy": 0.993250273168087, + "num_tokens": 130694331.0, + "step": 1096 + }, + { + "entropy": 0.6472144350409508, + "epoch": 2.499572284003422, + "grad_norm": 0.6484375, + "learning_rate": 4.475041166766042e-06, + "loss": 0.0269, + "mean_token_accuracy": 0.992514356970787, + "num_tokens": 130813546.0, + "step": 1097 + }, + { + "entropy": 0.6492441743612289, + "epoch": 2.5018534359851725, + "grad_norm": 0.64453125, + "learning_rate": 4.473885890974586e-06, + "loss": 0.0276, + "mean_token_accuracy": 0.9923117980360985, + "num_tokens": 130933419.0, + "step": 1098 + }, + { + "entropy": 0.6491595283150673, + "epoch": 2.5041345879669232, + "grad_norm": 0.447265625, + "learning_rate": 4.472729494843605e-06, + "loss": 0.0179, + "mean_token_accuracy": 0.9941786378622055, + "num_tokens": 131052639.0, + "step": 1099 + }, + { + "entropy": 0.6450280249118805, + "epoch": 2.506415739948674, + "grad_norm": 0.462890625, + "learning_rate": 4.471571979029448e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.9957521557807922, + "num_tokens": 131172222.0, + "step": 1100 + }, + { + "epoch": 2.506415739948674, + "eval_entropy": 0.6463129320978666, + "eval_loss": 0.02239544689655304, + "eval_mean_token_accuracy": 0.9930465352852534, + "eval_num_tokens": 131172222.0, + "eval_runtime": 177.4802, + "eval_samples_per_second": 47.245, + "eval_steps_per_second": 1.482, + "step": 1100 + }, + { + "entropy": 0.6445396915078163, + "epoch": 2.508696891930425, + "grad_norm": 0.455078125, + "learning_rate": 4.470413344189098e-06, + "loss": 0.0163, + "mean_token_accuracy": 0.9946949705481529, + "num_tokens": 131290888.0, + "step": 1101 + }, + { + "entropy": 0.6448357030749321, + "epoch": 2.510978043912176, + "grad_norm": 0.59375, + "learning_rate": 4.469253590980175e-06, + "loss": 0.0227, + "mean_token_accuracy": 0.9926307052373886, + "num_tokens": 131410071.0, + "step": 1102 + }, + { + "entropy": 0.6436197385191917, + "epoch": 2.5132591958939265, + "grad_norm": 0.46875, + "learning_rate": 4.46809272006093e-06, + "loss": 0.014, + "mean_token_accuracy": 0.9965946227312088, + "num_tokens": 131529236.0, + "step": 1103 + }, + { + "entropy": 0.6423685103654861, + "epoch": 2.5155403478756773, + "grad_norm": 0.3671875, + "learning_rate": 4.466930732090254e-06, + "loss": 0.0145, + "mean_token_accuracy": 0.9956322982907295, + "num_tokens": 131648130.0, + "step": 1104 + }, + { + "entropy": 0.6457662358880043, + "epoch": 2.517821499857428, + "grad_norm": 0.5625, + "learning_rate": 4.465767627727668e-06, + "loss": 0.0192, + "mean_token_accuracy": 0.9933006837964058, + "num_tokens": 131768228.0, + "step": 1105 + }, + { + "entropy": 0.6431275904178619, + "epoch": 2.5201026518391787, + "grad_norm": 0.56640625, + "learning_rate": 4.464603407633326e-06, + "loss": 0.0203, + "mean_token_accuracy": 0.9929965287446976, + "num_tokens": 131887518.0, + "step": 1106 + }, + { + "entropy": 0.6418722197413445, + "epoch": 2.5223838038209294, + "grad_norm": 0.703125, + "learning_rate": 4.463438072468018e-06, + "loss": 0.0318, + "mean_token_accuracy": 0.9900112152099609, + "num_tokens": 132006061.0, + "step": 1107 + }, + { + "entropy": 0.6461492255330086, + "epoch": 2.52466495580268, + "grad_norm": 0.79296875, + "learning_rate": 4.462271622893166e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.9944084137678146, + "num_tokens": 132125300.0, + "step": 1108 + }, + { + "entropy": 0.6445858404040337, + "epoch": 2.5269461077844313, + "grad_norm": 0.55078125, + "learning_rate": 4.461104059570825e-06, + "loss": 0.0245, + "mean_token_accuracy": 0.9933403059840202, + "num_tokens": 132244586.0, + "step": 1109 + }, + { + "entropy": 0.6474030017852783, + "epoch": 2.529227259766182, + "grad_norm": 0.478515625, + "learning_rate": 4.4599353831636785e-06, + "loss": 0.0209, + "mean_token_accuracy": 0.9926426932215691, + "num_tokens": 132364304.0, + "step": 1110 + }, + { + "entropy": 0.6458150967955589, + "epoch": 2.5315084117479327, + "grad_norm": 0.48046875, + "learning_rate": 4.458765594335048e-06, + "loss": 0.0217, + "mean_token_accuracy": 0.9940094128251076, + "num_tokens": 132483439.0, + "step": 1111 + }, + { + "entropy": 0.6529837101697922, + "epoch": 2.5337895637296834, + "grad_norm": 0.609375, + "learning_rate": 4.457594693748881e-06, + "loss": 0.0192, + "mean_token_accuracy": 0.9940134584903717, + "num_tokens": 132602628.0, + "step": 1112 + }, + { + "entropy": 0.6495637521147728, + "epoch": 2.536070715711434, + "grad_norm": 0.515625, + "learning_rate": 4.456422682069758e-06, + "loss": 0.0147, + "mean_token_accuracy": 0.994764655828476, + "num_tokens": 132721783.0, + "step": 1113 + }, + { + "entropy": 0.6470118537545204, + "epoch": 2.5383518676931853, + "grad_norm": 0.7421875, + "learning_rate": 4.455249559962892e-06, + "loss": 0.0267, + "mean_token_accuracy": 0.9932965636253357, + "num_tokens": 132840910.0, + "step": 1114 + }, + { + "entropy": 0.6475682109594345, + "epoch": 2.540633019674936, + "grad_norm": 0.484375, + "learning_rate": 4.454075328094123e-06, + "loss": 0.0212, + "mean_token_accuracy": 0.9924033731222153, + "num_tokens": 132960930.0, + "step": 1115 + }, + { + "entropy": 0.6452540755271912, + "epoch": 2.5429141716566868, + "grad_norm": 0.71484375, + "learning_rate": 4.452899987129922e-06, + "loss": 0.0262, + "mean_token_accuracy": 0.9935204386711121, + "num_tokens": 133081228.0, + "step": 1116 + }, + { + "entropy": 0.6446475237607956, + "epoch": 2.5451953236384375, + "grad_norm": 0.498046875, + "learning_rate": 4.4517235377373915e-06, + "loss": 0.0209, + "mean_token_accuracy": 0.9940571635961533, + "num_tokens": 133200620.0, + "step": 1117 + }, + { + "entropy": 0.6410806402564049, + "epoch": 2.547476475620188, + "grad_norm": 0.66015625, + "learning_rate": 4.45054598058426e-06, + "loss": 0.0259, + "mean_token_accuracy": 0.992426835000515, + "num_tokens": 133319769.0, + "step": 1118 + }, + { + "entropy": 0.6466113552451134, + "epoch": 2.549757627601939, + "grad_norm": 0.6953125, + "learning_rate": 4.449367316338887e-06, + "loss": 0.0294, + "mean_token_accuracy": 0.9909719675779343, + "num_tokens": 133439166.0, + "step": 1119 + }, + { + "entropy": 0.654993899166584, + "epoch": 2.5520387795836896, + "grad_norm": 0.404296875, + "learning_rate": 4.448187545670258e-06, + "loss": 0.0125, + "mean_token_accuracy": 0.9960489347577095, + "num_tokens": 133558888.0, + "step": 1120 + }, + { + "entropy": 0.6430490240454674, + "epoch": 2.5543199315654403, + "grad_norm": 0.5234375, + "learning_rate": 4.44700666924799e-06, + "loss": 0.0236, + "mean_token_accuracy": 0.9925662130117416, + "num_tokens": 133677725.0, + "step": 1121 + }, + { + "entropy": 0.6473485231399536, + "epoch": 2.556601083547191, + "grad_norm": 0.67578125, + "learning_rate": 4.4458246877423254e-06, + "loss": 0.0194, + "mean_token_accuracy": 0.9927801787853241, + "num_tokens": 133796863.0, + "step": 1122 + }, + { + "entropy": 0.6410003155469894, + "epoch": 2.5588822355289422, + "grad_norm": 0.47265625, + "learning_rate": 4.444641601824134e-06, + "loss": 0.0209, + "mean_token_accuracy": 0.9923785775899887, + "num_tokens": 133915759.0, + "step": 1123 + }, + { + "entropy": 0.6478569433093071, + "epoch": 2.561163387510693, + "grad_norm": 0.52734375, + "learning_rate": 4.443457412164911e-06, + "loss": 0.017, + "mean_token_accuracy": 0.993810847401619, + "num_tokens": 134034973.0, + "step": 1124 + }, + { + "entropy": 0.6486515328288078, + "epoch": 2.5634445394924437, + "grad_norm": 0.490234375, + "learning_rate": 4.442272119436781e-06, + "loss": 0.0188, + "mean_token_accuracy": 0.9950217455625534, + "num_tokens": 134155094.0, + "step": 1125 + }, + { + "entropy": 0.6491230428218842, + "epoch": 2.5657256914741944, + "grad_norm": 0.66015625, + "learning_rate": 4.441085724312494e-06, + "loss": 0.0316, + "mean_token_accuracy": 0.9900611862540245, + "num_tokens": 134274561.0, + "step": 1126 + }, + { + "entropy": 0.6477911099791527, + "epoch": 2.568006843455945, + "grad_norm": 0.5078125, + "learning_rate": 4.4398982274654235e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.9948785379528999, + "num_tokens": 134394817.0, + "step": 1127 + }, + { + "entropy": 0.6417411789298058, + "epoch": 2.5702879954376963, + "grad_norm": 0.404296875, + "learning_rate": 4.43870962956957e-06, + "loss": 0.0102, + "mean_token_accuracy": 0.9973216652870178, + "num_tokens": 134514447.0, + "step": 1128 + }, + { + "entropy": 0.646095298230648, + "epoch": 2.572569147419447, + "grad_norm": 0.56640625, + "learning_rate": 4.437519931299559e-06, + "loss": 0.0242, + "mean_token_accuracy": 0.9937388822436333, + "num_tokens": 134634950.0, + "step": 1129 + }, + { + "entropy": 0.6438381746411324, + "epoch": 2.5748502994011977, + "grad_norm": 0.435546875, + "learning_rate": 4.43632913333064e-06, + "loss": 0.0148, + "mean_token_accuracy": 0.9958721920847893, + "num_tokens": 134754112.0, + "step": 1130 + }, + { + "entropy": 0.6496585384011269, + "epoch": 2.5771314513829484, + "grad_norm": 0.40625, + "learning_rate": 4.435137236338688e-06, + "loss": 0.0153, + "mean_token_accuracy": 0.9958222582936287, + "num_tokens": 134873910.0, + "step": 1131 + }, + { + "entropy": 0.6482359990477562, + "epoch": 2.579412603364699, + "grad_norm": 0.50390625, + "learning_rate": 4.433944241000199e-06, + "loss": 0.0218, + "mean_token_accuracy": 0.9951571077108383, + "num_tokens": 134993239.0, + "step": 1132 + }, + { + "entropy": 0.6459065303206444, + "epoch": 2.58169375534645, + "grad_norm": 0.421875, + "learning_rate": 4.4327501479922955e-06, + "loss": 0.0121, + "mean_token_accuracy": 0.9952214956283569, + "num_tokens": 135112998.0, + "step": 1133 + }, + { + "entropy": 0.648823969066143, + "epoch": 2.5839749073282006, + "grad_norm": 0.41015625, + "learning_rate": 4.431554957992722e-06, + "loss": 0.0199, + "mean_token_accuracy": 0.9928578734397888, + "num_tokens": 135231994.0, + "step": 1134 + }, + { + "entropy": 0.6432489156723022, + "epoch": 2.5862560593099513, + "grad_norm": 0.53125, + "learning_rate": 4.430358671679843e-06, + "loss": 0.0209, + "mean_token_accuracy": 0.9923126175999641, + "num_tokens": 135350741.0, + "step": 1135 + }, + { + "entropy": 0.6442755311727524, + "epoch": 2.5885372112917024, + "grad_norm": 0.68359375, + "learning_rate": 4.42916128973265e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.9909946024417877, + "num_tokens": 135469916.0, + "step": 1136 + }, + { + "entropy": 0.6450408101081848, + "epoch": 2.590818363273453, + "grad_norm": 0.6171875, + "learning_rate": 4.427962812830753e-06, + "loss": 0.0223, + "mean_token_accuracy": 0.992295116186142, + "num_tokens": 135589564.0, + "step": 1137 + }, + { + "entropy": 0.6442283764481544, + "epoch": 2.593099515255204, + "grad_norm": 0.57421875, + "learning_rate": 4.426763241654383e-06, + "loss": 0.0199, + "mean_token_accuracy": 0.9927260652184486, + "num_tokens": 135709134.0, + "step": 1138 + }, + { + "entropy": 0.6440069079399109, + "epoch": 2.5953806672369546, + "grad_norm": 0.5703125, + "learning_rate": 4.425562576884396e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.9927716478705406, + "num_tokens": 135828121.0, + "step": 1139 + }, + { + "entropy": 0.6461745277047157, + "epoch": 2.5976618192187053, + "grad_norm": 0.5, + "learning_rate": 4.424360819202264e-06, + "loss": 0.0159, + "mean_token_accuracy": 0.9946931526064873, + "num_tokens": 135947221.0, + "step": 1140 + }, + { + "entropy": 0.6459729447960854, + "epoch": 2.5999429712004565, + "grad_norm": 0.482421875, + "learning_rate": 4.423157969290081e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.9935640394687653, + "num_tokens": 136066191.0, + "step": 1141 + }, + { + "entropy": 0.6463459506630898, + "epoch": 2.602224123182207, + "grad_norm": 0.57421875, + "learning_rate": 4.421954027830565e-06, + "loss": 0.0251, + "mean_token_accuracy": 0.991445355117321, + "num_tokens": 136185838.0, + "step": 1142 + }, + { + "entropy": 0.6463172435760498, + "epoch": 2.604505275163958, + "grad_norm": 0.45703125, + "learning_rate": 4.4207489955070465e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.9937485978007317, + "num_tokens": 136304733.0, + "step": 1143 + }, + { + "entropy": 0.6474164128303528, + "epoch": 2.6067864271457086, + "grad_norm": 0.498046875, + "learning_rate": 4.419542873003479e-06, + "loss": 0.0204, + "mean_token_accuracy": 0.9934702590107918, + "num_tokens": 136424561.0, + "step": 1144 + }, + { + "entropy": 0.6465034857392311, + "epoch": 2.6090675791274593, + "grad_norm": 0.64453125, + "learning_rate": 4.418335661004436e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9937256425619125, + "num_tokens": 136543783.0, + "step": 1145 + }, + { + "entropy": 0.6469360142946243, + "epoch": 2.61134873110921, + "grad_norm": 0.56640625, + "learning_rate": 4.417127360195107e-06, + "loss": 0.0242, + "mean_token_accuracy": 0.9933875426650047, + "num_tokens": 136663283.0, + "step": 1146 + }, + { + "entropy": 0.6486634761095047, + "epoch": 2.613629883090961, + "grad_norm": 0.58203125, + "learning_rate": 4.415917971261299e-06, + "loss": 0.0262, + "mean_token_accuracy": 0.9923209547996521, + "num_tokens": 136783179.0, + "step": 1147 + }, + { + "entropy": 0.6522021070122719, + "epoch": 2.6159110350727115, + "grad_norm": 0.46875, + "learning_rate": 4.414707494889439e-06, + "loss": 0.0145, + "mean_token_accuracy": 0.9967024698853493, + "num_tokens": 136902612.0, + "step": 1148 + }, + { + "entropy": 0.6517991349101067, + "epoch": 2.6181921870544627, + "grad_norm": 0.64453125, + "learning_rate": 4.413495931766571e-06, + "loss": 0.0212, + "mean_token_accuracy": 0.9917541816830635, + "num_tokens": 137022520.0, + "step": 1149 + }, + { + "entropy": 0.6492122113704681, + "epoch": 2.6204733390362134, + "grad_norm": 0.474609375, + "learning_rate": 4.412283282580352e-06, + "loss": 0.0097, + "mean_token_accuracy": 0.9968208372592926, + "num_tokens": 137141462.0, + "step": 1150 + }, + { + "entropy": 0.6446317359805107, + "epoch": 2.622754491017964, + "grad_norm": 0.5546875, + "learning_rate": 4.41106954801906e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.9947995916008949, + "num_tokens": 137260999.0, + "step": 1151 + }, + { + "entropy": 0.6461166068911552, + "epoch": 2.625035642999715, + "grad_norm": 0.486328125, + "learning_rate": 4.409854728771588e-06, + "loss": 0.0206, + "mean_token_accuracy": 0.9945811331272125, + "num_tokens": 137380639.0, + "step": 1152 + }, + { + "entropy": 0.6422529220581055, + "epoch": 2.6273167949814655, + "grad_norm": 0.58203125, + "learning_rate": 4.4086388255274425e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.9928118586540222, + "num_tokens": 137499595.0, + "step": 1153 + }, + { + "entropy": 0.6429700180888176, + "epoch": 2.6295979469632167, + "grad_norm": 0.7109375, + "learning_rate": 4.407421838976747e-06, + "loss": 0.0197, + "mean_token_accuracy": 0.9931432902812958, + "num_tokens": 137619213.0, + "step": 1154 + }, + { + "entropy": 0.6506702825427055, + "epoch": 2.6318790989449674, + "grad_norm": 0.470703125, + "learning_rate": 4.40620376981024e-06, + "loss": 0.0155, + "mean_token_accuracy": 0.9952447935938835, + "num_tokens": 137738698.0, + "step": 1155 + }, + { + "entropy": 0.6407376825809479, + "epoch": 2.634160250926718, + "grad_norm": 0.60546875, + "learning_rate": 4.404984618719275e-06, + "loss": 0.0261, + "mean_token_accuracy": 0.9916696101427078, + "num_tokens": 137857577.0, + "step": 1156 + }, + { + "entropy": 0.6428218334913254, + "epoch": 2.636441402908469, + "grad_norm": 0.640625, + "learning_rate": 4.403764386395817e-06, + "loss": 0.016, + "mean_token_accuracy": 0.9946346208453178, + "num_tokens": 137977603.0, + "step": 1157 + }, + { + "entropy": 0.6477707326412201, + "epoch": 2.6387225548902196, + "grad_norm": 0.4453125, + "learning_rate": 4.402543073532446e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9944760352373123, + "num_tokens": 138097609.0, + "step": 1158 + }, + { + "entropy": 0.653021790087223, + "epoch": 2.6410037068719703, + "grad_norm": 0.44140625, + "learning_rate": 4.401320680822357e-06, + "loss": 0.0145, + "mean_token_accuracy": 0.9956031814217567, + "num_tokens": 138217691.0, + "step": 1159 + }, + { + "entropy": 0.6435412988066673, + "epoch": 2.643284858853721, + "grad_norm": 0.99609375, + "learning_rate": 4.400097208959357e-06, + "loss": 0.0286, + "mean_token_accuracy": 0.9916277304291725, + "num_tokens": 138338065.0, + "step": 1160 + }, + { + "entropy": 0.6484354138374329, + "epoch": 2.6455660108354717, + "grad_norm": 0.4609375, + "learning_rate": 4.398872658637863e-06, + "loss": 0.0178, + "mean_token_accuracy": 0.994301050901413, + "num_tokens": 138457298.0, + "step": 1161 + }, + { + "entropy": 0.6442309245467186, + "epoch": 2.6478471628172224, + "grad_norm": 0.54296875, + "learning_rate": 4.397647030552907e-06, + "loss": 0.0261, + "mean_token_accuracy": 0.9929163083434105, + "num_tokens": 138576196.0, + "step": 1162 + }, + { + "entropy": 0.6462688744068146, + "epoch": 2.6501283147989736, + "grad_norm": 0.54296875, + "learning_rate": 4.396420325400132e-06, + "loss": 0.0214, + "mean_token_accuracy": 0.9914287105202675, + "num_tokens": 138695453.0, + "step": 1163 + }, + { + "entropy": 0.6453618705272675, + "epoch": 2.6524094667807243, + "grad_norm": 0.578125, + "learning_rate": 4.3951925438757936e-06, + "loss": 0.0268, + "mean_token_accuracy": 0.9934358075261116, + "num_tokens": 138815290.0, + "step": 1164 + }, + { + "entropy": 0.6477271839976311, + "epoch": 2.654690618762475, + "grad_norm": 0.5859375, + "learning_rate": 4.3939636866767535e-06, + "loss": 0.0231, + "mean_token_accuracy": 0.9936697483062744, + "num_tokens": 138935132.0, + "step": 1165 + }, + { + "entropy": 0.647329106926918, + "epoch": 2.6569717707442257, + "grad_norm": 0.5, + "learning_rate": 4.39273375450049e-06, + "loss": 0.0238, + "mean_token_accuracy": 0.9920663684606552, + "num_tokens": 139054096.0, + "step": 1166 + }, + { + "entropy": 0.6424959227442741, + "epoch": 2.6592529227259765, + "grad_norm": 0.51953125, + "learning_rate": 4.391502748045088e-06, + "loss": 0.0206, + "mean_token_accuracy": 0.9942084401845932, + "num_tokens": 139173009.0, + "step": 1167 + }, + { + "entropy": 0.6475557088851929, + "epoch": 2.6615340747077276, + "grad_norm": 0.48828125, + "learning_rate": 4.390270668009244e-06, + "loss": 0.0228, + "mean_token_accuracy": 0.9927590414881706, + "num_tokens": 139292270.0, + "step": 1168 + }, + { + "entropy": 0.6457214057445526, + "epoch": 2.6638152266894783, + "grad_norm": 0.5, + "learning_rate": 4.38903751509226e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.994793213903904, + "num_tokens": 139411932.0, + "step": 1169 + }, + { + "entropy": 0.6480599716305733, + "epoch": 2.666096378671229, + "grad_norm": 0.6015625, + "learning_rate": 4.3878032899940534e-06, + "loss": 0.0272, + "mean_token_accuracy": 0.9924744293093681, + "num_tokens": 139531186.0, + "step": 1170 + }, + { + "entropy": 0.6498171910643578, + "epoch": 2.66837753065298, + "grad_norm": 0.443359375, + "learning_rate": 4.386567993415144e-06, + "loss": 0.0192, + "mean_token_accuracy": 0.9927224963903427, + "num_tokens": 139650867.0, + "step": 1171 + }, + { + "entropy": 0.6491375118494034, + "epoch": 2.6706586826347305, + "grad_norm": 0.421875, + "learning_rate": 4.3853316260566635e-06, + "loss": 0.019, + "mean_token_accuracy": 0.9941316023468971, + "num_tokens": 139770170.0, + "step": 1172 + }, + { + "entropy": 0.6510792598128319, + "epoch": 2.672939834616481, + "grad_norm": 0.5703125, + "learning_rate": 4.384094188620349e-06, + "loss": 0.0165, + "mean_token_accuracy": 0.9939843267202377, + "num_tokens": 139888822.0, + "step": 1173 + }, + { + "entropy": 0.6481146365404129, + "epoch": 2.675220986598232, + "grad_norm": 0.5859375, + "learning_rate": 4.3828556818085485e-06, + "loss": 0.0255, + "mean_token_accuracy": 0.9928008541464806, + "num_tokens": 140008199.0, + "step": 1174 + }, + { + "entropy": 0.6431998014450073, + "epoch": 2.6775021385799826, + "grad_norm": 0.46875, + "learning_rate": 4.3816161063242115e-06, + "loss": 0.0224, + "mean_token_accuracy": 0.9930864572525024, + "num_tokens": 140126814.0, + "step": 1175 + }, + { + "entropy": 0.6448974683880806, + "epoch": 2.679783290561734, + "grad_norm": 0.53515625, + "learning_rate": 4.3803754628708995e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9937411323189735, + "num_tokens": 140246704.0, + "step": 1176 + }, + { + "entropy": 0.641652300953865, + "epoch": 2.6820644425434845, + "grad_norm": 0.52734375, + "learning_rate": 4.379133752152776e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.994914822280407, + "num_tokens": 140366462.0, + "step": 1177 + }, + { + "entropy": 0.6454568728804588, + "epoch": 2.6843455945252352, + "grad_norm": 0.62109375, + "learning_rate": 4.377890974874614e-06, + "loss": 0.0257, + "mean_token_accuracy": 0.9917562454938889, + "num_tokens": 140486821.0, + "step": 1178 + }, + { + "entropy": 0.6510927230119705, + "epoch": 2.686626746506986, + "grad_norm": 0.48828125, + "learning_rate": 4.376647131741787e-06, + "loss": 0.0217, + "mean_token_accuracy": 0.9940659776329994, + "num_tokens": 140606201.0, + "step": 1179 + }, + { + "entropy": 0.6487446278333664, + "epoch": 2.6889078984887367, + "grad_norm": 0.67578125, + "learning_rate": 4.375402223460279e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.993372231721878, + "num_tokens": 140725315.0, + "step": 1180 + }, + { + "entropy": 0.655800573527813, + "epoch": 2.691189050470488, + "grad_norm": 0.46875, + "learning_rate": 4.3741562507366754e-06, + "loss": 0.0179, + "mean_token_accuracy": 0.9951123520731926, + "num_tokens": 140844723.0, + "step": 1181 + }, + { + "entropy": 0.6461532339453697, + "epoch": 2.6934702024522386, + "grad_norm": 0.609375, + "learning_rate": 4.3729092142781655e-06, + "loss": 0.0233, + "mean_token_accuracy": 0.993079274892807, + "num_tokens": 140964218.0, + "step": 1182 + }, + { + "entropy": 0.6476193442940712, + "epoch": 2.6957513544339893, + "grad_norm": 0.439453125, + "learning_rate": 4.3716611147925435e-06, + "loss": 0.0153, + "mean_token_accuracy": 0.995073564350605, + "num_tokens": 141083308.0, + "step": 1183 + }, + { + "entropy": 0.6513443514704704, + "epoch": 2.69803250641574, + "grad_norm": 0.5234375, + "learning_rate": 4.370411952988207e-06, + "loss": 0.0254, + "mean_token_accuracy": 0.9929497316479683, + "num_tokens": 141202975.0, + "step": 1184 + }, + { + "entropy": 0.6504493877291679, + "epoch": 2.7003136583974907, + "grad_norm": 0.5390625, + "learning_rate": 4.369161729574155e-06, + "loss": 0.0212, + "mean_token_accuracy": 0.9925594702363014, + "num_tokens": 141321914.0, + "step": 1185 + }, + { + "entropy": 0.6488075032830238, + "epoch": 2.7025948103792414, + "grad_norm": 0.59375, + "learning_rate": 4.367910445259991e-06, + "loss": 0.0201, + "mean_token_accuracy": 0.994489423930645, + "num_tokens": 141440736.0, + "step": 1186 + }, + { + "entropy": 0.6472251489758492, + "epoch": 2.704875962360992, + "grad_norm": 0.60546875, + "learning_rate": 4.36665810075592e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.9941000714898109, + "num_tokens": 141559800.0, + "step": 1187 + }, + { + "entropy": 0.6469399556517601, + "epoch": 2.707157114342743, + "grad_norm": 0.51171875, + "learning_rate": 4.365404696772748e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.9949042722582817, + "num_tokens": 141679516.0, + "step": 1188 + }, + { + "entropy": 0.651231124997139, + "epoch": 2.709438266324494, + "grad_norm": 0.5546875, + "learning_rate": 4.364150234021883e-06, + "loss": 0.0207, + "mean_token_accuracy": 0.9923813417553902, + "num_tokens": 141799136.0, + "step": 1189 + }, + { + "entropy": 0.6481442674994469, + "epoch": 2.7117194183062447, + "grad_norm": 0.455078125, + "learning_rate": 4.362894713215334e-06, + "loss": 0.0231, + "mean_token_accuracy": 0.9930433928966522, + "num_tokens": 141919584.0, + "step": 1190 + }, + { + "entropy": 0.6489777788519859, + "epoch": 2.7140005702879955, + "grad_norm": 0.43359375, + "learning_rate": 4.361638135065711e-06, + "loss": 0.0182, + "mean_token_accuracy": 0.994399294257164, + "num_tokens": 142040620.0, + "step": 1191 + }, + { + "entropy": 0.6477468684315681, + "epoch": 2.716281722269746, + "grad_norm": 0.48046875, + "learning_rate": 4.360380500286222e-06, + "loss": 0.0226, + "mean_token_accuracy": 0.992766983807087, + "num_tokens": 142160164.0, + "step": 1192 + }, + { + "entropy": 0.6486508175730705, + "epoch": 2.718562874251497, + "grad_norm": 0.53515625, + "learning_rate": 4.359121809590678e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.9932355359196663, + "num_tokens": 142279761.0, + "step": 1193 + }, + { + "entropy": 0.6455471590161324, + "epoch": 2.720844026233248, + "grad_norm": 0.6328125, + "learning_rate": 4.357862063693486e-06, + "loss": 0.0173, + "mean_token_accuracy": 0.9937280714511871, + "num_tokens": 142398522.0, + "step": 1194 + }, + { + "entropy": 0.645331509411335, + "epoch": 2.7231251782149988, + "grad_norm": 0.57421875, + "learning_rate": 4.356601263309654e-06, + "loss": 0.0238, + "mean_token_accuracy": 0.9922391623258591, + "num_tokens": 142517371.0, + "step": 1195 + }, + { + "entropy": 0.6490024402737617, + "epoch": 2.7254063301967495, + "grad_norm": 0.42578125, + "learning_rate": 4.355339409154788e-06, + "loss": 0.0148, + "mean_token_accuracy": 0.9950054511427879, + "num_tokens": 142636844.0, + "step": 1196 + }, + { + "entropy": 0.6518825590610504, + "epoch": 2.7276874821785, + "grad_norm": 0.5859375, + "learning_rate": 4.354076501945093e-06, + "loss": 0.0226, + "mean_token_accuracy": 0.9937741011381149, + "num_tokens": 142756429.0, + "step": 1197 + }, + { + "entropy": 0.6469733342528343, + "epoch": 2.729968634160251, + "grad_norm": 0.6015625, + "learning_rate": 4.352812542397369e-06, + "loss": 0.0239, + "mean_token_accuracy": 0.993604026734829, + "num_tokens": 142875215.0, + "step": 1198 + }, + { + "entropy": 0.6454950571060181, + "epoch": 2.7322497861420016, + "grad_norm": 0.57421875, + "learning_rate": 4.351547531229016e-06, + "loss": 0.0247, + "mean_token_accuracy": 0.9935324415564537, + "num_tokens": 142994375.0, + "step": 1199 + }, + { + "entropy": 0.6500625535845757, + "epoch": 2.7345309381237524, + "grad_norm": 0.546875, + "learning_rate": 4.350281469158029e-06, + "loss": 0.0213, + "mean_token_accuracy": 0.9918430000543594, + "num_tokens": 143113296.0, + "step": 1200 + }, + { + "entropy": 0.650897927582264, + "epoch": 2.736812090105503, + "grad_norm": 0.51171875, + "learning_rate": 4.3490143569030025e-06, + "loss": 0.0163, + "mean_token_accuracy": 0.993792824447155, + "num_tokens": 143232528.0, + "step": 1201 + }, + { + "entropy": 0.6542709097266197, + "epoch": 2.739093242087254, + "grad_norm": 0.609375, + "learning_rate": 4.347746195183123e-06, + "loss": 0.0227, + "mean_token_accuracy": 0.9929731488227844, + "num_tokens": 143352186.0, + "step": 1202 + }, + { + "entropy": 0.6570945307612419, + "epoch": 2.741374394069005, + "grad_norm": 0.546875, + "learning_rate": 4.346476984718176e-06, + "loss": 0.0217, + "mean_token_accuracy": 0.9925025328993797, + "num_tokens": 143472075.0, + "step": 1203 + }, + { + "entropy": 0.6530449762940407, + "epoch": 2.7436555460507557, + "grad_norm": 0.451171875, + "learning_rate": 4.345206726228538e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.9949450492858887, + "num_tokens": 143591472.0, + "step": 1204 + }, + { + "entropy": 0.6523020043969154, + "epoch": 2.7459366980325064, + "grad_norm": 0.60546875, + "learning_rate": 4.343935420435187e-06, + "loss": 0.025, + "mean_token_accuracy": 0.9921044185757637, + "num_tokens": 143710711.0, + "step": 1205 + }, + { + "entropy": 0.6500193551182747, + "epoch": 2.748217850014257, + "grad_norm": 0.46484375, + "learning_rate": 4.34266306805969e-06, + "loss": 0.0218, + "mean_token_accuracy": 0.9948188364505768, + "num_tokens": 143829622.0, + "step": 1206 + }, + { + "entropy": 0.6495638266205788, + "epoch": 2.750499001996008, + "grad_norm": 0.55078125, + "learning_rate": 4.341389669824209e-06, + "loss": 0.0209, + "mean_token_accuracy": 0.9931730180978775, + "num_tokens": 143948590.0, + "step": 1207 + }, + { + "entropy": 0.6505590304732323, + "epoch": 2.752780153977759, + "grad_norm": 0.490234375, + "learning_rate": 4.340115226451501e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.9953299537301064, + "num_tokens": 144068239.0, + "step": 1208 + }, + { + "entropy": 0.6552454307675362, + "epoch": 2.7550613059595097, + "grad_norm": 0.490234375, + "learning_rate": 4.338839738664915e-06, + "loss": 0.0215, + "mean_token_accuracy": 0.9951419234275818, + "num_tokens": 144187606.0, + "step": 1209 + }, + { + "entropy": 0.6514817774295807, + "epoch": 2.7573424579412604, + "grad_norm": 0.6171875, + "learning_rate": 4.3375632071883935e-06, + "loss": 0.0273, + "mean_token_accuracy": 0.9917921125888824, + "num_tokens": 144306789.0, + "step": 1210 + }, + { + "entropy": 0.6482063233852386, + "epoch": 2.759623609923011, + "grad_norm": 0.51171875, + "learning_rate": 4.336285632746472e-06, + "loss": 0.0234, + "mean_token_accuracy": 0.9917678683996201, + "num_tokens": 144426146.0, + "step": 1211 + }, + { + "entropy": 0.6508849635720253, + "epoch": 2.761904761904762, + "grad_norm": 0.50390625, + "learning_rate": 4.3350070160642754e-06, + "loss": 0.0196, + "mean_token_accuracy": 0.9920036122202873, + "num_tokens": 144545143.0, + "step": 1212 + }, + { + "entropy": 0.6520933359861374, + "epoch": 2.7641859138865126, + "grad_norm": 0.6484375, + "learning_rate": 4.333727357867523e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.9944972321391106, + "num_tokens": 144665086.0, + "step": 1213 + }, + { + "entropy": 0.6509671211242676, + "epoch": 2.7664670658682633, + "grad_norm": 0.51953125, + "learning_rate": 4.3324466588825235e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.9946522563695908, + "num_tokens": 144784347.0, + "step": 1214 + }, + { + "entropy": 0.6523344293236732, + "epoch": 2.768748217850014, + "grad_norm": 0.609375, + "learning_rate": 4.331164919836177e-06, + "loss": 0.0219, + "mean_token_accuracy": 0.9930352047085762, + "num_tokens": 144904091.0, + "step": 1215 + }, + { + "entropy": 0.6554993167519569, + "epoch": 2.771029369831765, + "grad_norm": 0.515625, + "learning_rate": 4.329882141455974e-06, + "loss": 0.0225, + "mean_token_accuracy": 0.9945582300424576, + "num_tokens": 145022885.0, + "step": 1216 + }, + { + "entropy": 0.6527039930224419, + "epoch": 2.773310521813516, + "grad_norm": 0.4453125, + "learning_rate": 4.3285983244699955e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.9946563392877579, + "num_tokens": 145142320.0, + "step": 1217 + }, + { + "entropy": 0.6555519476532936, + "epoch": 2.7755916737952666, + "grad_norm": 0.484375, + "learning_rate": 4.327313469606911e-06, + "loss": 0.0213, + "mean_token_accuracy": 0.9924005717039108, + "num_tokens": 145261838.0, + "step": 1218 + }, + { + "entropy": 0.6538817435503006, + "epoch": 2.7778728257770173, + "grad_norm": 0.361328125, + "learning_rate": 4.326027577595977e-06, + "loss": 0.0127, + "mean_token_accuracy": 0.995294950902462, + "num_tokens": 145381016.0, + "step": 1219 + }, + { + "entropy": 0.6512335017323494, + "epoch": 2.780153977758768, + "grad_norm": 0.51953125, + "learning_rate": 4.324740649167044e-06, + "loss": 0.0179, + "mean_token_accuracy": 0.9950651377439499, + "num_tokens": 145500266.0, + "step": 1220 + }, + { + "entropy": 0.6514842584729195, + "epoch": 2.782435129740519, + "grad_norm": 0.48046875, + "learning_rate": 4.323452685050545e-06, + "loss": 0.0155, + "mean_token_accuracy": 0.9954509139060974, + "num_tokens": 145620036.0, + "step": 1221 + }, + { + "entropy": 0.6524055898189545, + "epoch": 2.78471628172227, + "grad_norm": 0.5546875, + "learning_rate": 4.3221636859775075e-06, + "loss": 0.0178, + "mean_token_accuracy": 0.9933938607573509, + "num_tokens": 145739259.0, + "step": 1222 + }, + { + "entropy": 0.6474889144301414, + "epoch": 2.7869974337040206, + "grad_norm": 0.51953125, + "learning_rate": 4.320873652679538e-06, + "loss": 0.0217, + "mean_token_accuracy": 0.9919271320104599, + "num_tokens": 145858522.0, + "step": 1223 + }, + { + "entropy": 0.6471715569496155, + "epoch": 2.7892785856857714, + "grad_norm": 0.412109375, + "learning_rate": 4.319582585888838e-06, + "loss": 0.0134, + "mean_token_accuracy": 0.9948296695947647, + "num_tokens": 145977946.0, + "step": 1224 + }, + { + "entropy": 0.653621070086956, + "epoch": 2.791559737667522, + "grad_norm": 0.68359375, + "learning_rate": 4.31829048633819e-06, + "loss": 0.0217, + "mean_token_accuracy": 0.9928679019212723, + "num_tokens": 146097762.0, + "step": 1225 + }, + { + "entropy": 0.6550022438168526, + "epoch": 2.793840889649273, + "grad_norm": 0.546875, + "learning_rate": 4.316997354760965e-06, + "loss": 0.0247, + "mean_token_accuracy": 0.9913444444537163, + "num_tokens": 146217475.0, + "step": 1226 + }, + { + "entropy": 0.6477186903357506, + "epoch": 2.7961220416310235, + "grad_norm": 0.49609375, + "learning_rate": 4.3157031918911204e-06, + "loss": 0.0134, + "mean_token_accuracy": 0.9951938465237617, + "num_tokens": 146336199.0, + "step": 1227 + }, + { + "entropy": 0.6459951177239418, + "epoch": 2.7984031936127742, + "grad_norm": 0.53125, + "learning_rate": 4.314407998463198e-06, + "loss": 0.0209, + "mean_token_accuracy": 0.9931886494159698, + "num_tokens": 146455426.0, + "step": 1228 + }, + { + "entropy": 0.647793136537075, + "epoch": 2.8006843455945254, + "grad_norm": 0.478515625, + "learning_rate": 4.3131117752123235e-06, + "loss": 0.0162, + "mean_token_accuracy": 0.9951990097761154, + "num_tokens": 146574463.0, + "step": 1229 + }, + { + "entropy": 0.6460911184549332, + "epoch": 2.802965497576276, + "grad_norm": 0.71484375, + "learning_rate": 4.311814522874209e-06, + "loss": 0.0263, + "mean_token_accuracy": 0.9907710701227188, + "num_tokens": 146693512.0, + "step": 1230 + }, + { + "entropy": 0.6480645909905434, + "epoch": 2.805246649558027, + "grad_norm": 0.57421875, + "learning_rate": 4.3105162421851494e-06, + "loss": 0.0248, + "mean_token_accuracy": 0.9933440834283829, + "num_tokens": 146812587.0, + "step": 1231 + }, + { + "entropy": 0.6455476507544518, + "epoch": 2.8075278015397775, + "grad_norm": 0.59765625, + "learning_rate": 4.309216933882025e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.9933163821697235, + "num_tokens": 146931638.0, + "step": 1232 + }, + { + "entropy": 0.6460504308342934, + "epoch": 2.8098089535215283, + "grad_norm": 0.7109375, + "learning_rate": 4.307916598702296e-06, + "loss": 0.0294, + "mean_token_accuracy": 0.9909825325012207, + "num_tokens": 147051426.0, + "step": 1233 + }, + { + "entropy": 0.6494486704468727, + "epoch": 2.8120901055032794, + "grad_norm": 0.51953125, + "learning_rate": 4.3066152373840105e-06, + "loss": 0.0228, + "mean_token_accuracy": 0.9937650337815285, + "num_tokens": 147171456.0, + "step": 1234 + }, + { + "entropy": 0.6550133675336838, + "epoch": 2.81437125748503, + "grad_norm": 0.66796875, + "learning_rate": 4.305312850665794e-06, + "loss": 0.0276, + "mean_token_accuracy": 0.9923286363482475, + "num_tokens": 147290889.0, + "step": 1235 + }, + { + "entropy": 0.6495688706636429, + "epoch": 2.816652409466781, + "grad_norm": 0.54296875, + "learning_rate": 4.304009439286855e-06, + "loss": 0.0143, + "mean_token_accuracy": 0.9950536116957664, + "num_tokens": 147409507.0, + "step": 1236 + }, + { + "entropy": 0.6539899930357933, + "epoch": 2.8189335614485316, + "grad_norm": 0.73046875, + "learning_rate": 4.3027050039869865e-06, + "loss": 0.0268, + "mean_token_accuracy": 0.9921978786587715, + "num_tokens": 147529169.0, + "step": 1237 + }, + { + "entropy": 0.6574088037014008, + "epoch": 2.8212147134302823, + "grad_norm": 0.439453125, + "learning_rate": 4.301399545506561e-06, + "loss": 0.0158, + "mean_token_accuracy": 0.995095744729042, + "num_tokens": 147648767.0, + "step": 1238 + }, + { + "entropy": 0.6516990810632706, + "epoch": 2.823495865412033, + "grad_norm": 0.392578125, + "learning_rate": 4.3000930645865305e-06, + "loss": 0.0144, + "mean_token_accuracy": 0.9954869225621223, + "num_tokens": 147768137.0, + "step": 1239 + }, + { + "entropy": 0.6487155258655548, + "epoch": 2.8257770173937837, + "grad_norm": 0.546875, + "learning_rate": 4.298785561968428e-06, + "loss": 0.022, + "mean_token_accuracy": 0.9932971447706223, + "num_tokens": 147886867.0, + "step": 1240 + }, + { + "entropy": 0.6475519090890884, + "epoch": 2.8280581693755344, + "grad_norm": 0.625, + "learning_rate": 4.297477038394368e-06, + "loss": 0.0262, + "mean_token_accuracy": 0.9919485002756119, + "num_tokens": 148006207.0, + "step": 1241 + }, + { + "entropy": 0.6522155776619911, + "epoch": 2.830339321357285, + "grad_norm": 0.4765625, + "learning_rate": 4.296167494607043e-06, + "loss": 0.0211, + "mean_token_accuracy": 0.9933972135186195, + "num_tokens": 148126336.0, + "step": 1242 + }, + { + "entropy": 0.653118185698986, + "epoch": 2.8326204733390363, + "grad_norm": 0.6875, + "learning_rate": 4.294856931349724e-06, + "loss": 0.0262, + "mean_token_accuracy": 0.9910801872611046, + "num_tokens": 148245665.0, + "step": 1243 + }, + { + "entropy": 0.6557525545358658, + "epoch": 2.834901625320787, + "grad_norm": 0.369140625, + "learning_rate": 4.293545349366262e-06, + "loss": 0.0119, + "mean_token_accuracy": 0.9966613799333572, + "num_tokens": 148364211.0, + "step": 1244 + }, + { + "entropy": 0.6509460210800171, + "epoch": 2.8371827773025378, + "grad_norm": 0.484375, + "learning_rate": 4.292232749401085e-06, + "loss": 0.0145, + "mean_token_accuracy": 0.9966714009642601, + "num_tokens": 148483371.0, + "step": 1245 + }, + { + "entropy": 0.65097676217556, + "epoch": 2.8394639292842885, + "grad_norm": 0.53125, + "learning_rate": 4.2909191321992e-06, + "loss": 0.0205, + "mean_token_accuracy": 0.9936327114701271, + "num_tokens": 148602946.0, + "step": 1246 + }, + { + "entropy": 0.6480291858315468, + "epoch": 2.841745081266039, + "grad_norm": 0.55859375, + "learning_rate": 4.2896044985061915e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.9941493719816208, + "num_tokens": 148722080.0, + "step": 1247 + }, + { + "entropy": 0.6528186649084091, + "epoch": 2.8440262332477904, + "grad_norm": 0.40234375, + "learning_rate": 4.288288849068218e-06, + "loss": 0.0117, + "mean_token_accuracy": 0.9966644421219826, + "num_tokens": 148841485.0, + "step": 1248 + }, + { + "entropy": 0.6562492251396179, + "epoch": 2.846307385229541, + "grad_norm": 0.54296875, + "learning_rate": 4.286972184632019e-06, + "loss": 0.02, + "mean_token_accuracy": 0.9938517659902573, + "num_tokens": 148961080.0, + "step": 1249 + }, + { + "entropy": 0.649836465716362, + "epoch": 2.848588537211292, + "grad_norm": 0.34375, + "learning_rate": 4.285654505944906e-06, + "loss": 0.0128, + "mean_token_accuracy": 0.9965597912669182, + "num_tokens": 149080293.0, + "step": 1250 + }, + { + "entropy": 0.6490693986415863, + "epoch": 2.8508696891930425, + "grad_norm": 0.427734375, + "learning_rate": 4.28433581375477e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.9950278252363205, + "num_tokens": 149200531.0, + "step": 1251 + }, + { + "entropy": 0.6488681063055992, + "epoch": 2.8531508411747932, + "grad_norm": 0.60546875, + "learning_rate": 4.283016108810073e-06, + "loss": 0.0283, + "mean_token_accuracy": 0.991633951663971, + "num_tokens": 149319953.0, + "step": 1252 + }, + { + "entropy": 0.6477656811475754, + "epoch": 2.855431993156544, + "grad_norm": 0.6328125, + "learning_rate": 4.281695391859854e-06, + "loss": 0.0241, + "mean_token_accuracy": 0.9944901764392853, + "num_tokens": 149439461.0, + "step": 1253 + }, + { + "entropy": 0.6547368466854095, + "epoch": 2.8577131451382947, + "grad_norm": 0.451171875, + "learning_rate": 4.28037366365373e-06, + "loss": 0.0137, + "mean_token_accuracy": 0.9957932904362679, + "num_tokens": 149558848.0, + "step": 1254 + }, + { + "entropy": 0.6488895639777184, + "epoch": 2.8599942971200454, + "grad_norm": 0.4765625, + "learning_rate": 4.279050924941885e-06, + "loss": 0.015, + "mean_token_accuracy": 0.9937520250678062, + "num_tokens": 149678420.0, + "step": 1255 + }, + { + "entropy": 0.6521189287304878, + "epoch": 2.8622754491017965, + "grad_norm": 0.49609375, + "learning_rate": 4.2777271764750805e-06, + "loss": 0.02, + "mean_token_accuracy": 0.9947205856442451, + "num_tokens": 149797862.0, + "step": 1256 + }, + { + "entropy": 0.6494321972131729, + "epoch": 2.8645566010835473, + "grad_norm": 0.57421875, + "learning_rate": 4.276402419004652e-06, + "loss": 0.0211, + "mean_token_accuracy": 0.9932700023055077, + "num_tokens": 149916925.0, + "step": 1257 + }, + { + "entropy": 0.6489140838384628, + "epoch": 2.866837753065298, + "grad_norm": 0.4921875, + "learning_rate": 4.275076653282504e-06, + "loss": 0.02, + "mean_token_accuracy": 0.9957459345459938, + "num_tokens": 150035906.0, + "step": 1258 + }, + { + "entropy": 0.6478017196059227, + "epoch": 2.8691189050470487, + "grad_norm": 0.58984375, + "learning_rate": 4.273749880061118e-06, + "loss": 0.0256, + "mean_token_accuracy": 0.9923037067055702, + "num_tokens": 150154895.0, + "step": 1259 + }, + { + "entropy": 0.6526802331209183, + "epoch": 2.8714000570287994, + "grad_norm": 0.345703125, + "learning_rate": 4.272422100093542e-06, + "loss": 0.0098, + "mean_token_accuracy": 0.9967421740293503, + "num_tokens": 150274136.0, + "step": 1260 + }, + { + "entropy": 0.6467663422226906, + "epoch": 2.8736812090105506, + "grad_norm": 0.435546875, + "learning_rate": 4.271093314133401e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.9935760498046875, + "num_tokens": 150393579.0, + "step": 1261 + }, + { + "entropy": 0.6517160013318062, + "epoch": 2.8759623609923013, + "grad_norm": 0.55859375, + "learning_rate": 4.269763522934888e-06, + "loss": 0.0198, + "mean_token_accuracy": 0.9951289147138596, + "num_tokens": 150513084.0, + "step": 1262 + }, + { + "entropy": 0.6529334709048271, + "epoch": 2.878243512974052, + "grad_norm": 0.68359375, + "learning_rate": 4.268432727252765e-06, + "loss": 0.031, + "mean_token_accuracy": 0.9898612350225449, + "num_tokens": 150632957.0, + "step": 1263 + }, + { + "entropy": 0.647835299372673, + "epoch": 2.8805246649558027, + "grad_norm": 0.59765625, + "learning_rate": 4.2671009278423665e-06, + "loss": 0.0193, + "mean_token_accuracy": 0.9934124946594238, + "num_tokens": 150752003.0, + "step": 1264 + }, + { + "entropy": 0.6490294486284256, + "epoch": 2.8828058169375534, + "grad_norm": 0.578125, + "learning_rate": 4.265768125459597e-06, + "loss": 0.025, + "mean_token_accuracy": 0.9938136786222458, + "num_tokens": 150871488.0, + "step": 1265 + }, + { + "entropy": 0.6475485116243362, + "epoch": 2.885086968919304, + "grad_norm": 0.5390625, + "learning_rate": 4.264434320860929e-06, + "loss": 0.0189, + "mean_token_accuracy": 0.9920951351523399, + "num_tokens": 150990824.0, + "step": 1266 + }, + { + "entropy": 0.6501416638493538, + "epoch": 2.887368120901055, + "grad_norm": 0.546875, + "learning_rate": 4.2630995148034044e-06, + "loss": 0.0236, + "mean_token_accuracy": 0.9931245222687721, + "num_tokens": 151109695.0, + "step": 1267 + }, + { + "entropy": 0.6468615233898163, + "epoch": 2.8896492728828056, + "grad_norm": 0.478515625, + "learning_rate": 4.261763708044633e-06, + "loss": 0.0141, + "mean_token_accuracy": 0.9946742951869965, + "num_tokens": 151228893.0, + "step": 1268 + }, + { + "entropy": 0.6493132337927818, + "epoch": 2.8919304248645568, + "grad_norm": 0.462890625, + "learning_rate": 4.2604269013427925e-06, + "loss": 0.0155, + "mean_token_accuracy": 0.9957385882735252, + "num_tokens": 151348646.0, + "step": 1269 + }, + { + "entropy": 0.6416860073804855, + "epoch": 2.8942115768463075, + "grad_norm": 0.5703125, + "learning_rate": 4.25908909545663e-06, + "loss": 0.022, + "mean_token_accuracy": 0.9919984117150307, + "num_tokens": 151467849.0, + "step": 1270 + }, + { + "entropy": 0.6516547128558159, + "epoch": 2.896492728828058, + "grad_norm": 0.62109375, + "learning_rate": 4.257750291145457e-06, + "loss": 0.0259, + "mean_token_accuracy": 0.9914350360631943, + "num_tokens": 151586562.0, + "step": 1271 + }, + { + "entropy": 0.6509486138820648, + "epoch": 2.898773880809809, + "grad_norm": 0.42578125, + "learning_rate": 4.256410489169154e-06, + "loss": 0.0178, + "mean_token_accuracy": 0.9950994178652763, + "num_tokens": 151706066.0, + "step": 1272 + }, + { + "entropy": 0.6508637070655823, + "epoch": 2.9010550327915596, + "grad_norm": 0.7109375, + "learning_rate": 4.255069690288166e-06, + "loss": 0.0235, + "mean_token_accuracy": 0.9941271916031837, + "num_tokens": 151825670.0, + "step": 1273 + }, + { + "entropy": 0.6513590887188911, + "epoch": 2.903336184773311, + "grad_norm": 0.578125, + "learning_rate": 4.253727895263504e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9929093718528748, + "num_tokens": 151945302.0, + "step": 1274 + }, + { + "entropy": 0.6488741636276245, + "epoch": 2.9056173367550615, + "grad_norm": 0.52734375, + "learning_rate": 4.252385104856746e-06, + "loss": 0.0153, + "mean_token_accuracy": 0.9941171333193779, + "num_tokens": 152064511.0, + "step": 1275 + }, + { + "entropy": 0.6490021347999573, + "epoch": 2.9078984887368122, + "grad_norm": 0.54296875, + "learning_rate": 4.251041319830034e-06, + "loss": 0.0215, + "mean_token_accuracy": 0.994852215051651, + "num_tokens": 152183577.0, + "step": 1276 + }, + { + "entropy": 0.6488051190972328, + "epoch": 2.910179640718563, + "grad_norm": 0.59375, + "learning_rate": 4.249696540946074e-06, + "loss": 0.0177, + "mean_token_accuracy": 0.9944665431976318, + "num_tokens": 152303659.0, + "step": 1277 + }, + { + "entropy": 0.6480104476213455, + "epoch": 2.9124607927003137, + "grad_norm": 0.66796875, + "learning_rate": 4.248350768968136e-06, + "loss": 0.0263, + "mean_token_accuracy": 0.9910087883472443, + "num_tokens": 152422215.0, + "step": 1278 + }, + { + "entropy": 0.650012731552124, + "epoch": 2.9147419446820644, + "grad_norm": 0.640625, + "learning_rate": 4.247004004660055e-06, + "loss": 0.0157, + "mean_token_accuracy": 0.9942890182137489, + "num_tokens": 152541293.0, + "step": 1279 + }, + { + "entropy": 0.648506835103035, + "epoch": 2.917023096663815, + "grad_norm": 0.451171875, + "learning_rate": 4.245656248786228e-06, + "loss": 0.0173, + "mean_token_accuracy": 0.994360476732254, + "num_tokens": 152660310.0, + "step": 1280 + }, + { + "entropy": 0.6481037810444832, + "epoch": 2.919304248645566, + "grad_norm": 0.427734375, + "learning_rate": 4.2443075021116166e-06, + "loss": 0.016, + "mean_token_accuracy": 0.9950786307454109, + "num_tokens": 152779058.0, + "step": 1281 + }, + { + "entropy": 0.6549055054783821, + "epoch": 2.9215854006273165, + "grad_norm": 0.458984375, + "learning_rate": 4.242957765401741e-06, + "loss": 0.013, + "mean_token_accuracy": 0.9950268194079399, + "num_tokens": 152898544.0, + "step": 1282 + }, + { + "entropy": 0.6508414447307587, + "epoch": 2.9238665526090677, + "grad_norm": 0.431640625, + "learning_rate": 4.241607039422687e-06, + "loss": 0.0134, + "mean_token_accuracy": 0.9962744489312172, + "num_tokens": 153017749.0, + "step": 1283 + }, + { + "entropy": 0.6463128849864006, + "epoch": 2.9261477045908184, + "grad_norm": 0.47265625, + "learning_rate": 4.2402553249411e-06, + "loss": 0.0178, + "mean_token_accuracy": 0.9950668215751648, + "num_tokens": 153137243.0, + "step": 1284 + }, + { + "entropy": 0.6467094719409943, + "epoch": 2.928428856572569, + "grad_norm": 0.4140625, + "learning_rate": 4.238902622724188e-06, + "loss": 0.0192, + "mean_token_accuracy": 0.9942401945590973, + "num_tokens": 153257032.0, + "step": 1285 + }, + { + "entropy": 0.6495672091841698, + "epoch": 2.93071000855432, + "grad_norm": 0.609375, + "learning_rate": 4.237548933539718e-06, + "loss": 0.0166, + "mean_token_accuracy": 0.9942636340856552, + "num_tokens": 153375866.0, + "step": 1286 + }, + { + "entropy": 0.6452918350696564, + "epoch": 2.9329911605360706, + "grad_norm": 0.58984375, + "learning_rate": 4.236194258156019e-06, + "loss": 0.0264, + "mean_token_accuracy": 0.991928830742836, + "num_tokens": 153495723.0, + "step": 1287 + }, + { + "entropy": 0.6448934897780418, + "epoch": 2.9352723125178217, + "grad_norm": 0.65234375, + "learning_rate": 4.234838597341977e-06, + "loss": 0.0292, + "mean_token_accuracy": 0.9912170395255089, + "num_tokens": 153615286.0, + "step": 1288 + }, + { + "entropy": 0.6485610827803612, + "epoch": 2.9375534644995724, + "grad_norm": 0.51171875, + "learning_rate": 4.233481951867039e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9940480068325996, + "num_tokens": 153734283.0, + "step": 1289 + }, + { + "entropy": 0.6488269567489624, + "epoch": 2.939834616481323, + "grad_norm": 0.73828125, + "learning_rate": 4.232124322501212e-06, + "loss": 0.0319, + "mean_token_accuracy": 0.9908827692270279, + "num_tokens": 153853441.0, + "step": 1290 + }, + { + "entropy": 0.6402896642684937, + "epoch": 2.942115768463074, + "grad_norm": 0.431640625, + "learning_rate": 4.230765710015058e-06, + "loss": 0.0158, + "mean_token_accuracy": 0.995225302875042, + "num_tokens": 153972194.0, + "step": 1291 + }, + { + "entropy": 0.6483354419469833, + "epoch": 2.9443969204448246, + "grad_norm": 0.6171875, + "learning_rate": 4.229406115179703e-06, + "loss": 0.0226, + "mean_token_accuracy": 0.993073083460331, + "num_tokens": 154091912.0, + "step": 1292 + }, + { + "entropy": 0.6441086754202843, + "epoch": 2.9466780724265753, + "grad_norm": 0.59375, + "learning_rate": 4.228045538766823e-06, + "loss": 0.0155, + "mean_token_accuracy": 0.9949968233704567, + "num_tokens": 154210762.0, + "step": 1293 + }, + { + "entropy": 0.6435507461428642, + "epoch": 2.948959224408326, + "grad_norm": 0.578125, + "learning_rate": 4.226683981548656e-06, + "loss": 0.0251, + "mean_token_accuracy": 0.9928685948252678, + "num_tokens": 154330331.0, + "step": 1294 + }, + { + "entropy": 0.6447737291455269, + "epoch": 2.9512403763900767, + "grad_norm": 0.5703125, + "learning_rate": 4.2253214442979975e-06, + "loss": 0.0198, + "mean_token_accuracy": 0.9928772002458572, + "num_tokens": 154450020.0, + "step": 1295 + }, + { + "entropy": 0.6551017165184021, + "epoch": 2.953521528371828, + "grad_norm": 0.67578125, + "learning_rate": 4.223957927788195e-06, + "loss": 0.0238, + "mean_token_accuracy": 0.9921037331223488, + "num_tokens": 154569253.0, + "step": 1296 + }, + { + "entropy": 0.6521547213196754, + "epoch": 2.9558026803535786, + "grad_norm": 0.69140625, + "learning_rate": 4.222593432793155e-06, + "loss": 0.0223, + "mean_token_accuracy": 0.9933907762169838, + "num_tokens": 154688436.0, + "step": 1297 + }, + { + "entropy": 0.6516120284795761, + "epoch": 2.9580838323353293, + "grad_norm": 0.416015625, + "learning_rate": 4.2212279600873385e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.9955504015088081, + "num_tokens": 154808317.0, + "step": 1298 + }, + { + "entropy": 0.6471229270100594, + "epoch": 2.96036498431708, + "grad_norm": 0.62109375, + "learning_rate": 4.219861510445762e-06, + "loss": 0.0256, + "mean_token_accuracy": 0.9918579608201981, + "num_tokens": 154927540.0, + "step": 1299 + }, + { + "entropy": 0.6545456573367119, + "epoch": 2.962646136298831, + "grad_norm": 0.578125, + "learning_rate": 4.2184940846439946e-06, + "loss": 0.022, + "mean_token_accuracy": 0.9940153136849403, + "num_tokens": 155047196.0, + "step": 1300 + }, + { + "entropy": 0.6481975689530373, + "epoch": 2.964927288280582, + "grad_norm": 0.34765625, + "learning_rate": 4.217125683458162e-06, + "loss": 0.0153, + "mean_token_accuracy": 0.9951271191239357, + "num_tokens": 155166568.0, + "step": 1301 + }, + { + "entropy": 0.6453498676419258, + "epoch": 2.9672084402623327, + "grad_norm": 0.52734375, + "learning_rate": 4.215756307664941e-06, + "loss": 0.0168, + "mean_token_accuracy": 0.9940994381904602, + "num_tokens": 155286237.0, + "step": 1302 + }, + { + "entropy": 0.6508210673928261, + "epoch": 2.9694895922440834, + "grad_norm": 0.51953125, + "learning_rate": 4.214385958041565e-06, + "loss": 0.0233, + "mean_token_accuracy": 0.9927458092570305, + "num_tokens": 155405780.0, + "step": 1303 + }, + { + "entropy": 0.6477246955037117, + "epoch": 2.971770744225834, + "grad_norm": 0.43359375, + "learning_rate": 4.213014635365816e-06, + "loss": 0.0249, + "mean_token_accuracy": 0.9950645342469215, + "num_tokens": 155524472.0, + "step": 1304 + }, + { + "entropy": 0.6484782621264458, + "epoch": 2.974051896207585, + "grad_norm": 0.546875, + "learning_rate": 4.2116423404160316e-06, + "loss": 0.0184, + "mean_token_accuracy": 0.9946496710181236, + "num_tokens": 155644247.0, + "step": 1305 + }, + { + "entropy": 0.6502333506941795, + "epoch": 2.9763330481893355, + "grad_norm": 0.58984375, + "learning_rate": 4.210269073971098e-06, + "loss": 0.0233, + "mean_token_accuracy": 0.9919911324977875, + "num_tokens": 155764563.0, + "step": 1306 + }, + { + "entropy": 0.6477504819631577, + "epoch": 2.9786142001710862, + "grad_norm": 0.40234375, + "learning_rate": 4.208894836810457e-06, + "loss": 0.0133, + "mean_token_accuracy": 0.9956165552139282, + "num_tokens": 155884093.0, + "step": 1307 + }, + { + "entropy": 0.6493853181600571, + "epoch": 2.980895352152837, + "grad_norm": 0.671875, + "learning_rate": 4.207519629714099e-06, + "loss": 0.0245, + "mean_token_accuracy": 0.9929186031222343, + "num_tokens": 156003087.0, + "step": 1308 + }, + { + "entropy": 0.649738647043705, + "epoch": 2.983176504134588, + "grad_norm": 0.63671875, + "learning_rate": 4.206143453462562e-06, + "loss": 0.0261, + "mean_token_accuracy": 0.9926538392901421, + "num_tokens": 156122426.0, + "step": 1309 + }, + { + "entropy": 0.6453644409775734, + "epoch": 2.985457656116339, + "grad_norm": 0.48046875, + "learning_rate": 4.204766308836941e-06, + "loss": 0.0199, + "mean_token_accuracy": 0.993846520781517, + "num_tokens": 156242213.0, + "step": 1310 + }, + { + "entropy": 0.6540574952960014, + "epoch": 2.9877388080980896, + "grad_norm": 0.80078125, + "learning_rate": 4.203388196618874e-06, + "loss": 0.0252, + "mean_token_accuracy": 0.9929093420505524, + "num_tokens": 156361371.0, + "step": 1311 + }, + { + "entropy": 0.6475647836923599, + "epoch": 2.9900199600798403, + "grad_norm": 0.404296875, + "learning_rate": 4.202009117590552e-06, + "loss": 0.0158, + "mean_token_accuracy": 0.9950703382492065, + "num_tokens": 156480428.0, + "step": 1312 + }, + { + "entropy": 0.6456519067287445, + "epoch": 2.992301112061591, + "grad_norm": 0.51953125, + "learning_rate": 4.200629072534713e-06, + "loss": 0.0215, + "mean_token_accuracy": 0.9919319078326225, + "num_tokens": 156599859.0, + "step": 1313 + }, + { + "entropy": 0.6463167145848274, + "epoch": 2.994582264043342, + "grad_norm": 0.546875, + "learning_rate": 4.1992480622346455e-06, + "loss": 0.0144, + "mean_token_accuracy": 0.994675911962986, + "num_tokens": 156718713.0, + "step": 1314 + }, + { + "entropy": 0.6481665670871735, + "epoch": 2.996863416025093, + "grad_norm": 0.60546875, + "learning_rate": 4.197866087474181e-06, + "loss": 0.024, + "mean_token_accuracy": 0.991515226662159, + "num_tokens": 156837705.0, + "step": 1315 + }, + { + "entropy": 0.6466361433267593, + "epoch": 2.9991445680068436, + "grad_norm": 0.498046875, + "learning_rate": 4.196483149037707e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.9936768934130669, + "num_tokens": 156956813.0, + "step": 1316 + }, + { + "entropy": 0.6517036358515421, + "epoch": 3.0, + "grad_norm": 1.2265625, + "learning_rate": 4.195099247710147e-06, + "loss": 0.0256, + "mean_token_accuracy": 0.9927758177121481, + "num_tokens": 157000434.0, + "step": 1317 + }, + { + "entropy": 0.6511970236897469, + "epoch": 3.0022811519817507, + "grad_norm": 0.458984375, + "learning_rate": 4.1937143842769805e-06, + "loss": 0.0206, + "mean_token_accuracy": 0.9936851561069489, + "num_tokens": 157119472.0, + "step": 1318 + }, + { + "entropy": 0.6441078633069992, + "epoch": 3.0045623039635014, + "grad_norm": 0.42578125, + "learning_rate": 4.192328559524227e-06, + "loss": 0.0142, + "mean_token_accuracy": 0.9953123778104782, + "num_tokens": 157239554.0, + "step": 1319 + }, + { + "entropy": 0.6475029662251472, + "epoch": 3.006843455945252, + "grad_norm": 0.423828125, + "learning_rate": 4.190941774238454e-06, + "loss": 0.0134, + "mean_token_accuracy": 0.9964678883552551, + "num_tokens": 157359303.0, + "step": 1320 + }, + { + "epoch": 3.006843455945252, + "eval_entropy": 0.6486519647641781, + "eval_loss": 0.021649125963449478, + "eval_mean_token_accuracy": 0.9932565267548361, + "eval_num_tokens": 157359303.0, + "eval_runtime": 177.4855, + "eval_samples_per_second": 47.243, + "eval_steps_per_second": 1.482, + "step": 1320 + }, + { + "entropy": 0.6494759395718575, + "epoch": 3.0091246079270033, + "grad_norm": 0.53515625, + "learning_rate": 4.1895540292067765e-06, + "loss": 0.0153, + "mean_token_accuracy": 0.9948647022247314, + "num_tokens": 157478451.0, + "step": 1321 + }, + { + "entropy": 0.6518705859780312, + "epoch": 3.011405759908754, + "grad_norm": 0.470703125, + "learning_rate": 4.18816532521685e-06, + "loss": 0.0179, + "mean_token_accuracy": 0.9937567412853241, + "num_tokens": 157598292.0, + "step": 1322 + }, + { + "entropy": 0.6454321220517159, + "epoch": 3.0136869118905047, + "grad_norm": 0.66015625, + "learning_rate": 4.1867756630568755e-06, + "loss": 0.0212, + "mean_token_accuracy": 0.9922845959663391, + "num_tokens": 157718552.0, + "step": 1323 + }, + { + "entropy": 0.6472807005047798, + "epoch": 3.0159680638722555, + "grad_norm": 0.52734375, + "learning_rate": 4.1853850435156e-06, + "loss": 0.0177, + "mean_token_accuracy": 0.9933486208319664, + "num_tokens": 157838152.0, + "step": 1324 + }, + { + "entropy": 0.6426367089152336, + "epoch": 3.018249215854006, + "grad_norm": 0.56640625, + "learning_rate": 4.18399346738231e-06, + "loss": 0.021, + "mean_token_accuracy": 0.9931506142020226, + "num_tokens": 157956712.0, + "step": 1325 + }, + { + "entropy": 0.6411767601966858, + "epoch": 3.020530367835757, + "grad_norm": 0.4453125, + "learning_rate": 4.18260093544684e-06, + "loss": 0.0141, + "mean_token_accuracy": 0.995045393705368, + "num_tokens": 158075438.0, + "step": 1326 + }, + { + "entropy": 0.6463169828057289, + "epoch": 3.022811519817508, + "grad_norm": 0.6484375, + "learning_rate": 4.181207448499562e-06, + "loss": 0.019, + "mean_token_accuracy": 0.9931499809026718, + "num_tokens": 158195153.0, + "step": 1327 + }, + { + "entropy": 0.6481476128101349, + "epoch": 3.025092671799259, + "grad_norm": 0.60546875, + "learning_rate": 4.179813007331394e-06, + "loss": 0.0248, + "mean_token_accuracy": 0.9925877004861832, + "num_tokens": 158314274.0, + "step": 1328 + }, + { + "entropy": 0.6407941952347755, + "epoch": 3.0273738237810095, + "grad_norm": 0.51953125, + "learning_rate": 4.178417612733792e-06, + "loss": 0.0202, + "mean_token_accuracy": 0.993454210460186, + "num_tokens": 158433185.0, + "step": 1329 + }, + { + "entropy": 0.6465622410178185, + "epoch": 3.02965497576276, + "grad_norm": 0.50390625, + "learning_rate": 4.177021265498757e-06, + "loss": 0.0119, + "mean_token_accuracy": 0.9959547892212868, + "num_tokens": 158551959.0, + "step": 1330 + }, + { + "entropy": 0.6457758769392967, + "epoch": 3.031936127744511, + "grad_norm": 0.5234375, + "learning_rate": 4.1756239664188275e-06, + "loss": 0.0206, + "mean_token_accuracy": 0.9929363280534744, + "num_tokens": 158671689.0, + "step": 1331 + }, + { + "entropy": 0.6459312960505486, + "epoch": 3.0342172797262617, + "grad_norm": 0.55859375, + "learning_rate": 4.1742257162870835e-06, + "loss": 0.0216, + "mean_token_accuracy": 0.9940722435712814, + "num_tokens": 158791061.0, + "step": 1332 + }, + { + "entropy": 0.6461051106452942, + "epoch": 3.0364984317080124, + "grad_norm": 0.455078125, + "learning_rate": 4.172826515897146e-06, + "loss": 0.0218, + "mean_token_accuracy": 0.9943135231733322, + "num_tokens": 158910399.0, + "step": 1333 + }, + { + "entropy": 0.6430990174412727, + "epoch": 3.0387795836897635, + "grad_norm": 0.5625, + "learning_rate": 4.171426366043172e-06, + "loss": 0.0201, + "mean_token_accuracy": 0.9937284961342812, + "num_tokens": 159029927.0, + "step": 1334 + }, + { + "entropy": 0.6455115452408791, + "epoch": 3.0410607356715142, + "grad_norm": 0.392578125, + "learning_rate": 4.170025267519862e-06, + "loss": 0.0146, + "mean_token_accuracy": 0.995947040617466, + "num_tokens": 159149024.0, + "step": 1335 + }, + { + "entropy": 0.6408272013068199, + "epoch": 3.043341887653265, + "grad_norm": 0.490234375, + "learning_rate": 4.168623221122451e-06, + "loss": 0.0212, + "mean_token_accuracy": 0.9919321238994598, + "num_tokens": 159268679.0, + "step": 1336 + }, + { + "entropy": 0.6392921060323715, + "epoch": 3.0456230396350157, + "grad_norm": 0.53515625, + "learning_rate": 4.167220227646713e-06, + "loss": 0.0146, + "mean_token_accuracy": 0.9943569526076317, + "num_tokens": 159388489.0, + "step": 1337 + }, + { + "entropy": 0.6477710604667664, + "epoch": 3.0479041916167664, + "grad_norm": 0.5, + "learning_rate": 4.165816287888962e-06, + "loss": 0.0204, + "mean_token_accuracy": 0.9928662106394768, + "num_tokens": 159508304.0, + "step": 1338 + }, + { + "entropy": 0.6441629156470299, + "epoch": 3.050185343598517, + "grad_norm": 0.4375, + "learning_rate": 4.164411402646045e-06, + "loss": 0.0162, + "mean_token_accuracy": 0.9950553327798843, + "num_tokens": 159627524.0, + "step": 1339 + }, + { + "entropy": 0.6470302492380142, + "epoch": 3.052466495580268, + "grad_norm": 0.5859375, + "learning_rate": 4.163005572715348e-06, + "loss": 0.0242, + "mean_token_accuracy": 0.9934737160801888, + "num_tokens": 159746651.0, + "step": 1340 + }, + { + "entropy": 0.642486222088337, + "epoch": 3.054747647562019, + "grad_norm": 0.44921875, + "learning_rate": 4.161598798894795e-06, + "loss": 0.019, + "mean_token_accuracy": 0.9955218210816383, + "num_tokens": 159865568.0, + "step": 1341 + }, + { + "entropy": 0.6436214968562126, + "epoch": 3.0570287995437697, + "grad_norm": 0.5859375, + "learning_rate": 4.160191081982841e-06, + "loss": 0.0224, + "mean_token_accuracy": 0.9931266978383064, + "num_tokens": 159984814.0, + "step": 1342 + }, + { + "entropy": 0.646051786839962, + "epoch": 3.0593099515255204, + "grad_norm": 0.51171875, + "learning_rate": 4.15878242277848e-06, + "loss": 0.0222, + "mean_token_accuracy": 0.9936570972204208, + "num_tokens": 160105306.0, + "step": 1343 + }, + { + "entropy": 0.6426781490445137, + "epoch": 3.061591103507271, + "grad_norm": 0.478515625, + "learning_rate": 4.157372822081241e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9943535253405571, + "num_tokens": 160225110.0, + "step": 1344 + }, + { + "entropy": 0.6468538641929626, + "epoch": 3.063872255489022, + "grad_norm": 0.515625, + "learning_rate": 4.155962280691184e-06, + "loss": 0.0213, + "mean_token_accuracy": 0.9924037754535675, + "num_tokens": 160344197.0, + "step": 1345 + }, + { + "entropy": 0.6427075117826462, + "epoch": 3.0661534074707726, + "grad_norm": 0.52734375, + "learning_rate": 4.154550799408906e-06, + "loss": 0.0199, + "mean_token_accuracy": 0.9936443641781807, + "num_tokens": 160463401.0, + "step": 1346 + }, + { + "entropy": 0.6429091840982437, + "epoch": 3.0684345594525233, + "grad_norm": 0.423828125, + "learning_rate": 4.153138379035537e-06, + "loss": 0.0155, + "mean_token_accuracy": 0.9948658272624016, + "num_tokens": 160582673.0, + "step": 1347 + }, + { + "entropy": 0.6432528048753738, + "epoch": 3.0707157114342745, + "grad_norm": 0.52734375, + "learning_rate": 4.1517250203727395e-06, + "loss": 0.0254, + "mean_token_accuracy": 0.9936517179012299, + "num_tokens": 160702302.0, + "step": 1348 + }, + { + "entropy": 0.6426235437393188, + "epoch": 3.072996863416025, + "grad_norm": 0.62890625, + "learning_rate": 4.150310724222708e-06, + "loss": 0.0195, + "mean_token_accuracy": 0.9923998340964317, + "num_tokens": 160821727.0, + "step": 1349 + }, + { + "entropy": 0.6487427800893784, + "epoch": 3.075278015397776, + "grad_norm": 0.5703125, + "learning_rate": 4.14889549138817e-06, + "loss": 0.0256, + "mean_token_accuracy": 0.9921269491314888, + "num_tokens": 160941556.0, + "step": 1350 + }, + { + "entropy": 0.6468837186694145, + "epoch": 3.0775591673795266, + "grad_norm": 0.416015625, + "learning_rate": 4.147479322672383e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9948612153530121, + "num_tokens": 161060994.0, + "step": 1351 + }, + { + "entropy": 0.6518030092120171, + "epoch": 3.0798403193612773, + "grad_norm": 0.462890625, + "learning_rate": 4.14606221887914e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.9949798658490181, + "num_tokens": 161180592.0, + "step": 1352 + }, + { + "entropy": 0.6416125446557999, + "epoch": 3.082121471343028, + "grad_norm": 0.54296875, + "learning_rate": 4.144644180812759e-06, + "loss": 0.0216, + "mean_token_accuracy": 0.9916021749377251, + "num_tokens": 161300400.0, + "step": 1353 + }, + { + "entropy": 0.6446148380637169, + "epoch": 3.084402623324779, + "grad_norm": 0.6484375, + "learning_rate": 4.143225209278093e-06, + "loss": 0.0285, + "mean_token_accuracy": 0.9899051040410995, + "num_tokens": 161419703.0, + "step": 1354 + }, + { + "entropy": 0.6496598124504089, + "epoch": 3.08668377530653, + "grad_norm": 0.6640625, + "learning_rate": 4.141805305080521e-06, + "loss": 0.0223, + "mean_token_accuracy": 0.9919021874666214, + "num_tokens": 161538620.0, + "step": 1355 + }, + { + "entropy": 0.6440286412835121, + "epoch": 3.0889649272882806, + "grad_norm": 0.4296875, + "learning_rate": 4.1403844690259544e-06, + "loss": 0.0163, + "mean_token_accuracy": 0.9937020018696785, + "num_tokens": 161657983.0, + "step": 1356 + }, + { + "entropy": 0.6526097506284714, + "epoch": 3.0912460792700314, + "grad_norm": 0.490234375, + "learning_rate": 4.138962701920831e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9950511157512665, + "num_tokens": 161777791.0, + "step": 1357 + }, + { + "entropy": 0.6455054134130478, + "epoch": 3.093527231251782, + "grad_norm": 0.49609375, + "learning_rate": 4.13754000457212e-06, + "loss": 0.0252, + "mean_token_accuracy": 0.9931691512465477, + "num_tokens": 161896892.0, + "step": 1358 + }, + { + "entropy": 0.6464546769857407, + "epoch": 3.095808383233533, + "grad_norm": 0.48828125, + "learning_rate": 4.136116377787317e-06, + "loss": 0.0189, + "mean_token_accuracy": 0.992920309305191, + "num_tokens": 162016025.0, + "step": 1359 + }, + { + "entropy": 0.6455173939466476, + "epoch": 3.0980895352152835, + "grad_norm": 0.546875, + "learning_rate": 4.134691822374445e-06, + "loss": 0.0231, + "mean_token_accuracy": 0.9924940317869186, + "num_tokens": 162135248.0, + "step": 1360 + }, + { + "entropy": 0.6466520950198174, + "epoch": 3.1003706871970347, + "grad_norm": 0.404296875, + "learning_rate": 4.1332663391420515e-06, + "loss": 0.0148, + "mean_token_accuracy": 0.9961616918444633, + "num_tokens": 162254307.0, + "step": 1361 + }, + { + "entropy": 0.6467810943722725, + "epoch": 3.1026518391787854, + "grad_norm": 0.6171875, + "learning_rate": 4.131839928899217e-06, + "loss": 0.0206, + "mean_token_accuracy": 0.9929270669817924, + "num_tokens": 162374619.0, + "step": 1362 + }, + { + "entropy": 0.6484431624412537, + "epoch": 3.104932991160536, + "grad_norm": 0.5703125, + "learning_rate": 4.130412592455542e-06, + "loss": 0.0197, + "mean_token_accuracy": 0.9946770071983337, + "num_tokens": 162494275.0, + "step": 1363 + }, + { + "entropy": 0.645599476993084, + "epoch": 3.107214143142287, + "grad_norm": 0.515625, + "learning_rate": 4.128984330621157e-06, + "loss": 0.019, + "mean_token_accuracy": 0.993535004556179, + "num_tokens": 162613553.0, + "step": 1364 + }, + { + "entropy": 0.6522751748561859, + "epoch": 3.1094952951240376, + "grad_norm": 0.4296875, + "learning_rate": 4.127555144206713e-06, + "loss": 0.0144, + "mean_token_accuracy": 0.9949960634112358, + "num_tokens": 162733185.0, + "step": 1365 + }, + { + "entropy": 0.6457345113158226, + "epoch": 3.1117764471057883, + "grad_norm": 0.515625, + "learning_rate": 4.126125034023392e-06, + "loss": 0.0157, + "mean_token_accuracy": 0.9932870343327522, + "num_tokens": 162852358.0, + "step": 1366 + }, + { + "entropy": 0.6422347277402878, + "epoch": 3.1140575990875394, + "grad_norm": 0.52734375, + "learning_rate": 4.124694000882894e-06, + "loss": 0.0196, + "mean_token_accuracy": 0.9951657503843307, + "num_tokens": 162972632.0, + "step": 1367 + }, + { + "entropy": 0.647073894739151, + "epoch": 3.11633875106929, + "grad_norm": 0.5234375, + "learning_rate": 4.123262045597447e-06, + "loss": 0.0189, + "mean_token_accuracy": 0.994612418115139, + "num_tokens": 163092090.0, + "step": 1368 + }, + { + "entropy": 0.6495248228311539, + "epoch": 3.118619903051041, + "grad_norm": 0.474609375, + "learning_rate": 4.121829168979802e-06, + "loss": 0.0184, + "mean_token_accuracy": 0.9936078265309334, + "num_tokens": 163211422.0, + "step": 1369 + }, + { + "entropy": 0.64598248898983, + "epoch": 3.1209010550327916, + "grad_norm": 0.62890625, + "learning_rate": 4.120395371843231e-06, + "loss": 0.0181, + "mean_token_accuracy": 0.993638426065445, + "num_tokens": 163330385.0, + "step": 1370 + }, + { + "entropy": 0.6433704495429993, + "epoch": 3.1231822070145423, + "grad_norm": 0.5, + "learning_rate": 4.11896065500153e-06, + "loss": 0.0184, + "mean_token_accuracy": 0.993085466325283, + "num_tokens": 163449264.0, + "step": 1371 + }, + { + "entropy": 0.6466878056526184, + "epoch": 3.125463358996293, + "grad_norm": 0.640625, + "learning_rate": 4.117525019269016e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9935913756489754, + "num_tokens": 163568502.0, + "step": 1372 + }, + { + "entropy": 0.6452555954456329, + "epoch": 3.1277445109780437, + "grad_norm": 0.416015625, + "learning_rate": 4.116088465460529e-06, + "loss": 0.0147, + "mean_token_accuracy": 0.9951285943388939, + "num_tokens": 163687626.0, + "step": 1373 + }, + { + "entropy": 0.6460779085755348, + "epoch": 3.130025662959795, + "grad_norm": 0.69921875, + "learning_rate": 4.114650994391428e-06, + "loss": 0.0303, + "mean_token_accuracy": 0.9915973842144012, + "num_tokens": 163807513.0, + "step": 1374 + }, + { + "entropy": 0.6441425979137421, + "epoch": 3.1323068149415456, + "grad_norm": 0.58984375, + "learning_rate": 4.113212606877596e-06, + "loss": 0.0224, + "mean_token_accuracy": 0.992511659860611, + "num_tokens": 163926529.0, + "step": 1375 + }, + { + "entropy": 0.6461644992232323, + "epoch": 3.1345879669232963, + "grad_norm": 0.53125, + "learning_rate": 4.111773303735432e-06, + "loss": 0.0245, + "mean_token_accuracy": 0.9929399788379669, + "num_tokens": 164046216.0, + "step": 1376 + }, + { + "entropy": 0.6506053805351257, + "epoch": 3.136869118905047, + "grad_norm": 0.51171875, + "learning_rate": 4.110333085781857e-06, + "loss": 0.0182, + "mean_token_accuracy": 0.993076466023922, + "num_tokens": 164165208.0, + "step": 1377 + }, + { + "entropy": 0.6449938490986824, + "epoch": 3.1391502708867978, + "grad_norm": 0.455078125, + "learning_rate": 4.108891953834312e-06, + "loss": 0.0155, + "mean_token_accuracy": 0.994999460875988, + "num_tokens": 164284080.0, + "step": 1378 + }, + { + "entropy": 0.6441327556967735, + "epoch": 3.1414314228685485, + "grad_norm": 0.609375, + "learning_rate": 4.107449908710753e-06, + "loss": 0.0209, + "mean_token_accuracy": 0.9921720772981644, + "num_tokens": 164404092.0, + "step": 1379 + }, + { + "entropy": 0.6444905996322632, + "epoch": 3.143712574850299, + "grad_norm": 0.69140625, + "learning_rate": 4.106006951229661e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.9929494112730026, + "num_tokens": 164523345.0, + "step": 1380 + }, + { + "entropy": 0.640085369348526, + "epoch": 3.1459937268320504, + "grad_norm": 0.5078125, + "learning_rate": 4.104563082210028e-06, + "loss": 0.0195, + "mean_token_accuracy": 0.994428925216198, + "num_tokens": 164643012.0, + "step": 1381 + }, + { + "entropy": 0.6424135789275169, + "epoch": 3.148274878813801, + "grad_norm": 0.423828125, + "learning_rate": 4.1031183024713665e-06, + "loss": 0.0175, + "mean_token_accuracy": 0.9948406592011452, + "num_tokens": 164762354.0, + "step": 1382 + }, + { + "entropy": 0.6451975926756859, + "epoch": 3.150556030795552, + "grad_norm": 0.474609375, + "learning_rate": 4.101672612833706e-06, + "loss": 0.0129, + "mean_token_accuracy": 0.9965225979685783, + "num_tokens": 164881486.0, + "step": 1383 + }, + { + "entropy": 0.6451384276151657, + "epoch": 3.1528371827773025, + "grad_norm": 0.59765625, + "learning_rate": 4.100226014117592e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.9935673475265503, + "num_tokens": 165000577.0, + "step": 1384 + }, + { + "entropy": 0.649875819683075, + "epoch": 3.1551183347590532, + "grad_norm": 0.54296875, + "learning_rate": 4.098778507144086e-06, + "loss": 0.0153, + "mean_token_accuracy": 0.9955001398921013, + "num_tokens": 165120092.0, + "step": 1385 + }, + { + "entropy": 0.6418671607971191, + "epoch": 3.157399486740804, + "grad_norm": 0.546875, + "learning_rate": 4.097330092734765e-06, + "loss": 0.0213, + "mean_token_accuracy": 0.992095448076725, + "num_tokens": 165238655.0, + "step": 1386 + }, + { + "entropy": 0.6456710696220398, + "epoch": 3.1596806387225547, + "grad_norm": 0.46875, + "learning_rate": 4.09588077171172e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.9929564446210861, + "num_tokens": 165358476.0, + "step": 1387 + }, + { + "entropy": 0.6415742859244347, + "epoch": 3.161961790704306, + "grad_norm": 0.390625, + "learning_rate": 4.094430544897559e-06, + "loss": 0.0144, + "mean_token_accuracy": 0.9947002902626991, + "num_tokens": 165477499.0, + "step": 1388 + }, + { + "entropy": 0.6438018903136253, + "epoch": 3.1642429426860565, + "grad_norm": 0.6640625, + "learning_rate": 4.092979413115404e-06, + "loss": 0.0219, + "mean_token_accuracy": 0.9936899542808533, + "num_tokens": 165596934.0, + "step": 1389 + }, + { + "entropy": 0.6471063643693924, + "epoch": 3.1665240946678073, + "grad_norm": 0.64453125, + "learning_rate": 4.091527377188886e-06, + "loss": 0.0261, + "mean_token_accuracy": 0.9924207851290703, + "num_tokens": 165716720.0, + "step": 1390 + }, + { + "entropy": 0.6424316465854645, + "epoch": 3.168805246649558, + "grad_norm": 0.5390625, + "learning_rate": 4.090074437942155e-06, + "loss": 0.0233, + "mean_token_accuracy": 0.9927376955747604, + "num_tokens": 165836896.0, + "step": 1391 + }, + { + "entropy": 0.645936593413353, + "epoch": 3.1710863986313087, + "grad_norm": 0.396484375, + "learning_rate": 4.088620596199872e-06, + "loss": 0.0103, + "mean_token_accuracy": 0.9958264902234077, + "num_tokens": 165956436.0, + "step": 1392 + }, + { + "entropy": 0.6425650492310524, + "epoch": 3.1733675506130594, + "grad_norm": 0.52734375, + "learning_rate": 4.087165852787206e-06, + "loss": 0.0233, + "mean_token_accuracy": 0.9943012222647667, + "num_tokens": 166075994.0, + "step": 1393 + }, + { + "entropy": 0.6399920210242271, + "epoch": 3.1756487025948106, + "grad_norm": 0.57421875, + "learning_rate": 4.085710208529844e-06, + "loss": 0.0217, + "mean_token_accuracy": 0.9926029741764069, + "num_tokens": 166196107.0, + "step": 1394 + }, + { + "entropy": 0.6449049338698387, + "epoch": 3.1779298545765613, + "grad_norm": 0.5, + "learning_rate": 4.084253664253981e-06, + "loss": 0.0137, + "mean_token_accuracy": 0.9951788038015366, + "num_tokens": 166315382.0, + "step": 1395 + }, + { + "entropy": 0.6446117609739304, + "epoch": 3.180211006558312, + "grad_norm": 0.53125, + "learning_rate": 4.082796220786324e-06, + "loss": 0.0205, + "mean_token_accuracy": 0.9942950457334518, + "num_tokens": 166435369.0, + "step": 1396 + }, + { + "entropy": 0.6449469700455666, + "epoch": 3.1824921585400627, + "grad_norm": 0.546875, + "learning_rate": 4.081337878954088e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9946614280343056, + "num_tokens": 166554612.0, + "step": 1397 + }, + { + "entropy": 0.6456586271524429, + "epoch": 3.1847733105218134, + "grad_norm": 0.41796875, + "learning_rate": 4.079878639585002e-06, + "loss": 0.0169, + "mean_token_accuracy": 0.9948592856526375, + "num_tokens": 166674061.0, + "step": 1398 + }, + { + "entropy": 0.6425445452332497, + "epoch": 3.187054462503564, + "grad_norm": 0.443359375, + "learning_rate": 4.0784185035072996e-06, + "loss": 0.0132, + "mean_token_accuracy": 0.996039055287838, + "num_tokens": 166792693.0, + "step": 1399 + }, + { + "entropy": 0.6393834352493286, + "epoch": 3.189335614485315, + "grad_norm": 0.478515625, + "learning_rate": 4.076957471549728e-06, + "loss": 0.0157, + "mean_token_accuracy": 0.9944398328661919, + "num_tokens": 166911749.0, + "step": 1400 + }, + { + "entropy": 0.6384956687688828, + "epoch": 3.191616766467066, + "grad_norm": 0.44140625, + "learning_rate": 4.0754955445415405e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9952273368835449, + "num_tokens": 167030928.0, + "step": 1401 + }, + { + "entropy": 0.6452071815729141, + "epoch": 3.1938979184488168, + "grad_norm": 0.427734375, + "learning_rate": 4.074032723312497e-06, + "loss": 0.0163, + "mean_token_accuracy": 0.9954302608966827, + "num_tokens": 167150258.0, + "step": 1402 + }, + { + "entropy": 0.6405121609568596, + "epoch": 3.1961790704305675, + "grad_norm": 0.56640625, + "learning_rate": 4.072569008692868e-06, + "loss": 0.0219, + "mean_token_accuracy": 0.9938759729266167, + "num_tokens": 167269341.0, + "step": 1403 + }, + { + "entropy": 0.6425043940544128, + "epoch": 3.198460222412318, + "grad_norm": 0.578125, + "learning_rate": 4.071104401513429e-06, + "loss": 0.0274, + "mean_token_accuracy": 0.9908057972788811, + "num_tokens": 167388793.0, + "step": 1404 + }, + { + "entropy": 0.6461363360285759, + "epoch": 3.200741374394069, + "grad_norm": 0.50390625, + "learning_rate": 4.069638902605464e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.9942851141095161, + "num_tokens": 167509269.0, + "step": 1405 + }, + { + "entropy": 0.6447549238801003, + "epoch": 3.2030225263758196, + "grad_norm": 0.66015625, + "learning_rate": 4.06817251280076e-06, + "loss": 0.0254, + "mean_token_accuracy": 0.9934736341238022, + "num_tokens": 167628641.0, + "step": 1406 + }, + { + "entropy": 0.643791638314724, + "epoch": 3.205303678357571, + "grad_norm": 0.65625, + "learning_rate": 4.0667052329316125e-06, + "loss": 0.0325, + "mean_token_accuracy": 0.9920907318592072, + "num_tokens": 167748313.0, + "step": 1407 + }, + { + "entropy": 0.6422603502869606, + "epoch": 3.2075848303393215, + "grad_norm": 0.69140625, + "learning_rate": 4.0652370638308215e-06, + "loss": 0.0214, + "mean_token_accuracy": 0.9932881891727448, + "num_tokens": 167866860.0, + "step": 1408 + }, + { + "entropy": 0.6396091729402542, + "epoch": 3.2098659823210722, + "grad_norm": 0.54296875, + "learning_rate": 4.063768006331691e-06, + "loss": 0.0177, + "mean_token_accuracy": 0.9938270151615143, + "num_tokens": 167986152.0, + "step": 1409 + }, + { + "entropy": 0.6428561210632324, + "epoch": 3.212147134302823, + "grad_norm": 0.435546875, + "learning_rate": 4.06229806126803e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.993484690785408, + "num_tokens": 168105727.0, + "step": 1410 + }, + { + "entropy": 0.6523574590682983, + "epoch": 3.2144282862845737, + "grad_norm": 0.408203125, + "learning_rate": 4.06082722947415e-06, + "loss": 0.0149, + "mean_token_accuracy": 0.9959453865885735, + "num_tokens": 168225636.0, + "step": 1411 + }, + { + "entropy": 0.6430332139134407, + "epoch": 3.2167094382663244, + "grad_norm": 0.453125, + "learning_rate": 4.059355511784868e-06, + "loss": 0.016, + "mean_token_accuracy": 0.9940478429198265, + "num_tokens": 168344575.0, + "step": 1412 + }, + { + "entropy": 0.6475801020860672, + "epoch": 3.218990590248075, + "grad_norm": 0.546875, + "learning_rate": 4.057882909035503e-06, + "loss": 0.0218, + "mean_token_accuracy": 0.9921234920620918, + "num_tokens": 168464278.0, + "step": 1413 + }, + { + "entropy": 0.6438763737678528, + "epoch": 3.2212717422298263, + "grad_norm": 0.470703125, + "learning_rate": 4.0564094220618735e-06, + "loss": 0.0147, + "mean_token_accuracy": 0.9952137991786003, + "num_tokens": 168583504.0, + "step": 1414 + }, + { + "entropy": 0.6391374468803406, + "epoch": 3.223552894211577, + "grad_norm": 0.462890625, + "learning_rate": 4.054935051700305e-06, + "loss": 0.0172, + "mean_token_accuracy": 0.994407132267952, + "num_tokens": 168702916.0, + "step": 1415 + }, + { + "entropy": 0.650329239666462, + "epoch": 3.2258340461933277, + "grad_norm": 0.4453125, + "learning_rate": 4.053459798787619e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.9950058683753014, + "num_tokens": 168822513.0, + "step": 1416 + }, + { + "entropy": 0.6430069357156754, + "epoch": 3.2281151981750784, + "grad_norm": 0.50390625, + "learning_rate": 4.0519836641611425e-06, + "loss": 0.0161, + "mean_token_accuracy": 0.9944458156824112, + "num_tokens": 168941734.0, + "step": 1417 + }, + { + "entropy": 0.6417672261595726, + "epoch": 3.230396350156829, + "grad_norm": 0.7109375, + "learning_rate": 4.050506648658701e-06, + "loss": 0.0314, + "mean_token_accuracy": 0.9897609353065491, + "num_tokens": 169060858.0, + "step": 1418 + }, + { + "entropy": 0.637527272105217, + "epoch": 3.23267750213858, + "grad_norm": 0.56640625, + "learning_rate": 4.049028753118619e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9945479929447174, + "num_tokens": 169180340.0, + "step": 1419 + }, + { + "entropy": 0.645523726940155, + "epoch": 3.2349586541203306, + "grad_norm": 0.68359375, + "learning_rate": 4.047549978379721e-06, + "loss": 0.024, + "mean_token_accuracy": 0.992670975625515, + "num_tokens": 169299489.0, + "step": 1420 + }, + { + "entropy": 0.6463112309575081, + "epoch": 3.2372398061020817, + "grad_norm": 0.45703125, + "learning_rate": 4.046070325281333e-06, + "loss": 0.0158, + "mean_token_accuracy": 0.994374081492424, + "num_tokens": 169418615.0, + "step": 1421 + }, + { + "entropy": 0.646550178527832, + "epoch": 3.2395209580838324, + "grad_norm": 0.5703125, + "learning_rate": 4.044589794663275e-06, + "loss": 0.0212, + "mean_token_accuracy": 0.9932675659656525, + "num_tokens": 169538309.0, + "step": 1422 + }, + { + "entropy": 0.6492364630103111, + "epoch": 3.241802110065583, + "grad_norm": 0.7265625, + "learning_rate": 4.04310838736587e-06, + "loss": 0.0242, + "mean_token_accuracy": 0.9930262863636017, + "num_tokens": 169658266.0, + "step": 1423 + }, + { + "entropy": 0.6430153250694275, + "epoch": 3.244083262047334, + "grad_norm": 0.419921875, + "learning_rate": 4.041626104229937e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.993217408657074, + "num_tokens": 169777385.0, + "step": 1424 + }, + { + "entropy": 0.6480439603328705, + "epoch": 3.2463644140290846, + "grad_norm": 0.55078125, + "learning_rate": 4.0401429460967864e-06, + "loss": 0.0193, + "mean_token_accuracy": 0.9936999157071114, + "num_tokens": 169895745.0, + "step": 1425 + }, + { + "entropy": 0.6498561203479767, + "epoch": 3.2486455660108353, + "grad_norm": 0.515625, + "learning_rate": 4.038658913808235e-06, + "loss": 0.0198, + "mean_token_accuracy": 0.9937942698597908, + "num_tokens": 170015393.0, + "step": 1426 + }, + { + "entropy": 0.6422076746821404, + "epoch": 3.250926717992586, + "grad_norm": 0.52734375, + "learning_rate": 4.037174008206589e-06, + "loss": 0.0213, + "mean_token_accuracy": 0.9921614825725555, + "num_tokens": 170133736.0, + "step": 1427 + }, + { + "entropy": 0.6394479498267174, + "epoch": 3.253207869974337, + "grad_norm": 0.51171875, + "learning_rate": 4.035688230134651e-06, + "loss": 0.0219, + "mean_token_accuracy": 0.9930308237671852, + "num_tokens": 170253231.0, + "step": 1428 + }, + { + "entropy": 0.6473383158445358, + "epoch": 3.255489021956088, + "grad_norm": 0.6953125, + "learning_rate": 4.034201580435723e-06, + "loss": 0.0242, + "mean_token_accuracy": 0.9945629760622978, + "num_tokens": 170372294.0, + "step": 1429 + }, + { + "entropy": 0.6434555128216743, + "epoch": 3.2577701739378386, + "grad_norm": 0.58984375, + "learning_rate": 4.0327140599535954e-06, + "loss": 0.0192, + "mean_token_accuracy": 0.9934206455945969, + "num_tokens": 170491356.0, + "step": 1430 + }, + { + "entropy": 0.6416099071502686, + "epoch": 3.2600513259195893, + "grad_norm": 0.74609375, + "learning_rate": 4.031225669532558e-06, + "loss": 0.0301, + "mean_token_accuracy": 0.9917095750570297, + "num_tokens": 170610203.0, + "step": 1431 + }, + { + "entropy": 0.6445368155837059, + "epoch": 3.26233247790134, + "grad_norm": 0.796875, + "learning_rate": 4.029736410017392e-06, + "loss": 0.0285, + "mean_token_accuracy": 0.9913726523518562, + "num_tokens": 170729615.0, + "step": 1432 + }, + { + "entropy": 0.6457330957055092, + "epoch": 3.264613629883091, + "grad_norm": 0.5078125, + "learning_rate": 4.028246282253373e-06, + "loss": 0.0175, + "mean_token_accuracy": 0.9937168136239052, + "num_tokens": 170848511.0, + "step": 1433 + }, + { + "entropy": 0.6485878750681877, + "epoch": 3.266894781864842, + "grad_norm": 0.458984375, + "learning_rate": 4.026755287086267e-06, + "loss": 0.0216, + "mean_token_accuracy": 0.9918460920453072, + "num_tokens": 170968247.0, + "step": 1434 + }, + { + "entropy": 0.6503137871623039, + "epoch": 3.2691759338465927, + "grad_norm": 0.51171875, + "learning_rate": 4.025263425362335e-06, + "loss": 0.0172, + "mean_token_accuracy": 0.9923156425356865, + "num_tokens": 171087149.0, + "step": 1435 + }, + { + "entropy": 0.6446101143956184, + "epoch": 3.2714570858283434, + "grad_norm": 0.5, + "learning_rate": 4.0237706979283306e-06, + "loss": 0.0203, + "mean_token_accuracy": 0.9951129332184792, + "num_tokens": 171206153.0, + "step": 1436 + }, + { + "entropy": 0.6426517516374588, + "epoch": 3.273738237810094, + "grad_norm": 0.423828125, + "learning_rate": 4.022277105631495e-06, + "loss": 0.0113, + "mean_token_accuracy": 0.9960926398634911, + "num_tokens": 171325638.0, + "step": 1437 + }, + { + "entropy": 0.6482264474034309, + "epoch": 3.276019389791845, + "grad_norm": 0.498046875, + "learning_rate": 4.020782649319563e-06, + "loss": 0.0217, + "mean_token_accuracy": 0.9947508573532104, + "num_tokens": 171445288.0, + "step": 1438 + }, + { + "entropy": 0.6414114460349083, + "epoch": 3.2783005417735955, + "grad_norm": 0.5703125, + "learning_rate": 4.019287329840759e-06, + "loss": 0.0204, + "mean_token_accuracy": 0.9947855472564697, + "num_tokens": 171564251.0, + "step": 1439 + }, + { + "entropy": 0.6530464813113213, + "epoch": 3.2805816937553463, + "grad_norm": 0.455078125, + "learning_rate": 4.017791148043797e-06, + "loss": 0.0144, + "mean_token_accuracy": 0.9959487989544868, + "num_tokens": 171683621.0, + "step": 1440 + }, + { + "entropy": 0.643944650888443, + "epoch": 3.2828628457370974, + "grad_norm": 0.41796875, + "learning_rate": 4.016294104777883e-06, + "loss": 0.0168, + "mean_token_accuracy": 0.9944483265280724, + "num_tokens": 171802892.0, + "step": 1441 + }, + { + "entropy": 0.6466441825032234, + "epoch": 3.285143997718848, + "grad_norm": 0.62890625, + "learning_rate": 4.0147962008927065e-06, + "loss": 0.0201, + "mean_token_accuracy": 0.9936251416802406, + "num_tokens": 171923318.0, + "step": 1442 + }, + { + "entropy": 0.6445001661777496, + "epoch": 3.287425149700599, + "grad_norm": 0.4296875, + "learning_rate": 4.013297437238452e-06, + "loss": 0.0132, + "mean_token_accuracy": 0.9962267354130745, + "num_tokens": 172042108.0, + "step": 1443 + }, + { + "entropy": 0.6477979123592377, + "epoch": 3.2897063016823496, + "grad_norm": 0.578125, + "learning_rate": 4.011797814665787e-06, + "loss": 0.0202, + "mean_token_accuracy": 0.9928281977772713, + "num_tokens": 172161320.0, + "step": 1444 + }, + { + "entropy": 0.6425008475780487, + "epoch": 3.2919874536641003, + "grad_norm": 0.41796875, + "learning_rate": 4.010297334025869e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9933263882994652, + "num_tokens": 172280822.0, + "step": 1445 + }, + { + "entropy": 0.6412927433848381, + "epoch": 3.294268605645851, + "grad_norm": 0.51953125, + "learning_rate": 4.008795996170341e-06, + "loss": 0.0139, + "mean_token_accuracy": 0.995164155960083, + "num_tokens": 172400152.0, + "step": 1446 + }, + { + "entropy": 0.6431070193648338, + "epoch": 3.296549757627602, + "grad_norm": 0.412109375, + "learning_rate": 4.0072938019513345e-06, + "loss": 0.0162, + "mean_token_accuracy": 0.9951274245977402, + "num_tokens": 172520055.0, + "step": 1447 + }, + { + "entropy": 0.6420279368758202, + "epoch": 3.298830909609353, + "grad_norm": 0.625, + "learning_rate": 4.0057907522214646e-06, + "loss": 0.0237, + "mean_token_accuracy": 0.9915828928351402, + "num_tokens": 172640436.0, + "step": 1448 + }, + { + "entropy": 0.6492867171764374, + "epoch": 3.3011120615911036, + "grad_norm": 0.462890625, + "learning_rate": 4.004286847833835e-06, + "loss": 0.0161, + "mean_token_accuracy": 0.994450755417347, + "num_tokens": 172760185.0, + "step": 1449 + }, + { + "entropy": 0.643481120467186, + "epoch": 3.3033932135728543, + "grad_norm": 0.5234375, + "learning_rate": 4.002782089642031e-06, + "loss": 0.0201, + "mean_token_accuracy": 0.9943273738026619, + "num_tokens": 172879320.0, + "step": 1450 + }, + { + "entropy": 0.6409605443477631, + "epoch": 3.305674365554605, + "grad_norm": 0.60546875, + "learning_rate": 4.001276478500127e-06, + "loss": 0.0202, + "mean_token_accuracy": 0.9949767738580704, + "num_tokens": 172999852.0, + "step": 1451 + }, + { + "entropy": 0.6437208876013756, + "epoch": 3.3079555175363557, + "grad_norm": 0.4921875, + "learning_rate": 3.9997700152626755e-06, + "loss": 0.018, + "mean_token_accuracy": 0.9934428483247757, + "num_tokens": 173118792.0, + "step": 1452 + }, + { + "entropy": 0.6479907408356667, + "epoch": 3.3102366695181065, + "grad_norm": 0.4765625, + "learning_rate": 3.9982627007847186e-06, + "loss": 0.014, + "mean_token_accuracy": 0.9945449158549309, + "num_tokens": 173239346.0, + "step": 1453 + }, + { + "entropy": 0.6410037204623222, + "epoch": 3.312517821499857, + "grad_norm": 0.42578125, + "learning_rate": 3.996754535921777e-06, + "loss": 0.0204, + "mean_token_accuracy": 0.9935107529163361, + "num_tokens": 173359023.0, + "step": 1454 + }, + { + "entropy": 0.6433015018701553, + "epoch": 3.3147989734816083, + "grad_norm": 0.42578125, + "learning_rate": 3.995245521529857e-06, + "loss": 0.0123, + "mean_token_accuracy": 0.9959154725074768, + "num_tokens": 173478261.0, + "step": 1455 + }, + { + "entropy": 0.6392124369740486, + "epoch": 3.317080125463359, + "grad_norm": 0.52734375, + "learning_rate": 3.993735658465446e-06, + "loss": 0.0177, + "mean_token_accuracy": 0.9938846901059151, + "num_tokens": 173596792.0, + "step": 1456 + }, + { + "entropy": 0.6408686190843582, + "epoch": 3.31936127744511, + "grad_norm": 0.484375, + "learning_rate": 3.992224947585513e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.9949236586689949, + "num_tokens": 173716328.0, + "step": 1457 + }, + { + "entropy": 0.6452233716845512, + "epoch": 3.3216424294268605, + "grad_norm": 0.484375, + "learning_rate": 3.990713389747508e-06, + "loss": 0.0148, + "mean_token_accuracy": 0.996146559715271, + "num_tokens": 173835339.0, + "step": 1458 + }, + { + "entropy": 0.6406038552522659, + "epoch": 3.323923581408611, + "grad_norm": 0.5390625, + "learning_rate": 3.989200985809362e-06, + "loss": 0.0127, + "mean_token_accuracy": 0.9953884184360504, + "num_tokens": 173954594.0, + "step": 1459 + }, + { + "entropy": 0.6401559710502625, + "epoch": 3.3262047333903624, + "grad_norm": 0.49609375, + "learning_rate": 3.987687736629487e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.9949696287512779, + "num_tokens": 174073538.0, + "step": 1460 + }, + { + "entropy": 0.6413179785013199, + "epoch": 3.328485885372113, + "grad_norm": 0.6015625, + "learning_rate": 3.986173643066774e-06, + "loss": 0.0194, + "mean_token_accuracy": 0.9938244968652725, + "num_tokens": 174192430.0, + "step": 1461 + }, + { + "entropy": 0.641913890838623, + "epoch": 3.330767037353864, + "grad_norm": 0.6875, + "learning_rate": 3.984658705980593e-06, + "loss": 0.0226, + "mean_token_accuracy": 0.9925032034516335, + "num_tokens": 174312609.0, + "step": 1462 + }, + { + "entropy": 0.6415663734078407, + "epoch": 3.3330481893356145, + "grad_norm": 0.640625, + "learning_rate": 3.983142926230792e-06, + "loss": 0.0246, + "mean_token_accuracy": 0.9917317852377892, + "num_tokens": 174432258.0, + "step": 1463 + }, + { + "entropy": 0.64004947245121, + "epoch": 3.3353293413173652, + "grad_norm": 0.50390625, + "learning_rate": 3.981626304677701e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.9944213703274727, + "num_tokens": 174552017.0, + "step": 1464 + }, + { + "entropy": 0.6443495005369186, + "epoch": 3.337610493299116, + "grad_norm": 0.48828125, + "learning_rate": 3.980108842182121e-06, + "loss": 0.0207, + "mean_token_accuracy": 0.9936074838042259, + "num_tokens": 174671872.0, + "step": 1465 + }, + { + "entropy": 0.6452600806951523, + "epoch": 3.3398916452808667, + "grad_norm": 0.54296875, + "learning_rate": 3.978590539605338e-06, + "loss": 0.02, + "mean_token_accuracy": 0.9931915625929832, + "num_tokens": 174792330.0, + "step": 1466 + }, + { + "entropy": 0.6413128972053528, + "epoch": 3.3421727972626174, + "grad_norm": 0.439453125, + "learning_rate": 3.97707139780911e-06, + "loss": 0.02, + "mean_token_accuracy": 0.9935553073883057, + "num_tokens": 174912615.0, + "step": 1467 + }, + { + "entropy": 0.6448782533407211, + "epoch": 3.3444539492443686, + "grad_norm": 0.439453125, + "learning_rate": 3.975551417655673e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9948340505361557, + "num_tokens": 175031586.0, + "step": 1468 + }, + { + "entropy": 0.6417443677783012, + "epoch": 3.3467351012261193, + "grad_norm": 0.5234375, + "learning_rate": 3.974030600007737e-06, + "loss": 0.0203, + "mean_token_accuracy": 0.9943606331944466, + "num_tokens": 175150646.0, + "step": 1469 + }, + { + "entropy": 0.6407751441001892, + "epoch": 3.34901625320787, + "grad_norm": 0.59375, + "learning_rate": 3.97250894572849e-06, + "loss": 0.0178, + "mean_token_accuracy": 0.9933466464281082, + "num_tokens": 175270040.0, + "step": 1470 + }, + { + "entropy": 0.646542377769947, + "epoch": 3.3512974051896207, + "grad_norm": 0.5078125, + "learning_rate": 3.970986455681593e-06, + "loss": 0.0177, + "mean_token_accuracy": 0.99338099360466, + "num_tokens": 175389889.0, + "step": 1471 + }, + { + "entropy": 0.6408480256795883, + "epoch": 3.3535785571713714, + "grad_norm": 0.5234375, + "learning_rate": 3.969463130731183e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9949145540595055, + "num_tokens": 175508448.0, + "step": 1472 + }, + { + "entropy": 0.6414823532104492, + "epoch": 3.355859709153122, + "grad_norm": 0.59375, + "learning_rate": 3.967938971741869e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9953278675675392, + "num_tokens": 175628508.0, + "step": 1473 + }, + { + "entropy": 0.6437659710645676, + "epoch": 3.3581408611348733, + "grad_norm": 0.52734375, + "learning_rate": 3.966413979578734e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.9949772506952286, + "num_tokens": 175747853.0, + "step": 1474 + }, + { + "entropy": 0.6411682143807411, + "epoch": 3.360422013116624, + "grad_norm": 0.546875, + "learning_rate": 3.964888155107335e-06, + "loss": 0.0197, + "mean_token_accuracy": 0.9946545287966728, + "num_tokens": 175867104.0, + "step": 1475 + }, + { + "entropy": 0.6455703601241112, + "epoch": 3.3627031650983747, + "grad_norm": 0.412109375, + "learning_rate": 3.963361499193699e-06, + "loss": 0.0168, + "mean_token_accuracy": 0.9946174696087837, + "num_tokens": 175986984.0, + "step": 1476 + }, + { + "entropy": 0.6421468928456306, + "epoch": 3.3649843170801255, + "grad_norm": 0.66015625, + "learning_rate": 3.9618340127043274e-06, + "loss": 0.0271, + "mean_token_accuracy": 0.9924413338303566, + "num_tokens": 176106022.0, + "step": 1477 + }, + { + "entropy": 0.6452531442046165, + "epoch": 3.367265469061876, + "grad_norm": 0.4765625, + "learning_rate": 3.960305696506192e-06, + "loss": 0.0163, + "mean_token_accuracy": 0.9947572723031044, + "num_tokens": 176225315.0, + "step": 1478 + }, + { + "entropy": 0.6396555081009865, + "epoch": 3.369546621043627, + "grad_norm": 0.55078125, + "learning_rate": 3.958776551466737e-06, + "loss": 0.0178, + "mean_token_accuracy": 0.9936129152774811, + "num_tokens": 176344302.0, + "step": 1479 + }, + { + "entropy": 0.6397648975253105, + "epoch": 3.3718277730253776, + "grad_norm": 0.46875, + "learning_rate": 3.957246578453873e-06, + "loss": 0.0169, + "mean_token_accuracy": 0.9938685819506645, + "num_tokens": 176462679.0, + "step": 1480 + }, + { + "entropy": 0.6428644508123398, + "epoch": 3.374108925007129, + "grad_norm": 0.515625, + "learning_rate": 3.955715778335984e-06, + "loss": 0.0196, + "mean_token_accuracy": 0.9945546239614487, + "num_tokens": 176581626.0, + "step": 1481 + }, + { + "entropy": 0.6457257270812988, + "epoch": 3.3763900769888795, + "grad_norm": 0.478515625, + "learning_rate": 3.954184151981924e-06, + "loss": 0.0153, + "mean_token_accuracy": 0.995005764067173, + "num_tokens": 176700811.0, + "step": 1482 + }, + { + "entropy": 0.6432905048131943, + "epoch": 3.37867122897063, + "grad_norm": 0.36328125, + "learning_rate": 3.952651700261012e-06, + "loss": 0.0123, + "mean_token_accuracy": 0.997337780892849, + "num_tokens": 176820080.0, + "step": 1483 + }, + { + "entropy": 0.6501444205641747, + "epoch": 3.380952380952381, + "grad_norm": 0.51953125, + "learning_rate": 3.95111842404304e-06, + "loss": 0.0175, + "mean_token_accuracy": 0.9952785521745682, + "num_tokens": 176940287.0, + "step": 1484 + }, + { + "entropy": 0.6442901864647865, + "epoch": 3.3832335329341316, + "grad_norm": 0.56640625, + "learning_rate": 3.949584324198266e-06, + "loss": 0.0223, + "mean_token_accuracy": 0.99433683604002, + "num_tokens": 177059990.0, + "step": 1485 + }, + { + "entropy": 0.6476445198059082, + "epoch": 3.3855146849158824, + "grad_norm": 0.546875, + "learning_rate": 3.948049401597414e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.9950647875666618, + "num_tokens": 177179380.0, + "step": 1486 + }, + { + "entropy": 0.6454700753092766, + "epoch": 3.3877958368976335, + "grad_norm": 0.59765625, + "learning_rate": 3.946513657111678e-06, + "loss": 0.0218, + "mean_token_accuracy": 0.9943785145878792, + "num_tokens": 177299304.0, + "step": 1487 + }, + { + "entropy": 0.6480259150266647, + "epoch": 3.3900769888793842, + "grad_norm": 0.4140625, + "learning_rate": 3.944977091612716e-06, + "loss": 0.0147, + "mean_token_accuracy": 0.9955564066767693, + "num_tokens": 177418729.0, + "step": 1488 + }, + { + "entropy": 0.6450707912445068, + "epoch": 3.392358140861135, + "grad_norm": 0.53515625, + "learning_rate": 3.943439705972654e-06, + "loss": 0.0168, + "mean_token_accuracy": 0.993788331747055, + "num_tokens": 177538022.0, + "step": 1489 + }, + { + "entropy": 0.6436382904648781, + "epoch": 3.3946392928428857, + "grad_norm": 0.412109375, + "learning_rate": 3.94190150106408e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9953142628073692, + "num_tokens": 177657468.0, + "step": 1490 + }, + { + "entropy": 0.6453414633870125, + "epoch": 3.3969204448246364, + "grad_norm": 0.66015625, + "learning_rate": 3.9403624777600526e-06, + "loss": 0.0198, + "mean_token_accuracy": 0.9939195513725281, + "num_tokens": 177776724.0, + "step": 1491 + }, + { + "entropy": 0.645867109298706, + "epoch": 3.399201596806387, + "grad_norm": 0.49609375, + "learning_rate": 3.938822636934089e-06, + "loss": 0.0178, + "mean_token_accuracy": 0.9926558583974838, + "num_tokens": 177895878.0, + "step": 1492 + }, + { + "entropy": 0.6489920914173126, + "epoch": 3.401482748788138, + "grad_norm": 0.53515625, + "learning_rate": 3.937281979460175e-06, + "loss": 0.0158, + "mean_token_accuracy": 0.994483582675457, + "num_tokens": 178015139.0, + "step": 1493 + }, + { + "entropy": 0.64028300344944, + "epoch": 3.4037639007698886, + "grad_norm": 0.46875, + "learning_rate": 3.9357405062127565e-06, + "loss": 0.018, + "mean_token_accuracy": 0.9940220490098, + "num_tokens": 178134335.0, + "step": 1494 + }, + { + "entropy": 0.6447064131498337, + "epoch": 3.4060450527516397, + "grad_norm": 0.4375, + "learning_rate": 3.934198218066745e-06, + "loss": 0.0144, + "mean_token_accuracy": 0.9955517873167992, + "num_tokens": 178254154.0, + "step": 1495 + }, + { + "entropy": 0.643914632499218, + "epoch": 3.4083262047333904, + "grad_norm": 0.46875, + "learning_rate": 3.932655115897513e-06, + "loss": 0.0195, + "mean_token_accuracy": 0.9945842176675797, + "num_tokens": 178373195.0, + "step": 1496 + }, + { + "entropy": 0.6435788944363594, + "epoch": 3.410607356715141, + "grad_norm": 0.5078125, + "learning_rate": 3.9311112005808955e-06, + "loss": 0.0248, + "mean_token_accuracy": 0.9925685301423073, + "num_tokens": 178492820.0, + "step": 1497 + }, + { + "entropy": 0.6449022516608238, + "epoch": 3.412888508696892, + "grad_norm": 0.59375, + "learning_rate": 3.92956647299319e-06, + "loss": 0.0207, + "mean_token_accuracy": 0.9942852184176445, + "num_tokens": 178612214.0, + "step": 1498 + }, + { + "entropy": 0.6448007747530937, + "epoch": 3.4151696606786426, + "grad_norm": 0.5625, + "learning_rate": 3.928020934011153e-06, + "loss": 0.0193, + "mean_token_accuracy": 0.9933376833796501, + "num_tokens": 178731615.0, + "step": 1499 + }, + { + "entropy": 0.646712101995945, + "epoch": 3.4174508126603937, + "grad_norm": 0.546875, + "learning_rate": 3.926474584512002e-06, + "loss": 0.0177, + "mean_token_accuracy": 0.9956984370946884, + "num_tokens": 178850693.0, + "step": 1500 + }, + { + "entropy": 0.6423618569970131, + "epoch": 3.4197319646421445, + "grad_norm": 0.48828125, + "learning_rate": 3.924927425373417e-06, + "loss": 0.0207, + "mean_token_accuracy": 0.9946617111563683, + "num_tokens": 178970215.0, + "step": 1501 + }, + { + "entropy": 0.6434356346726418, + "epoch": 3.422013116623895, + "grad_norm": 0.52734375, + "learning_rate": 3.9233794574735345e-06, + "loss": 0.023, + "mean_token_accuracy": 0.9926030337810516, + "num_tokens": 179089917.0, + "step": 1502 + }, + { + "entropy": 0.6402776539325714, + "epoch": 3.424294268605646, + "grad_norm": 0.498046875, + "learning_rate": 3.921830681690951e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9953401982784271, + "num_tokens": 179208270.0, + "step": 1503 + }, + { + "entropy": 0.6402065232396126, + "epoch": 3.4265754205873966, + "grad_norm": 0.5078125, + "learning_rate": 3.920281098904722e-06, + "loss": 0.0179, + "mean_token_accuracy": 0.9944653064012527, + "num_tokens": 179327844.0, + "step": 1504 + }, + { + "entropy": 0.6392836198210716, + "epoch": 3.4288565725691473, + "grad_norm": 0.50390625, + "learning_rate": 3.918730709994361e-06, + "loss": 0.0201, + "mean_token_accuracy": 0.9929201379418373, + "num_tokens": 179447435.0, + "step": 1505 + }, + { + "entropy": 0.6445184424519539, + "epoch": 3.431137724550898, + "grad_norm": 0.466796875, + "learning_rate": 3.91717951583984e-06, + "loss": 0.0146, + "mean_token_accuracy": 0.9966553822159767, + "num_tokens": 179566631.0, + "step": 1506 + }, + { + "entropy": 0.6412975713610649, + "epoch": 3.4334188765326488, + "grad_norm": 0.42578125, + "learning_rate": 3.915627517321584e-06, + "loss": 0.0153, + "mean_token_accuracy": 0.9952492415904999, + "num_tokens": 179686244.0, + "step": 1507 + }, + { + "entropy": 0.6458780765533447, + "epoch": 3.4357000285144, + "grad_norm": 0.45703125, + "learning_rate": 3.914074715320479e-06, + "loss": 0.0181, + "mean_token_accuracy": 0.992873877286911, + "num_tokens": 179805734.0, + "step": 1508 + }, + { + "entropy": 0.6423771530389786, + "epoch": 3.4379811804961506, + "grad_norm": 0.515625, + "learning_rate": 3.912521110717866e-06, + "loss": 0.0251, + "mean_token_accuracy": 0.9928637742996216, + "num_tokens": 179925498.0, + "step": 1509 + }, + { + "entropy": 0.6449649184942245, + "epoch": 3.4402623324779014, + "grad_norm": 0.5078125, + "learning_rate": 3.9109667043955405e-06, + "loss": 0.0195, + "mean_token_accuracy": 0.9936571344733238, + "num_tokens": 180044809.0, + "step": 1510 + }, + { + "entropy": 0.6392904445528984, + "epoch": 3.442543484459652, + "grad_norm": 0.45703125, + "learning_rate": 3.909411497235752e-06, + "loss": 0.0101, + "mean_token_accuracy": 0.9965698719024658, + "num_tokens": 180163888.0, + "step": 1511 + }, + { + "entropy": 0.6466270759701729, + "epoch": 3.444824636441403, + "grad_norm": 0.51171875, + "learning_rate": 3.907855490121208e-06, + "loss": 0.023, + "mean_token_accuracy": 0.9947274550795555, + "num_tokens": 180283254.0, + "step": 1512 + }, + { + "entropy": 0.6446692422032356, + "epoch": 3.4471057884231535, + "grad_norm": 0.609375, + "learning_rate": 3.906298683935068e-06, + "loss": 0.0206, + "mean_token_accuracy": 0.9946559369564056, + "num_tokens": 180402716.0, + "step": 1513 + }, + { + "entropy": 0.643944039940834, + "epoch": 3.4493869404049047, + "grad_norm": 0.6875, + "learning_rate": 3.904741079560944e-06, + "loss": 0.0278, + "mean_token_accuracy": 0.9922839030623436, + "num_tokens": 180522417.0, + "step": 1514 + }, + { + "entropy": 0.644996739923954, + "epoch": 3.4516680923866554, + "grad_norm": 0.466796875, + "learning_rate": 3.903182677882904e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9931444749236107, + "num_tokens": 180642091.0, + "step": 1515 + }, + { + "entropy": 0.6420714780688286, + "epoch": 3.453949244368406, + "grad_norm": 0.5390625, + "learning_rate": 3.901623479785465e-06, + "loss": 0.0251, + "mean_token_accuracy": 0.9925729483366013, + "num_tokens": 180761027.0, + "step": 1516 + }, + { + "entropy": 0.6419919952750206, + "epoch": 3.456230396350157, + "grad_norm": 0.65234375, + "learning_rate": 3.900063486153598e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.9945408031344414, + "num_tokens": 180880360.0, + "step": 1517 + }, + { + "entropy": 0.6426337286829948, + "epoch": 3.4585115483319075, + "grad_norm": 0.61328125, + "learning_rate": 3.898502697872725e-06, + "loss": 0.0236, + "mean_token_accuracy": 0.9939490407705307, + "num_tokens": 180999787.0, + "step": 1518 + }, + { + "entropy": 0.6462291404604912, + "epoch": 3.4607927003136583, + "grad_norm": 0.578125, + "learning_rate": 3.896941115828721e-06, + "loss": 0.0211, + "mean_token_accuracy": 0.993421733379364, + "num_tokens": 181119078.0, + "step": 1519 + }, + { + "entropy": 0.6406579539179802, + "epoch": 3.463073852295409, + "grad_norm": 0.5, + "learning_rate": 3.895378740907908e-06, + "loss": 0.0158, + "mean_token_accuracy": 0.9959812164306641, + "num_tokens": 181237721.0, + "step": 1520 + }, + { + "entropy": 0.6418476998806, + "epoch": 3.46535500427716, + "grad_norm": 0.73046875, + "learning_rate": 3.89381557399706e-06, + "loss": 0.0189, + "mean_token_accuracy": 0.9921476617455482, + "num_tokens": 181357008.0, + "step": 1521 + }, + { + "entropy": 0.6378888562321663, + "epoch": 3.467636156258911, + "grad_norm": 0.4765625, + "learning_rate": 3.892251615983401e-06, + "loss": 0.0214, + "mean_token_accuracy": 0.9941611513495445, + "num_tokens": 181476791.0, + "step": 1522 + }, + { + "entropy": 0.6433370858430862, + "epoch": 3.4699173082406616, + "grad_norm": 0.609375, + "learning_rate": 3.890686867754604e-06, + "loss": 0.0193, + "mean_token_accuracy": 0.9935717135667801, + "num_tokens": 181595990.0, + "step": 1523 + }, + { + "entropy": 0.641566276550293, + "epoch": 3.4721984602224123, + "grad_norm": 0.4765625, + "learning_rate": 3.889121330198788e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9948035851120949, + "num_tokens": 181715493.0, + "step": 1524 + }, + { + "entropy": 0.642994187772274, + "epoch": 3.474479612204163, + "grad_norm": 0.486328125, + "learning_rate": 3.887555004204524e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9947699084877968, + "num_tokens": 181835272.0, + "step": 1525 + }, + { + "entropy": 0.6502713263034821, + "epoch": 3.4767607641859137, + "grad_norm": 0.6875, + "learning_rate": 3.885987890660828e-06, + "loss": 0.021, + "mean_token_accuracy": 0.9929601550102234, + "num_tokens": 181955101.0, + "step": 1526 + }, + { + "entropy": 0.6396136358380318, + "epoch": 3.479041916167665, + "grad_norm": 0.5859375, + "learning_rate": 3.884419990457161e-06, + "loss": 0.0297, + "mean_token_accuracy": 0.9929802268743515, + "num_tokens": 182074487.0, + "step": 1527 + }, + { + "entropy": 0.6423270478844643, + "epoch": 3.4813230681494156, + "grad_norm": 0.6875, + "learning_rate": 3.882851304483436e-06, + "loss": 0.0271, + "mean_token_accuracy": 0.9924932792782784, + "num_tokens": 182193914.0, + "step": 1528 + }, + { + "entropy": 0.6442876309156418, + "epoch": 3.4836042201311663, + "grad_norm": 0.51953125, + "learning_rate": 3.881281833630007e-06, + "loss": 0.0157, + "mean_token_accuracy": 0.9946982339024544, + "num_tokens": 182314016.0, + "step": 1529 + }, + { + "entropy": 0.6349642276763916, + "epoch": 3.485885372112917, + "grad_norm": 0.5390625, + "learning_rate": 3.879711578787676e-06, + "loss": 0.02, + "mean_token_accuracy": 0.9925785958766937, + "num_tokens": 182434080.0, + "step": 1530 + }, + { + "entropy": 0.6466870680451393, + "epoch": 3.4881665240946678, + "grad_norm": 0.375, + "learning_rate": 3.87814054084769e-06, + "loss": 0.0145, + "mean_token_accuracy": 0.9950389489531517, + "num_tokens": 182553627.0, + "step": 1531 + }, + { + "entropy": 0.6435984522104263, + "epoch": 3.4904476760764185, + "grad_norm": 0.3984375, + "learning_rate": 3.8765687207017375e-06, + "loss": 0.0133, + "mean_token_accuracy": 0.9962380602955818, + "num_tokens": 182673061.0, + "step": 1532 + }, + { + "entropy": 0.6500435620546341, + "epoch": 3.492728828058169, + "grad_norm": 0.48828125, + "learning_rate": 3.874996119241956e-06, + "loss": 0.0172, + "mean_token_accuracy": 0.9952431917190552, + "num_tokens": 182792731.0, + "step": 1533 + }, + { + "entropy": 0.6396032720804214, + "epoch": 3.49500998003992, + "grad_norm": 0.4921875, + "learning_rate": 3.873422737360922e-06, + "loss": 0.0189, + "mean_token_accuracy": 0.9932463392615318, + "num_tokens": 182911848.0, + "step": 1534 + }, + { + "entropy": 0.6461816430091858, + "epoch": 3.497291132021671, + "grad_norm": 0.53125, + "learning_rate": 3.871848575951658e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.9939378723502159, + "num_tokens": 183031105.0, + "step": 1535 + }, + { + "entropy": 0.6471435353159904, + "epoch": 3.499572284003422, + "grad_norm": 0.41796875, + "learning_rate": 3.8702736359076265e-06, + "loss": 0.0168, + "mean_token_accuracy": 0.9932906478643417, + "num_tokens": 183150628.0, + "step": 1536 + }, + { + "entropy": 0.6429877951741219, + "epoch": 3.5018534359851725, + "grad_norm": 0.52734375, + "learning_rate": 3.868697918122733e-06, + "loss": 0.0232, + "mean_token_accuracy": 0.9934845715761185, + "num_tokens": 183269829.0, + "step": 1537 + }, + { + "entropy": 0.6373982280492783, + "epoch": 3.5041345879669232, + "grad_norm": 0.421875, + "learning_rate": 3.867121423491325e-06, + "loss": 0.011, + "mean_token_accuracy": 0.9958240389823914, + "num_tokens": 183389653.0, + "step": 1538 + }, + { + "entropy": 0.6408857852220535, + "epoch": 3.506415739948674, + "grad_norm": 0.482421875, + "learning_rate": 3.86554415290819e-06, + "loss": 0.0226, + "mean_token_accuracy": 0.9926960691809654, + "num_tokens": 183509239.0, + "step": 1539 + }, + { + "entropy": 0.6507543325424194, + "epoch": 3.508696891930425, + "grad_norm": 0.482421875, + "learning_rate": 3.8639661072685575e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.9958390668034554, + "num_tokens": 183628347.0, + "step": 1540 + }, + { + "epoch": 3.508696891930425, + "eval_entropy": 0.6408129925963543, + "eval_loss": 0.021278096362948418, + "eval_mean_token_accuracy": 0.9934140554852359, + "eval_num_tokens": 183628347.0, + "eval_runtime": 177.4938, + "eval_samples_per_second": 47.241, + "eval_steps_per_second": 1.482, + "step": 1540 + }, + { + "entropy": 0.6422957479953766, + "epoch": 3.510978043912176, + "grad_norm": 0.49609375, + "learning_rate": 3.862387287468095e-06, + "loss": 0.014, + "mean_token_accuracy": 0.9947423413395882, + "num_tokens": 183748073.0, + "step": 1541 + }, + { + "entropy": 0.6464937031269073, + "epoch": 3.5132591958939265, + "grad_norm": 0.69140625, + "learning_rate": 3.860807694402909e-06, + "loss": 0.0231, + "mean_token_accuracy": 0.9923305213451385, + "num_tokens": 183866790.0, + "step": 1542 + }, + { + "entropy": 0.6418327167630196, + "epoch": 3.5155403478756773, + "grad_norm": 0.396484375, + "learning_rate": 3.859227328969547e-06, + "loss": 0.015, + "mean_token_accuracy": 0.994832843542099, + "num_tokens": 183986724.0, + "step": 1543 + }, + { + "entropy": 0.6428635120391846, + "epoch": 3.517821499857428, + "grad_norm": 0.59765625, + "learning_rate": 3.857646192064995e-06, + "loss": 0.0197, + "mean_token_accuracy": 0.9937710911035538, + "num_tokens": 184106286.0, + "step": 1544 + }, + { + "entropy": 0.6441489085555077, + "epoch": 3.5201026518391787, + "grad_norm": 0.408203125, + "learning_rate": 3.856064284586674e-06, + "loss": 0.0132, + "mean_token_accuracy": 0.9956079870462418, + "num_tokens": 184225555.0, + "step": 1545 + }, + { + "entropy": 0.6410426124930382, + "epoch": 3.5223838038209294, + "grad_norm": 0.46484375, + "learning_rate": 3.854481607432445e-06, + "loss": 0.0088, + "mean_token_accuracy": 0.9975039511919022, + "num_tokens": 184344521.0, + "step": 1546 + }, + { + "entropy": 0.638005405664444, + "epoch": 3.52466495580268, + "grad_norm": 0.421875, + "learning_rate": 3.852898161500605e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.995017021894455, + "num_tokens": 184464157.0, + "step": 1547 + }, + { + "entropy": 0.6463857963681221, + "epoch": 3.5269461077844313, + "grad_norm": 0.6796875, + "learning_rate": 3.851313947689888e-06, + "loss": 0.0271, + "mean_token_accuracy": 0.9921988919377327, + "num_tokens": 184583307.0, + "step": 1548 + }, + { + "entropy": 0.6459466740489006, + "epoch": 3.529227259766182, + "grad_norm": 0.65625, + "learning_rate": 3.849728966899462e-06, + "loss": 0.0221, + "mean_token_accuracy": 0.9929019883275032, + "num_tokens": 184702802.0, + "step": 1549 + }, + { + "entropy": 0.636448584496975, + "epoch": 3.5315084117479327, + "grad_norm": 0.4453125, + "learning_rate": 3.848143220028931e-06, + "loss": 0.0153, + "mean_token_accuracy": 0.9963820427656174, + "num_tokens": 184821210.0, + "step": 1550 + }, + { + "entropy": 0.6362445428967476, + "epoch": 3.5337895637296834, + "grad_norm": 0.3671875, + "learning_rate": 3.846556707978337e-06, + "loss": 0.0136, + "mean_token_accuracy": 0.9958627745509148, + "num_tokens": 184940385.0, + "step": 1551 + }, + { + "entropy": 0.6413716375827789, + "epoch": 3.536070715711434, + "grad_norm": 0.419921875, + "learning_rate": 3.844969431648151e-06, + "loss": 0.0128, + "mean_token_accuracy": 0.9941679313778877, + "num_tokens": 185059331.0, + "step": 1552 + }, + { + "entropy": 0.6436913087964058, + "epoch": 3.5383518676931853, + "grad_norm": 0.48828125, + "learning_rate": 3.843381391939281e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.9951520189642906, + "num_tokens": 185178590.0, + "step": 1553 + }, + { + "entropy": 0.6348922327160835, + "epoch": 3.540633019674936, + "grad_norm": 0.4609375, + "learning_rate": 3.841792589753067e-06, + "loss": 0.0135, + "mean_token_accuracy": 0.9955623000860214, + "num_tokens": 185298112.0, + "step": 1554 + }, + { + "entropy": 0.6438176333904266, + "epoch": 3.5429141716566868, + "grad_norm": 0.41015625, + "learning_rate": 3.840203025991285e-06, + "loss": 0.0118, + "mean_token_accuracy": 0.9964404329657555, + "num_tokens": 185417545.0, + "step": 1555 + }, + { + "entropy": 0.6448199152946472, + "epoch": 3.5451953236384375, + "grad_norm": 0.45703125, + "learning_rate": 3.838612701556138e-06, + "loss": 0.0153, + "mean_token_accuracy": 0.9950509443879128, + "num_tokens": 185537330.0, + "step": 1556 + }, + { + "entropy": 0.6359336227178574, + "epoch": 3.547476475620188, + "grad_norm": 0.470703125, + "learning_rate": 3.837021617350266e-06, + "loss": 0.0129, + "mean_token_accuracy": 0.9956230595707893, + "num_tokens": 185656546.0, + "step": 1557 + }, + { + "entropy": 0.6396361514925957, + "epoch": 3.549757627601939, + "grad_norm": 0.5390625, + "learning_rate": 3.8354297742767345e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.9950877204537392, + "num_tokens": 185776263.0, + "step": 1558 + }, + { + "entropy": 0.6390326172113419, + "epoch": 3.5520387795836896, + "grad_norm": 0.486328125, + "learning_rate": 3.833837173239044e-06, + "loss": 0.0165, + "mean_token_accuracy": 0.994154654443264, + "num_tokens": 185895695.0, + "step": 1559 + }, + { + "entropy": 0.639279194176197, + "epoch": 3.5543199315654403, + "grad_norm": 0.5234375, + "learning_rate": 3.832243815141126e-06, + "loss": 0.0248, + "mean_token_accuracy": 0.9931284859776497, + "num_tokens": 186014525.0, + "step": 1560 + }, + { + "entropy": 0.6384925097227097, + "epoch": 3.556601083547191, + "grad_norm": 0.6640625, + "learning_rate": 3.830649700887339e-06, + "loss": 0.0212, + "mean_token_accuracy": 0.9932339489459991, + "num_tokens": 186133552.0, + "step": 1561 + }, + { + "entropy": 0.6371860057115555, + "epoch": 3.5588822355289422, + "grad_norm": 0.421875, + "learning_rate": 3.829054831382471e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.9958512783050537, + "num_tokens": 186253143.0, + "step": 1562 + }, + { + "entropy": 0.6427948251366615, + "epoch": 3.561163387510693, + "grad_norm": 0.609375, + "learning_rate": 3.827459207531739e-06, + "loss": 0.0233, + "mean_token_accuracy": 0.9933373406529427, + "num_tokens": 186372816.0, + "step": 1563 + }, + { + "entropy": 0.640685610473156, + "epoch": 3.5634445394924437, + "grad_norm": 0.58203125, + "learning_rate": 3.825862830240787e-06, + "loss": 0.0179, + "mean_token_accuracy": 0.9932584837079048, + "num_tokens": 186492334.0, + "step": 1564 + }, + { + "entropy": 0.6342457681894302, + "epoch": 3.5657256914741944, + "grad_norm": 0.4296875, + "learning_rate": 3.82426570041569e-06, + "loss": 0.0091, + "mean_token_accuracy": 0.9969242811203003, + "num_tokens": 186610903.0, + "step": 1565 + }, + { + "entropy": 0.6399726420640945, + "epoch": 3.568006843455945, + "grad_norm": 0.578125, + "learning_rate": 3.822667818962948e-06, + "loss": 0.0197, + "mean_token_accuracy": 0.9935248866677284, + "num_tokens": 186730147.0, + "step": 1566 + }, + { + "entropy": 0.6415124386548996, + "epoch": 3.5702879954376963, + "grad_norm": 0.55859375, + "learning_rate": 3.821069186789486e-06, + "loss": 0.0188, + "mean_token_accuracy": 0.9933437928557396, + "num_tokens": 186849333.0, + "step": 1567 + }, + { + "entropy": 0.6452320292592049, + "epoch": 3.572569147419447, + "grad_norm": 0.4765625, + "learning_rate": 3.819469804802659e-06, + "loss": 0.0162, + "mean_token_accuracy": 0.9952565506100655, + "num_tokens": 186968813.0, + "step": 1568 + }, + { + "entropy": 0.6410217359662056, + "epoch": 3.5748502994011977, + "grad_norm": 0.5703125, + "learning_rate": 3.8178696739102435e-06, + "loss": 0.0212, + "mean_token_accuracy": 0.9935290738940239, + "num_tokens": 187088195.0, + "step": 1569 + }, + { + "entropy": 0.6354967728257179, + "epoch": 3.5771314513829484, + "grad_norm": 0.44140625, + "learning_rate": 3.816268795020443e-06, + "loss": 0.0142, + "mean_token_accuracy": 0.9961283728480339, + "num_tokens": 187207983.0, + "step": 1570 + }, + { + "entropy": 0.6364392563700676, + "epoch": 3.579412603364699, + "grad_norm": 0.66015625, + "learning_rate": 3.814667169041887e-06, + "loss": 0.0177, + "mean_token_accuracy": 0.994110606610775, + "num_tokens": 187326777.0, + "step": 1571 + }, + { + "entropy": 0.6413834393024445, + "epoch": 3.58169375534645, + "grad_norm": 0.6953125, + "learning_rate": 3.8130647968836254e-06, + "loss": 0.0218, + "mean_token_accuracy": 0.9927206784486771, + "num_tokens": 187446090.0, + "step": 1572 + }, + { + "entropy": 0.6407516002655029, + "epoch": 3.5839749073282006, + "grad_norm": 0.376953125, + "learning_rate": 3.811461679455136e-06, + "loss": 0.0139, + "mean_token_accuracy": 0.9961866214871407, + "num_tokens": 187564984.0, + "step": 1573 + }, + { + "entropy": 0.6450999155640602, + "epoch": 3.5862560593099513, + "grad_norm": 0.5859375, + "learning_rate": 3.809857817666316e-06, + "loss": 0.0207, + "mean_token_accuracy": 0.9943684414029121, + "num_tokens": 187685238.0, + "step": 1574 + }, + { + "entropy": 0.6436958163976669, + "epoch": 3.5885372112917024, + "grad_norm": 0.458984375, + "learning_rate": 3.808253212427486e-06, + "loss": 0.0169, + "mean_token_accuracy": 0.9948701858520508, + "num_tokens": 187804662.0, + "step": 1575 + }, + { + "entropy": 0.6399054452776909, + "epoch": 3.590818363273453, + "grad_norm": 0.494140625, + "learning_rate": 3.8066478646493898e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.9959145188331604, + "num_tokens": 187924054.0, + "step": 1576 + }, + { + "entropy": 0.642268106341362, + "epoch": 3.593099515255204, + "grad_norm": 0.6953125, + "learning_rate": 3.805041775243191e-06, + "loss": 0.0257, + "mean_token_accuracy": 0.991722822189331, + "num_tokens": 188043700.0, + "step": 1577 + }, + { + "entropy": 0.6441535726189613, + "epoch": 3.5953806672369546, + "grad_norm": 0.498046875, + "learning_rate": 3.803434945120475e-06, + "loss": 0.018, + "mean_token_accuracy": 0.994595468044281, + "num_tokens": 188163337.0, + "step": 1578 + }, + { + "entropy": 0.643576093018055, + "epoch": 3.5976618192187053, + "grad_norm": 0.55078125, + "learning_rate": 3.801827375193249e-06, + "loss": 0.0214, + "mean_token_accuracy": 0.9924560263752937, + "num_tokens": 188282589.0, + "step": 1579 + }, + { + "entropy": 0.6372067257761955, + "epoch": 3.5999429712004565, + "grad_norm": 0.53125, + "learning_rate": 3.8002190663739362e-06, + "loss": 0.0159, + "mean_token_accuracy": 0.9942797720432281, + "num_tokens": 188401923.0, + "step": 1580 + }, + { + "entropy": 0.6405310034751892, + "epoch": 3.602224123182207, + "grad_norm": 0.64453125, + "learning_rate": 3.798610019575384e-06, + "loss": 0.0201, + "mean_token_accuracy": 0.9925331696867943, + "num_tokens": 188520817.0, + "step": 1581 + }, + { + "entropy": 0.6425912007689476, + "epoch": 3.604505275163958, + "grad_norm": 0.50390625, + "learning_rate": 3.7970002357108554e-06, + "loss": 0.0207, + "mean_token_accuracy": 0.993175745010376, + "num_tokens": 188639929.0, + "step": 1582 + }, + { + "entropy": 0.6424811482429504, + "epoch": 3.6067864271457086, + "grad_norm": 0.435546875, + "learning_rate": 3.7953897156940323e-06, + "loss": 0.0166, + "mean_token_accuracy": 0.9952887520194054, + "num_tokens": 188758918.0, + "step": 1583 + }, + { + "entropy": 0.643259309232235, + "epoch": 3.6090675791274593, + "grad_norm": 0.55859375, + "learning_rate": 3.793778460439015e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9934065416455269, + "num_tokens": 188877864.0, + "step": 1584 + }, + { + "entropy": 0.6469481810927391, + "epoch": 3.61134873110921, + "grad_norm": 0.5078125, + "learning_rate": 3.792166470860321e-06, + "loss": 0.0222, + "mean_token_accuracy": 0.993035301566124, + "num_tokens": 188996961.0, + "step": 1585 + }, + { + "entropy": 0.6396861150860786, + "epoch": 3.613629883090961, + "grad_norm": 0.578125, + "learning_rate": 3.790553747872885e-06, + "loss": 0.0234, + "mean_token_accuracy": 0.9919690191745758, + "num_tokens": 189116101.0, + "step": 1586 + }, + { + "entropy": 0.646305225789547, + "epoch": 3.6159110350727115, + "grad_norm": 0.4609375, + "learning_rate": 3.788940292392056e-06, + "loss": 0.0184, + "mean_token_accuracy": 0.9924203529953957, + "num_tokens": 189236056.0, + "step": 1587 + }, + { + "entropy": 0.6447113305330276, + "epoch": 3.6181921870544627, + "grad_norm": 0.703125, + "learning_rate": 3.787326105333601e-06, + "loss": 0.0175, + "mean_token_accuracy": 0.9950546845793724, + "num_tokens": 189355946.0, + "step": 1588 + }, + { + "entropy": 0.6443922519683838, + "epoch": 3.6204733390362134, + "grad_norm": 0.6640625, + "learning_rate": 3.7857111876137017e-06, + "loss": 0.0223, + "mean_token_accuracy": 0.9935116022825241, + "num_tokens": 189475042.0, + "step": 1589 + }, + { + "entropy": 0.6441709920763969, + "epoch": 3.622754491017964, + "grad_norm": 0.55078125, + "learning_rate": 3.784095540148954e-06, + "loss": 0.0199, + "mean_token_accuracy": 0.9921174049377441, + "num_tokens": 189594046.0, + "step": 1590 + }, + { + "entropy": 0.6481105238199234, + "epoch": 3.625035642999715, + "grad_norm": 0.6796875, + "learning_rate": 3.7824791638563674e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9941760003566742, + "num_tokens": 189713357.0, + "step": 1591 + }, + { + "entropy": 0.6500027999281883, + "epoch": 3.6273167949814655, + "grad_norm": 0.427734375, + "learning_rate": 3.7808620596533675e-06, + "loss": 0.0126, + "mean_token_accuracy": 0.9963845163583755, + "num_tokens": 189832712.0, + "step": 1592 + }, + { + "entropy": 0.6466986387968063, + "epoch": 3.6295979469632167, + "grad_norm": 0.52734375, + "learning_rate": 3.77924422845779e-06, + "loss": 0.022, + "mean_token_accuracy": 0.9938987344503403, + "num_tokens": 189951440.0, + "step": 1593 + }, + { + "entropy": 0.6442811638116837, + "epoch": 3.6318790989449674, + "grad_norm": 0.5546875, + "learning_rate": 3.7776256711878856e-06, + "loss": 0.0218, + "mean_token_accuracy": 0.9929486885666847, + "num_tokens": 190070637.0, + "step": 1594 + }, + { + "entropy": 0.6462531462311745, + "epoch": 3.634160250926718, + "grad_norm": 0.46484375, + "learning_rate": 3.7760063887623155e-06, + "loss": 0.0158, + "mean_token_accuracy": 0.9936605617403984, + "num_tokens": 190189587.0, + "step": 1595 + }, + { + "entropy": 0.6410996913909912, + "epoch": 3.636441402908469, + "grad_norm": 0.423828125, + "learning_rate": 3.7743863821001538e-06, + "loss": 0.0145, + "mean_token_accuracy": 0.9948446601629257, + "num_tokens": 190309631.0, + "step": 1596 + }, + { + "entropy": 0.642403669655323, + "epoch": 3.6387225548902196, + "grad_norm": 0.5546875, + "learning_rate": 3.7727656521208843e-06, + "loss": 0.0184, + "mean_token_accuracy": 0.9943488836288452, + "num_tokens": 190428650.0, + "step": 1597 + }, + { + "entropy": 0.641104444861412, + "epoch": 3.6410037068719703, + "grad_norm": 0.61328125, + "learning_rate": 3.771144199744402e-06, + "loss": 0.021, + "mean_token_accuracy": 0.9935358688235283, + "num_tokens": 190547498.0, + "step": 1598 + }, + { + "entropy": 0.6391033679246902, + "epoch": 3.643284858853721, + "grad_norm": 0.408203125, + "learning_rate": 3.7695220258910124e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.9952377602458, + "num_tokens": 190666731.0, + "step": 1599 + }, + { + "entropy": 0.6398038119077682, + "epoch": 3.6455660108354717, + "grad_norm": 0.52734375, + "learning_rate": 3.7678991314814305e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9934115558862686, + "num_tokens": 190785453.0, + "step": 1600 + }, + { + "entropy": 0.6447111815214157, + "epoch": 3.6478471628172224, + "grad_norm": 0.474609375, + "learning_rate": 3.766275517436779e-06, + "loss": 0.0173, + "mean_token_accuracy": 0.9950995296239853, + "num_tokens": 190904732.0, + "step": 1601 + }, + { + "entropy": 0.6402760297060013, + "epoch": 3.6501283147989736, + "grad_norm": 0.6328125, + "learning_rate": 3.7646511846785904e-06, + "loss": 0.0224, + "mean_token_accuracy": 0.9916575253009796, + "num_tokens": 191024278.0, + "step": 1602 + }, + { + "entropy": 0.6428078711032867, + "epoch": 3.6524094667807243, + "grad_norm": 0.4609375, + "learning_rate": 3.7630261341288044e-06, + "loss": 0.0124, + "mean_token_accuracy": 0.9953137040138245, + "num_tokens": 191143451.0, + "step": 1603 + }, + { + "entropy": 0.6413597390055656, + "epoch": 3.654690618762475, + "grad_norm": 0.609375, + "learning_rate": 3.7614003667097674e-06, + "loss": 0.0225, + "mean_token_accuracy": 0.9930373132228851, + "num_tokens": 191262833.0, + "step": 1604 + }, + { + "entropy": 0.6442285031080246, + "epoch": 3.6569717707442257, + "grad_norm": 0.640625, + "learning_rate": 3.759773883344236e-06, + "loss": 0.0247, + "mean_token_accuracy": 0.9911560267210007, + "num_tokens": 191381489.0, + "step": 1605 + }, + { + "entropy": 0.6510807350277901, + "epoch": 3.6592529227259765, + "grad_norm": 0.52734375, + "learning_rate": 3.7581466849553685e-06, + "loss": 0.0169, + "mean_token_accuracy": 0.9965724647045135, + "num_tokens": 191501867.0, + "step": 1606 + }, + { + "entropy": 0.6418421193957329, + "epoch": 3.6615340747077276, + "grad_norm": 0.48828125, + "learning_rate": 3.7565187724667324e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.9950907528400421, + "num_tokens": 191621648.0, + "step": 1607 + }, + { + "entropy": 0.6392733976244926, + "epoch": 3.6638152266894783, + "grad_norm": 0.5625, + "learning_rate": 3.7548901468022993e-06, + "loss": 0.0171, + "mean_token_accuracy": 0.9932411313056946, + "num_tokens": 191741002.0, + "step": 1608 + }, + { + "entropy": 0.6408142000436783, + "epoch": 3.666096378671229, + "grad_norm": 0.50390625, + "learning_rate": 3.7532608088864444e-06, + "loss": 0.018, + "mean_token_accuracy": 0.9940482005476952, + "num_tokens": 191859996.0, + "step": 1609 + }, + { + "entropy": 0.6391451731324196, + "epoch": 3.66837753065298, + "grad_norm": 0.5, + "learning_rate": 3.75163075964395e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.9949234500527382, + "num_tokens": 191979415.0, + "step": 1610 + }, + { + "entropy": 0.6372823342680931, + "epoch": 3.6706586826347305, + "grad_norm": 0.578125, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0249, + "mean_token_accuracy": 0.9900059401988983, + "num_tokens": 192099370.0, + "step": 1611 + }, + { + "entropy": 0.6409793719649315, + "epoch": 3.672939834616481, + "grad_norm": 0.453125, + "learning_rate": 3.748368530880183e-06, + "loss": 0.0143, + "mean_token_accuracy": 0.995820052921772, + "num_tokens": 192217913.0, + "step": 1612 + }, + { + "entropy": 0.6400706395506859, + "epoch": 3.675220986598232, + "grad_norm": 0.412109375, + "learning_rate": 3.7467363532104874e-06, + "loss": 0.0139, + "mean_token_accuracy": 0.9957587793469429, + "num_tokens": 192337734.0, + "step": 1613 + }, + { + "entropy": 0.6414958834648132, + "epoch": 3.6775021385799826, + "grad_norm": 0.546875, + "learning_rate": 3.7451034679173082e-06, + "loss": 0.0238, + "mean_token_accuracy": 0.9927613511681557, + "num_tokens": 192456942.0, + "step": 1614 + }, + { + "entropy": 0.642272062599659, + "epoch": 3.679783290561734, + "grad_norm": 0.55859375, + "learning_rate": 3.7434698759274366e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.9941774308681488, + "num_tokens": 192577055.0, + "step": 1615 + }, + { + "entropy": 0.6428976655006409, + "epoch": 3.6820644425434845, + "grad_norm": 0.400390625, + "learning_rate": 3.741835578168071e-06, + "loss": 0.0148, + "mean_token_accuracy": 0.9960945770144463, + "num_tokens": 192696847.0, + "step": 1616 + }, + { + "entropy": 0.6396533474326134, + "epoch": 3.6843455945252352, + "grad_norm": 0.5546875, + "learning_rate": 3.740200575566806e-06, + "loss": 0.0219, + "mean_token_accuracy": 0.9937810003757477, + "num_tokens": 192815921.0, + "step": 1617 + }, + { + "entropy": 0.6364499032497406, + "epoch": 3.686626746506986, + "grad_norm": 0.375, + "learning_rate": 3.7385648690516364e-06, + "loss": 0.0112, + "mean_token_accuracy": 0.9965809285640717, + "num_tokens": 192935458.0, + "step": 1618 + }, + { + "entropy": 0.643454059958458, + "epoch": 3.6889078984887367, + "grad_norm": 0.431640625, + "learning_rate": 3.7369284595509587e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.9943812564015388, + "num_tokens": 193055425.0, + "step": 1619 + }, + { + "entropy": 0.6392692252993584, + "epoch": 3.691189050470488, + "grad_norm": 0.447265625, + "learning_rate": 3.7352913479935672e-06, + "loss": 0.0171, + "mean_token_accuracy": 0.9947753250598907, + "num_tokens": 193175394.0, + "step": 1620 + }, + { + "entropy": 0.6432261392474174, + "epoch": 3.6934702024522386, + "grad_norm": 0.5625, + "learning_rate": 3.7336535353086546e-06, + "loss": 0.0222, + "mean_token_accuracy": 0.9936353266239166, + "num_tokens": 193295108.0, + "step": 1621 + }, + { + "entropy": 0.6406015232205391, + "epoch": 3.6957513544339893, + "grad_norm": 0.62109375, + "learning_rate": 3.7320150224258124e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9953729137778282, + "num_tokens": 193414554.0, + "step": 1622 + }, + { + "entropy": 0.6408230438828468, + "epoch": 3.69803250641574, + "grad_norm": 0.6171875, + "learning_rate": 3.7303758102750274e-06, + "loss": 0.0155, + "mean_token_accuracy": 0.9952927678823471, + "num_tokens": 193534040.0, + "step": 1623 + }, + { + "entropy": 0.6383534595370293, + "epoch": 3.7003136583974907, + "grad_norm": 0.4375, + "learning_rate": 3.7287358997866872e-06, + "loss": 0.0179, + "mean_token_accuracy": 0.9932822212576866, + "num_tokens": 193653133.0, + "step": 1624 + }, + { + "entropy": 0.6399819478392601, + "epoch": 3.7025948103792414, + "grad_norm": 0.65234375, + "learning_rate": 3.7270952918915715e-06, + "loss": 0.0233, + "mean_token_accuracy": 0.9925026297569275, + "num_tokens": 193771968.0, + "step": 1625 + }, + { + "entropy": 0.639611229300499, + "epoch": 3.704875962360992, + "grad_norm": 0.3984375, + "learning_rate": 3.7254539875208577e-06, + "loss": 0.0118, + "mean_token_accuracy": 0.9962161928415298, + "num_tokens": 193891277.0, + "step": 1626 + }, + { + "entropy": 0.6403531208634377, + "epoch": 3.707157114342743, + "grad_norm": 0.65625, + "learning_rate": 3.7238119876061196e-06, + "loss": 0.0181, + "mean_token_accuracy": 0.9940015748143196, + "num_tokens": 194010264.0, + "step": 1627 + }, + { + "entropy": 0.63954808562994, + "epoch": 3.709438266324494, + "grad_norm": 0.51171875, + "learning_rate": 3.7221692930793234e-06, + "loss": 0.0182, + "mean_token_accuracy": 0.9952571764588356, + "num_tokens": 194129445.0, + "step": 1628 + }, + { + "entropy": 0.6342084780335426, + "epoch": 3.7117194183062447, + "grad_norm": 0.51953125, + "learning_rate": 3.7205259048728316e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9949179515242577, + "num_tokens": 194249398.0, + "step": 1629 + }, + { + "entropy": 0.6382554620504379, + "epoch": 3.7140005702879955, + "grad_norm": 0.7734375, + "learning_rate": 3.718881823919399e-06, + "loss": 0.0239, + "mean_token_accuracy": 0.9934902191162109, + "num_tokens": 194369252.0, + "step": 1630 + }, + { + "entropy": 0.6371995359659195, + "epoch": 3.716281722269746, + "grad_norm": 0.408203125, + "learning_rate": 3.717237051152175e-06, + "loss": 0.0101, + "mean_token_accuracy": 0.9956794902682304, + "num_tokens": 194488760.0, + "step": 1631 + }, + { + "entropy": 0.6400424987077713, + "epoch": 3.718562874251497, + "grad_norm": 0.455078125, + "learning_rate": 3.7155915875047005e-06, + "loss": 0.0162, + "mean_token_accuracy": 0.9952758774161339, + "num_tokens": 194608178.0, + "step": 1632 + }, + { + "entropy": 0.6398855596780777, + "epoch": 3.720844026233248, + "grad_norm": 0.5234375, + "learning_rate": 3.7139454339109082e-06, + "loss": 0.0171, + "mean_token_accuracy": 0.9938675165176392, + "num_tokens": 194727299.0, + "step": 1633 + }, + { + "entropy": 0.6414341628551483, + "epoch": 3.7231251782149988, + "grad_norm": 0.4375, + "learning_rate": 3.7122985913051242e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9962469041347504, + "num_tokens": 194846975.0, + "step": 1634 + }, + { + "entropy": 0.6428819224238396, + "epoch": 3.7254063301967495, + "grad_norm": 0.55859375, + "learning_rate": 3.710651060622064e-06, + "loss": 0.0113, + "mean_token_accuracy": 0.9967411980032921, + "num_tokens": 194966224.0, + "step": 1635 + }, + { + "entropy": 0.6397853717207909, + "epoch": 3.7276874821785, + "grad_norm": 0.60546875, + "learning_rate": 3.7090028427968343e-06, + "loss": 0.0216, + "mean_token_accuracy": 0.9926569238305092, + "num_tokens": 195084862.0, + "step": 1636 + }, + { + "entropy": 0.6464239731431007, + "epoch": 3.729968634160251, + "grad_norm": 0.55078125, + "learning_rate": 3.7073539387649316e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9949110969901085, + "num_tokens": 195203863.0, + "step": 1637 + }, + { + "entropy": 0.638613760471344, + "epoch": 3.7322497861420016, + "grad_norm": 0.51953125, + "learning_rate": 3.7057043494622423e-06, + "loss": 0.0147, + "mean_token_accuracy": 0.9940949976444244, + "num_tokens": 195323120.0, + "step": 1638 + }, + { + "entropy": 0.6370201483368874, + "epoch": 3.7345309381237524, + "grad_norm": 0.57421875, + "learning_rate": 3.704054075825042e-06, + "loss": 0.0179, + "mean_token_accuracy": 0.9932908043265343, + "num_tokens": 195442188.0, + "step": 1639 + }, + { + "entropy": 0.6432831287384033, + "epoch": 3.736812090105503, + "grad_norm": 0.5546875, + "learning_rate": 3.702403118789992e-06, + "loss": 0.0211, + "mean_token_accuracy": 0.9936428591609001, + "num_tokens": 195561846.0, + "step": 1640 + }, + { + "entropy": 0.6449670195579529, + "epoch": 3.739093242087254, + "grad_norm": 0.380859375, + "learning_rate": 3.7007514792941462e-06, + "loss": 0.0129, + "mean_token_accuracy": 0.9959082677960396, + "num_tokens": 195681413.0, + "step": 1641 + }, + { + "entropy": 0.6404432952404022, + "epoch": 3.741374394069005, + "grad_norm": 0.5859375, + "learning_rate": 3.6990991582749414e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9939389675855637, + "num_tokens": 195800952.0, + "step": 1642 + }, + { + "entropy": 0.642871230840683, + "epoch": 3.7436555460507557, + "grad_norm": 0.53125, + "learning_rate": 3.6974461566702048e-06, + "loss": 0.0227, + "mean_token_accuracy": 0.9931874424219131, + "num_tokens": 195920572.0, + "step": 1643 + }, + { + "entropy": 0.6347827389836311, + "epoch": 3.7459366980325064, + "grad_norm": 0.671875, + "learning_rate": 3.695792475418146e-06, + "loss": 0.0221, + "mean_token_accuracy": 0.9933823645114899, + "num_tokens": 196040003.0, + "step": 1644 + }, + { + "entropy": 0.6381771340966225, + "epoch": 3.748217850014257, + "grad_norm": 0.470703125, + "learning_rate": 3.6941381154573646e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.994377925992012, + "num_tokens": 196159120.0, + "step": 1645 + }, + { + "entropy": 0.6414132341742516, + "epoch": 3.750499001996008, + "grad_norm": 0.58203125, + "learning_rate": 3.692483077726843e-06, + "loss": 0.02, + "mean_token_accuracy": 0.9934748187661171, + "num_tokens": 196278624.0, + "step": 1646 + }, + { + "entropy": 0.634661853313446, + "epoch": 3.752780153977759, + "grad_norm": 0.451171875, + "learning_rate": 3.6908273631659475e-06, + "loss": 0.0168, + "mean_token_accuracy": 0.9948225095868111, + "num_tokens": 196397192.0, + "step": 1647 + }, + { + "entropy": 0.6416900679469109, + "epoch": 3.7550613059595097, + "grad_norm": 0.62109375, + "learning_rate": 3.689170972714431e-06, + "loss": 0.0242, + "mean_token_accuracy": 0.9922632724046707, + "num_tokens": 196517052.0, + "step": 1648 + }, + { + "entropy": 0.6383163183927536, + "epoch": 3.7573424579412604, + "grad_norm": 0.412109375, + "learning_rate": 3.6875139073124277e-06, + "loss": 0.0134, + "mean_token_accuracy": 0.9950861036777496, + "num_tokens": 196636422.0, + "step": 1649 + }, + { + "entropy": 0.6382315382361412, + "epoch": 3.759623609923011, + "grad_norm": 0.515625, + "learning_rate": 3.6858561679004567e-06, + "loss": 0.0168, + "mean_token_accuracy": 0.9941170588135719, + "num_tokens": 196755812.0, + "step": 1650 + }, + { + "entropy": 0.6432105004787445, + "epoch": 3.761904761904762, + "grad_norm": 0.609375, + "learning_rate": 3.684197755419419e-06, + "loss": 0.0194, + "mean_token_accuracy": 0.99325330555439, + "num_tokens": 196875260.0, + "step": 1651 + }, + { + "entropy": 0.6422563940286636, + "epoch": 3.7641859138865126, + "grad_norm": 0.55078125, + "learning_rate": 3.6825386708105963e-06, + "loss": 0.0165, + "mean_token_accuracy": 0.993855707347393, + "num_tokens": 196994523.0, + "step": 1652 + }, + { + "entropy": 0.64356629550457, + "epoch": 3.7664670658682633, + "grad_norm": 0.484375, + "learning_rate": 3.6808789150156545e-06, + "loss": 0.0172, + "mean_token_accuracy": 0.9939728528261185, + "num_tokens": 197113852.0, + "step": 1653 + }, + { + "entropy": 0.6407823041081429, + "epoch": 3.768748217850014, + "grad_norm": 0.6640625, + "learning_rate": 3.679218488976638e-06, + "loss": 0.0271, + "mean_token_accuracy": 0.9916809722781181, + "num_tokens": 197233310.0, + "step": 1654 + }, + { + "entropy": 0.6483634412288666, + "epoch": 3.771029369831765, + "grad_norm": 0.6015625, + "learning_rate": 3.677557393635973e-06, + "loss": 0.0179, + "mean_token_accuracy": 0.9940032586455345, + "num_tokens": 197352781.0, + "step": 1655 + }, + { + "entropy": 0.6396982744336128, + "epoch": 3.773310521813516, + "grad_norm": 0.4765625, + "learning_rate": 3.6758956299364643e-06, + "loss": 0.016, + "mean_token_accuracy": 0.9946533516049385, + "num_tokens": 197471666.0, + "step": 1656 + }, + { + "entropy": 0.6435931921005249, + "epoch": 3.7755916737952666, + "grad_norm": 0.51953125, + "learning_rate": 3.674233198821299e-06, + "loss": 0.0207, + "mean_token_accuracy": 0.9951743707060814, + "num_tokens": 197592132.0, + "step": 1657 + }, + { + "entropy": 0.6372792944312096, + "epoch": 3.7778728257770173, + "grad_norm": 0.427734375, + "learning_rate": 3.6725701012340387e-06, + "loss": 0.0129, + "mean_token_accuracy": 0.9961308985948563, + "num_tokens": 197711298.0, + "step": 1658 + }, + { + "entropy": 0.6407987922430038, + "epoch": 3.780153977758768, + "grad_norm": 0.515625, + "learning_rate": 3.6709063381186267e-06, + "loss": 0.0241, + "mean_token_accuracy": 0.9919411092996597, + "num_tokens": 197831274.0, + "step": 1659 + }, + { + "entropy": 0.6443175300955772, + "epoch": 3.782435129740519, + "grad_norm": 0.61328125, + "learning_rate": 3.6692419104193823e-06, + "loss": 0.0198, + "mean_token_accuracy": 0.9944733455777168, + "num_tokens": 197950595.0, + "step": 1660 + }, + { + "entropy": 0.6429272517561913, + "epoch": 3.78471628172227, + "grad_norm": 0.5703125, + "learning_rate": 3.6675768190810023e-06, + "loss": 0.0171, + "mean_token_accuracy": 0.9944271445274353, + "num_tokens": 198069853.0, + "step": 1661 + }, + { + "entropy": 0.6430104747414589, + "epoch": 3.7869974337040206, + "grad_norm": 0.57421875, + "learning_rate": 3.665911065048561e-06, + "loss": 0.0204, + "mean_token_accuracy": 0.9927431121468544, + "num_tokens": 198190059.0, + "step": 1662 + }, + { + "entropy": 0.6453769207000732, + "epoch": 3.7892785856857714, + "grad_norm": 0.40234375, + "learning_rate": 3.6642446492675075e-06, + "loss": 0.0157, + "mean_token_accuracy": 0.9948774576187134, + "num_tokens": 198309183.0, + "step": 1663 + }, + { + "entropy": 0.6507212445139885, + "epoch": 3.791559737667522, + "grad_norm": 0.51171875, + "learning_rate": 3.6625775726836677e-06, + "loss": 0.0179, + "mean_token_accuracy": 0.9942691624164581, + "num_tokens": 198428513.0, + "step": 1664 + }, + { + "entropy": 0.6354446485638618, + "epoch": 3.793840889649273, + "grad_norm": 0.490234375, + "learning_rate": 3.6609098362432425e-06, + "loss": 0.0134, + "mean_token_accuracy": 0.9961624816060066, + "num_tokens": 198547543.0, + "step": 1665 + }, + { + "entropy": 0.643363781273365, + "epoch": 3.7961220416310235, + "grad_norm": 0.53515625, + "learning_rate": 3.659241440892806e-06, + "loss": 0.0225, + "mean_token_accuracy": 0.9929633438587189, + "num_tokens": 198666871.0, + "step": 1666 + }, + { + "entropy": 0.6433338150382042, + "epoch": 3.7984031936127742, + "grad_norm": 0.490234375, + "learning_rate": 3.6575723875793085e-06, + "loss": 0.0198, + "mean_token_accuracy": 0.9920924752950668, + "num_tokens": 198786086.0, + "step": 1667 + }, + { + "entropy": 0.6453112363815308, + "epoch": 3.8006843455945254, + "grad_norm": 0.443359375, + "learning_rate": 3.655902677250071e-06, + "loss": 0.0148, + "mean_token_accuracy": 0.9954273253679276, + "num_tokens": 198905597.0, + "step": 1668 + }, + { + "entropy": 0.6411551386117935, + "epoch": 3.802965497576276, + "grad_norm": 0.54296875, + "learning_rate": 3.6542323108527896e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.9933052957057953, + "num_tokens": 199025067.0, + "step": 1669 + }, + { + "entropy": 0.6434338688850403, + "epoch": 3.805246649558027, + "grad_norm": 0.53125, + "learning_rate": 3.652561289335532e-06, + "loss": 0.0218, + "mean_token_accuracy": 0.9928085505962372, + "num_tokens": 199143503.0, + "step": 1670 + }, + { + "entropy": 0.6436351090669632, + "epoch": 3.8075278015397775, + "grad_norm": 0.4921875, + "learning_rate": 3.6508896136467376e-06, + "loss": 0.0188, + "mean_token_accuracy": 0.9942793399095535, + "num_tokens": 199263174.0, + "step": 1671 + }, + { + "entropy": 0.6448797211050987, + "epoch": 3.8098089535215283, + "grad_norm": 1.125, + "learning_rate": 3.649217284735217e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.9937953501939774, + "num_tokens": 199382749.0, + "step": 1672 + }, + { + "entropy": 0.641070045530796, + "epoch": 3.8120901055032794, + "grad_norm": 0.40625, + "learning_rate": 3.6475443035501522e-06, + "loss": 0.0165, + "mean_token_accuracy": 0.9964389503002167, + "num_tokens": 199501746.0, + "step": 1673 + }, + { + "entropy": 0.6428672671318054, + "epoch": 3.81437125748503, + "grad_norm": 0.5234375, + "learning_rate": 3.645870671041095e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.995056077837944, + "num_tokens": 199621074.0, + "step": 1674 + }, + { + "entropy": 0.643464706838131, + "epoch": 3.816652409466781, + "grad_norm": 0.54296875, + "learning_rate": 3.6441963881579668e-06, + "loss": 0.0221, + "mean_token_accuracy": 0.9929622411727905, + "num_tokens": 199740781.0, + "step": 1675 + }, + { + "entropy": 0.6424876824021339, + "epoch": 3.8189335614485316, + "grad_norm": 0.47265625, + "learning_rate": 3.642521455851058e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.9926480501890182, + "num_tokens": 199859961.0, + "step": 1676 + }, + { + "entropy": 0.6467471569776535, + "epoch": 3.8212147134302823, + "grad_norm": 0.396484375, + "learning_rate": 3.6408458750710284e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.9956288412213326, + "num_tokens": 199979497.0, + "step": 1677 + }, + { + "entropy": 0.6459074169397354, + "epoch": 3.823495865412033, + "grad_norm": 0.62890625, + "learning_rate": 3.639169646768905e-06, + "loss": 0.0235, + "mean_token_accuracy": 0.9917472004890442, + "num_tokens": 200099333.0, + "step": 1678 + }, + { + "entropy": 0.648176796734333, + "epoch": 3.8257770173937837, + "grad_norm": 0.53125, + "learning_rate": 3.637492771896082e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.9944525957107544, + "num_tokens": 200218305.0, + "step": 1679 + }, + { + "entropy": 0.6430159136652946, + "epoch": 3.8280581693755344, + "grad_norm": 0.58984375, + "learning_rate": 3.6358152514043226e-06, + "loss": 0.0133, + "mean_token_accuracy": 0.9961045235395432, + "num_tokens": 200337660.0, + "step": 1680 + }, + { + "entropy": 0.6453122645616531, + "epoch": 3.830339321357285, + "grad_norm": 0.515625, + "learning_rate": 3.634137086245754e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.9944258704781532, + "num_tokens": 200457287.0, + "step": 1681 + }, + { + "entropy": 0.6398368626832962, + "epoch": 3.8326204733390363, + "grad_norm": 0.6796875, + "learning_rate": 3.6324582773728712e-06, + "loss": 0.0215, + "mean_token_accuracy": 0.993286207318306, + "num_tokens": 200576953.0, + "step": 1682 + }, + { + "entropy": 0.6453606933355331, + "epoch": 3.834901625320787, + "grad_norm": 0.69921875, + "learning_rate": 3.6307788257385325e-06, + "loss": 0.0281, + "mean_token_accuracy": 0.9919368922710419, + "num_tokens": 200696116.0, + "step": 1683 + }, + { + "entropy": 0.6400988027453423, + "epoch": 3.8371827773025378, + "grad_norm": 0.48828125, + "learning_rate": 3.6290987322959624e-06, + "loss": 0.0194, + "mean_token_accuracy": 0.9944823011755943, + "num_tokens": 200816301.0, + "step": 1684 + }, + { + "entropy": 0.6404668539762497, + "epoch": 3.8394639292842885, + "grad_norm": 0.5078125, + "learning_rate": 3.6274179979987507e-06, + "loss": 0.0161, + "mean_token_accuracy": 0.994559608399868, + "num_tokens": 200935377.0, + "step": 1685 + }, + { + "entropy": 0.638628788292408, + "epoch": 3.841745081266039, + "grad_norm": 0.65234375, + "learning_rate": 3.625736623800849e-06, + "loss": 0.0227, + "mean_token_accuracy": 0.992220863699913, + "num_tokens": 201054841.0, + "step": 1686 + }, + { + "entropy": 0.6496201455593109, + "epoch": 3.8440262332477904, + "grad_norm": 0.6171875, + "learning_rate": 3.624054610656572e-06, + "loss": 0.0241, + "mean_token_accuracy": 0.991427056491375, + "num_tokens": 201174668.0, + "step": 1687 + }, + { + "entropy": 0.6464825570583344, + "epoch": 3.846307385229541, + "grad_norm": 0.5625, + "learning_rate": 3.622371959520599e-06, + "loss": 0.0206, + "mean_token_accuracy": 0.994487076997757, + "num_tokens": 201293897.0, + "step": 1688 + }, + { + "entropy": 0.6426267772912979, + "epoch": 3.848588537211292, + "grad_norm": 0.63671875, + "learning_rate": 3.6206886713479705e-06, + "loss": 0.0267, + "mean_token_accuracy": 0.9921610280871391, + "num_tokens": 201413175.0, + "step": 1689 + }, + { + "entropy": 0.6398816704750061, + "epoch": 3.8508696891930425, + "grad_norm": 0.53125, + "learning_rate": 3.6190047470940875e-06, + "loss": 0.0217, + "mean_token_accuracy": 0.9940913841128349, + "num_tokens": 201532261.0, + "step": 1690 + }, + { + "entropy": 0.6407239809632301, + "epoch": 3.8531508411747932, + "grad_norm": 0.5390625, + "learning_rate": 3.6173201877147134e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.9946774542331696, + "num_tokens": 201651052.0, + "step": 1691 + }, + { + "entropy": 0.6414723172783852, + "epoch": 3.855431993156544, + "grad_norm": 0.478515625, + "learning_rate": 3.6156349941659717e-06, + "loss": 0.0162, + "mean_token_accuracy": 0.9947574362158775, + "num_tokens": 201770757.0, + "step": 1692 + }, + { + "entropy": 0.6408701837062836, + "epoch": 3.8577131451382947, + "grad_norm": 0.5, + "learning_rate": 3.613949167404345e-06, + "loss": 0.0182, + "mean_token_accuracy": 0.9941118210554123, + "num_tokens": 201889632.0, + "step": 1693 + }, + { + "entropy": 0.6399394422769547, + "epoch": 3.8599942971200454, + "grad_norm": 0.51171875, + "learning_rate": 3.6122627083866773e-06, + "loss": 0.0134, + "mean_token_accuracy": 0.9950448721647263, + "num_tokens": 202008549.0, + "step": 1694 + }, + { + "entropy": 0.6437289193272591, + "epoch": 3.8622754491017965, + "grad_norm": 0.41015625, + "learning_rate": 3.610575618070169e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.9955235496163368, + "num_tokens": 202127684.0, + "step": 1695 + }, + { + "entropy": 0.6457908377051353, + "epoch": 3.8645566010835473, + "grad_norm": 0.53125, + "learning_rate": 3.6088878974123796e-06, + "loss": 0.0172, + "mean_token_accuracy": 0.9954199343919754, + "num_tokens": 202247514.0, + "step": 1696 + }, + { + "entropy": 0.6448739171028137, + "epoch": 3.866837753065298, + "grad_norm": 0.55078125, + "learning_rate": 3.6071995473712284e-06, + "loss": 0.0192, + "mean_token_accuracy": 0.9941673204302788, + "num_tokens": 202366551.0, + "step": 1697 + }, + { + "entropy": 0.6437110900878906, + "epoch": 3.8691189050470487, + "grad_norm": 0.486328125, + "learning_rate": 3.605510568904989e-06, + "loss": 0.0197, + "mean_token_accuracy": 0.9939732328057289, + "num_tokens": 202485926.0, + "step": 1698 + }, + { + "entropy": 0.6445790156722069, + "epoch": 3.8714000570287994, + "grad_norm": 0.484375, + "learning_rate": 3.6038209629722936e-06, + "loss": 0.0147, + "mean_token_accuracy": 0.9953648075461388, + "num_tokens": 202605426.0, + "step": 1699 + }, + { + "entropy": 0.6392658948898315, + "epoch": 3.8736812090105506, + "grad_norm": 0.546875, + "learning_rate": 3.6021307305321295e-06, + "loss": 0.0222, + "mean_token_accuracy": 0.9930849596858025, + "num_tokens": 202725302.0, + "step": 1700 + }, + { + "entropy": 0.6406156942248344, + "epoch": 3.8759623609923013, + "grad_norm": 0.39453125, + "learning_rate": 3.6004398725438406e-06, + "loss": 0.0155, + "mean_token_accuracy": 0.9953471645712852, + "num_tokens": 202844806.0, + "step": 1701 + }, + { + "entropy": 0.6429390907287598, + "epoch": 3.878243512974052, + "grad_norm": 0.55078125, + "learning_rate": 3.5987483899671245e-06, + "loss": 0.0215, + "mean_token_accuracy": 0.9934186413884163, + "num_tokens": 202964475.0, + "step": 1702 + }, + { + "entropy": 0.6401058956980705, + "epoch": 3.8805246649558027, + "grad_norm": 0.58984375, + "learning_rate": 3.597056283762034e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.9961468502879143, + "num_tokens": 203084724.0, + "step": 1703 + }, + { + "entropy": 0.6400596350431442, + "epoch": 3.8828058169375534, + "grad_norm": 0.5390625, + "learning_rate": 3.5953635548889777e-06, + "loss": 0.0204, + "mean_token_accuracy": 0.9940367341041565, + "num_tokens": 203203535.0, + "step": 1704 + }, + { + "entropy": 0.6430227756500244, + "epoch": 3.885086968919304, + "grad_norm": 0.55078125, + "learning_rate": 3.5936702043087134e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9945536404848099, + "num_tokens": 203322547.0, + "step": 1705 + }, + { + "entropy": 0.6416208073496819, + "epoch": 3.887368120901055, + "grad_norm": 0.62890625, + "learning_rate": 3.5919762329823556e-06, + "loss": 0.0275, + "mean_token_accuracy": 0.9921538531780243, + "num_tokens": 203442309.0, + "step": 1706 + }, + { + "entropy": 0.6468053758144379, + "epoch": 3.8896492728828056, + "grad_norm": 0.439453125, + "learning_rate": 3.5902816418713694e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.9951266720890999, + "num_tokens": 203561747.0, + "step": 1707 + }, + { + "entropy": 0.6441782340407372, + "epoch": 3.8919304248645568, + "grad_norm": 0.3984375, + "learning_rate": 3.5885864319375717e-06, + "loss": 0.0144, + "mean_token_accuracy": 0.9951953589916229, + "num_tokens": 203681820.0, + "step": 1708 + }, + { + "entropy": 0.6459113135933876, + "epoch": 3.8942115768463075, + "grad_norm": 0.53125, + "learning_rate": 3.5868906041431313e-06, + "loss": 0.0215, + "mean_token_accuracy": 0.9931679368019104, + "num_tokens": 203800829.0, + "step": 1709 + }, + { + "entropy": 0.6354555636644363, + "epoch": 3.896492728828058, + "grad_norm": 0.625, + "learning_rate": 3.5851941594505674e-06, + "loss": 0.0295, + "mean_token_accuracy": 0.9928982108831406, + "num_tokens": 203919900.0, + "step": 1710 + }, + { + "entropy": 0.6423862352967262, + "epoch": 3.898773880809809, + "grad_norm": 0.58984375, + "learning_rate": 3.5834970988227484e-06, + "loss": 0.0196, + "mean_token_accuracy": 0.9924761205911636, + "num_tokens": 204039507.0, + "step": 1711 + }, + { + "entropy": 0.6404493823647499, + "epoch": 3.9010550327915596, + "grad_norm": 0.53515625, + "learning_rate": 3.581799423222895e-06, + "loss": 0.019, + "mean_token_accuracy": 0.9940173551440239, + "num_tokens": 204159067.0, + "step": 1712 + }, + { + "entropy": 0.6409786865115166, + "epoch": 3.903336184773311, + "grad_norm": 0.69921875, + "learning_rate": 3.580101133614573e-06, + "loss": 0.0231, + "mean_token_accuracy": 0.9939274564385414, + "num_tokens": 204279536.0, + "step": 1713 + }, + { + "entropy": 0.6397727653384209, + "epoch": 3.9056173367550615, + "grad_norm": 0.66796875, + "learning_rate": 3.5784022309617006e-06, + "loss": 0.0231, + "mean_token_accuracy": 0.9933241456747055, + "num_tokens": 204399325.0, + "step": 1714 + }, + { + "entropy": 0.6442185714840889, + "epoch": 3.9078984887368122, + "grad_norm": 0.451171875, + "learning_rate": 3.57670271622854e-06, + "loss": 0.0165, + "mean_token_accuracy": 0.9943146109580994, + "num_tokens": 204518143.0, + "step": 1715 + }, + { + "entropy": 0.6408437788486481, + "epoch": 3.910179640718563, + "grad_norm": 0.46484375, + "learning_rate": 3.5750025903797053e-06, + "loss": 0.0196, + "mean_token_accuracy": 0.9935752227902412, + "num_tokens": 204636676.0, + "step": 1716 + }, + { + "entropy": 0.6407039314508438, + "epoch": 3.9124607927003137, + "grad_norm": 0.55078125, + "learning_rate": 3.5733018543801534e-06, + "loss": 0.0274, + "mean_token_accuracy": 0.9915737882256508, + "num_tokens": 204755398.0, + "step": 1717 + }, + { + "entropy": 0.6420653015375137, + "epoch": 3.9147419446820644, + "grad_norm": 0.5546875, + "learning_rate": 3.5716005091951906e-06, + "loss": 0.0233, + "mean_token_accuracy": 0.9923620373010635, + "num_tokens": 204874175.0, + "step": 1718 + }, + { + "entropy": 0.6423268243670464, + "epoch": 3.917023096663815, + "grad_norm": 0.47265625, + "learning_rate": 3.569898555790466e-06, + "loss": 0.023, + "mean_token_accuracy": 0.9947631806135178, + "num_tokens": 204992933.0, + "step": 1719 + }, + { + "entropy": 0.6471187323331833, + "epoch": 3.919304248645566, + "grad_norm": 0.43359375, + "learning_rate": 3.5681959951319766e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.9946877434849739, + "num_tokens": 205112745.0, + "step": 1720 + }, + { + "entropy": 0.6439356803894043, + "epoch": 3.9215854006273165, + "grad_norm": 0.5390625, + "learning_rate": 3.566492828186063e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.9942131489515305, + "num_tokens": 205231720.0, + "step": 1721 + }, + { + "entropy": 0.6524837091565132, + "epoch": 3.9238665526090677, + "grad_norm": 0.484375, + "learning_rate": 3.564789055919409e-06, + "loss": 0.0148, + "mean_token_accuracy": 0.9954639375209808, + "num_tokens": 205351147.0, + "step": 1722 + }, + { + "entropy": 0.6437823176383972, + "epoch": 3.9261477045908184, + "grad_norm": 0.478515625, + "learning_rate": 3.5630846792990435e-06, + "loss": 0.016, + "mean_token_accuracy": 0.9942243322730064, + "num_tokens": 205470477.0, + "step": 1723 + }, + { + "entropy": 0.6479365602135658, + "epoch": 3.928428856572569, + "grad_norm": 0.455078125, + "learning_rate": 3.5613796992923382e-06, + "loss": 0.0175, + "mean_token_accuracy": 0.9957788661122322, + "num_tokens": 205590066.0, + "step": 1724 + }, + { + "entropy": 0.6479618921875954, + "epoch": 3.93071000855432, + "grad_norm": 0.39453125, + "learning_rate": 3.559674116867006e-06, + "loss": 0.0112, + "mean_token_accuracy": 0.9971956983208656, + "num_tokens": 205709558.0, + "step": 1725 + }, + { + "entropy": 0.6426609605550766, + "epoch": 3.9329911605360706, + "grad_norm": 0.52734375, + "learning_rate": 3.5579679329911025e-06, + "loss": 0.0149, + "mean_token_accuracy": 0.9935547634959221, + "num_tokens": 205829036.0, + "step": 1726 + }, + { + "entropy": 0.6462381556630135, + "epoch": 3.9352723125178217, + "grad_norm": 0.45703125, + "learning_rate": 3.556261148633026e-06, + "loss": 0.0162, + "mean_token_accuracy": 0.9949177652597427, + "num_tokens": 205948639.0, + "step": 1727 + }, + { + "entropy": 0.6470400243997574, + "epoch": 3.9375534644995724, + "grad_norm": 0.5, + "learning_rate": 3.5545537647615125e-06, + "loss": 0.0219, + "mean_token_accuracy": 0.9927120432257652, + "num_tokens": 206067761.0, + "step": 1728 + }, + { + "entropy": 0.6425961032509804, + "epoch": 3.939834616481323, + "grad_norm": 0.61328125, + "learning_rate": 3.552845782345642e-06, + "loss": 0.0297, + "mean_token_accuracy": 0.99269949644804, + "num_tokens": 206186714.0, + "step": 1729 + }, + { + "entropy": 0.6429673805832863, + "epoch": 3.942115768463074, + "grad_norm": 0.47265625, + "learning_rate": 3.551137202354831e-06, + "loss": 0.0153, + "mean_token_accuracy": 0.9953842312097549, + "num_tokens": 206305831.0, + "step": 1730 + }, + { + "entropy": 0.6414889693260193, + "epoch": 3.9443969204448246, + "grad_norm": 0.41796875, + "learning_rate": 3.5494280257588367e-06, + "loss": 0.0133, + "mean_token_accuracy": 0.9960954710841179, + "num_tokens": 206425394.0, + "step": 1731 + }, + { + "entropy": 0.6468653306365013, + "epoch": 3.9466780724265753, + "grad_norm": 0.54296875, + "learning_rate": 3.547718253527755e-06, + "loss": 0.0181, + "mean_token_accuracy": 0.9946894347667694, + "num_tokens": 206544993.0, + "step": 1732 + }, + { + "entropy": 0.6456347927451134, + "epoch": 3.948959224408326, + "grad_norm": 0.52734375, + "learning_rate": 3.546007886632019e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.9931619316339493, + "num_tokens": 206663602.0, + "step": 1733 + }, + { + "entropy": 0.6510143727064133, + "epoch": 3.9512403763900767, + "grad_norm": 0.4921875, + "learning_rate": 3.5442969260424022e-06, + "loss": 0.0223, + "mean_token_accuracy": 0.9947517961263657, + "num_tokens": 206783024.0, + "step": 1734 + }, + { + "entropy": 0.6414418295025826, + "epoch": 3.953521528371828, + "grad_norm": 0.4453125, + "learning_rate": 3.5425853727300095e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.9959689751267433, + "num_tokens": 206902789.0, + "step": 1735 + }, + { + "entropy": 0.6415505856275558, + "epoch": 3.9558026803535786, + "grad_norm": 0.40234375, + "learning_rate": 3.5408732276662882e-06, + "loss": 0.0134, + "mean_token_accuracy": 0.995775118470192, + "num_tokens": 207021769.0, + "step": 1736 + }, + { + "entropy": 0.6371714249253273, + "epoch": 3.9580838323353293, + "grad_norm": 0.54296875, + "learning_rate": 3.5391604918230173e-06, + "loss": 0.0172, + "mean_token_accuracy": 0.994187206029892, + "num_tokens": 207141979.0, + "step": 1737 + }, + { + "entropy": 0.6395134180784225, + "epoch": 3.96036498431708, + "grad_norm": 0.404296875, + "learning_rate": 3.537447166172313e-06, + "loss": 0.0158, + "mean_token_accuracy": 0.9951215907931328, + "num_tokens": 207261273.0, + "step": 1738 + }, + { + "entropy": 0.6431818455457687, + "epoch": 3.962646136298831, + "grad_norm": 0.396484375, + "learning_rate": 3.5357332516866256e-06, + "loss": 0.0131, + "mean_token_accuracy": 0.9961171820759773, + "num_tokens": 207380325.0, + "step": 1739 + }, + { + "entropy": 0.6421076357364655, + "epoch": 3.964927288280582, + "grad_norm": 0.60546875, + "learning_rate": 3.534018749338741e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.9946480765938759, + "num_tokens": 207500115.0, + "step": 1740 + }, + { + "entropy": 0.6391152143478394, + "epoch": 3.9672084402623327, + "grad_norm": 0.48828125, + "learning_rate": 3.532303660101776e-06, + "loss": 0.0169, + "mean_token_accuracy": 0.9934677481651306, + "num_tokens": 207619268.0, + "step": 1741 + }, + { + "entropy": 0.6448415890336037, + "epoch": 3.9694895922440834, + "grad_norm": 0.56640625, + "learning_rate": 3.530587984949183e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9932295978069305, + "num_tokens": 207738325.0, + "step": 1742 + }, + { + "entropy": 0.6476233005523682, + "epoch": 3.971770744225834, + "grad_norm": 0.5390625, + "learning_rate": 3.5288717248547453e-06, + "loss": 0.0166, + "mean_token_accuracy": 0.9947032555937767, + "num_tokens": 207858662.0, + "step": 1743 + }, + { + "entropy": 0.642040342092514, + "epoch": 3.974051896207585, + "grad_norm": 0.48046875, + "learning_rate": 3.5271548807925803e-06, + "loss": 0.0155, + "mean_token_accuracy": 0.9950909614562988, + "num_tokens": 207978009.0, + "step": 1744 + }, + { + "entropy": 0.6457936838269234, + "epoch": 3.9763330481893355, + "grad_norm": 0.416015625, + "learning_rate": 3.525437453737136e-06, + "loss": 0.0153, + "mean_token_accuracy": 0.9952508583664894, + "num_tokens": 208097378.0, + "step": 1745 + }, + { + "entropy": 0.6392067596316338, + "epoch": 3.9786142001710862, + "grad_norm": 0.443359375, + "learning_rate": 3.5237194446631883e-06, + "loss": 0.0157, + "mean_token_accuracy": 0.9946862757205963, + "num_tokens": 208215778.0, + "step": 1746 + }, + { + "entropy": 0.6389280259609222, + "epoch": 3.980895352152837, + "grad_norm": 0.59765625, + "learning_rate": 3.522000854545849e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9938551560044289, + "num_tokens": 208334863.0, + "step": 1747 + }, + { + "entropy": 0.6416886821389198, + "epoch": 3.983176504134588, + "grad_norm": 0.59375, + "learning_rate": 3.520281684360554e-06, + "loss": 0.0241, + "mean_token_accuracy": 0.9926659092307091, + "num_tokens": 208454005.0, + "step": 1748 + }, + { + "entropy": 0.6467065662145615, + "epoch": 3.985457656116339, + "grad_norm": 0.494140625, + "learning_rate": 3.5185619350830725e-06, + "loss": 0.0177, + "mean_token_accuracy": 0.992884911596775, + "num_tokens": 208573011.0, + "step": 1749 + }, + { + "entropy": 0.6404049843549728, + "epoch": 3.9877388080980896, + "grad_norm": 0.384765625, + "learning_rate": 3.516841607689501e-06, + "loss": 0.0118, + "mean_token_accuracy": 0.9973809942603111, + "num_tokens": 208693005.0, + "step": 1750 + }, + { + "entropy": 0.6382281556725502, + "epoch": 3.9900199600798403, + "grad_norm": 0.51953125, + "learning_rate": 3.515120703156264e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.9928053766489029, + "num_tokens": 208811410.0, + "step": 1751 + }, + { + "entropy": 0.6435566619038582, + "epoch": 3.992301112061591, + "grad_norm": 0.640625, + "learning_rate": 3.5133992224601126e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.9932365715503693, + "num_tokens": 208930561.0, + "step": 1752 + }, + { + "entropy": 0.6448219120502472, + "epoch": 3.994582264043342, + "grad_norm": 0.6171875, + "learning_rate": 3.511677166578128e-06, + "loss": 0.0232, + "mean_token_accuracy": 0.9921178296208382, + "num_tokens": 209050719.0, + "step": 1753 + }, + { + "entropy": 0.6395334526896477, + "epoch": 3.996863416025093, + "grad_norm": 0.482421875, + "learning_rate": 3.509954536487714e-06, + "loss": 0.0168, + "mean_token_accuracy": 0.9942538738250732, + "num_tokens": 209170042.0, + "step": 1754 + }, + { + "entropy": 0.6496885716915131, + "epoch": 3.9991445680068436, + "grad_norm": 0.8515625, + "learning_rate": 3.5082313331666035e-06, + "loss": 0.032, + "mean_token_accuracy": 0.9894365221261978, + "num_tokens": 209290216.0, + "step": 1755 + }, + { + "entropy": 0.6517043908437093, + "epoch": 4.0, + "grad_norm": 0.82421875, + "learning_rate": 3.506507557592853e-06, + "loss": 0.018, + "mean_token_accuracy": 0.9957078695297241, + "num_tokens": 209333912.0, + "step": 1756 + }, + { + "entropy": 0.6455219984054565, + "epoch": 4.002281151981751, + "grad_norm": 0.48046875, + "learning_rate": 3.5047832107448437e-06, + "loss": 0.0202, + "mean_token_accuracy": 0.9934390559792519, + "num_tokens": 209452705.0, + "step": 1757 + }, + { + "entropy": 0.6424630060791969, + "epoch": 4.004562303963501, + "grad_norm": 0.63671875, + "learning_rate": 3.503058293601283e-06, + "loss": 0.0178, + "mean_token_accuracy": 0.9938667640089989, + "num_tokens": 209572481.0, + "step": 1758 + }, + { + "entropy": 0.6469209492206573, + "epoch": 4.006843455945252, + "grad_norm": 0.5859375, + "learning_rate": 3.5013328071411995e-06, + "loss": 0.0172, + "mean_token_accuracy": 0.9944320693612099, + "num_tokens": 209692374.0, + "step": 1759 + }, + { + "entropy": 0.6468159481883049, + "epoch": 4.009124607927003, + "grad_norm": 0.76171875, + "learning_rate": 3.499606752343945e-06, + "loss": 0.0222, + "mean_token_accuracy": 0.99288459867239, + "num_tokens": 209812578.0, + "step": 1760 + }, + { + "epoch": 4.009124607927003, + "eval_entropy": 0.6447644344754092, + "eval_loss": 0.02095600962638855, + "eval_mean_token_accuracy": 0.993453662205105, + "eval_num_tokens": 209812578.0, + "eval_runtime": 177.4992, + "eval_samples_per_second": 47.24, + "eval_steps_per_second": 1.482, + "step": 1760 + }, + { + "entropy": 0.6453612148761749, + "epoch": 4.011405759908754, + "grad_norm": 0.5078125, + "learning_rate": 3.4978801301891972e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.9932890459895134, + "num_tokens": 209932073.0, + "step": 1761 + }, + { + "entropy": 0.6447707340121269, + "epoch": 4.013686911890504, + "grad_norm": 0.60546875, + "learning_rate": 3.496152941656952e-06, + "loss": 0.0194, + "mean_token_accuracy": 0.9931846037507057, + "num_tokens": 210051123.0, + "step": 1762 + }, + { + "entropy": 0.6450674086809158, + "epoch": 4.015968063872256, + "grad_norm": 0.53125, + "learning_rate": 3.494425187727528e-06, + "loss": 0.0136, + "mean_token_accuracy": 0.99530029296875, + "num_tokens": 210170617.0, + "step": 1763 + }, + { + "entropy": 0.6448216363787651, + "epoch": 4.018249215854007, + "grad_norm": 0.5390625, + "learning_rate": 3.4926968693815667e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.9941430315375328, + "num_tokens": 210289821.0, + "step": 1764 + }, + { + "entropy": 0.6478173434734344, + "epoch": 4.020530367835757, + "grad_norm": 0.46875, + "learning_rate": 3.4909679876000256e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.9948790594935417, + "num_tokens": 210409485.0, + "step": 1765 + }, + { + "entropy": 0.6449841037392616, + "epoch": 4.022811519817508, + "grad_norm": 0.41796875, + "learning_rate": 3.4892385433641875e-06, + "loss": 0.0117, + "mean_token_accuracy": 0.9961125031113625, + "num_tokens": 210528814.0, + "step": 1766 + }, + { + "entropy": 0.6483430564403534, + "epoch": 4.025092671799259, + "grad_norm": 0.447265625, + "learning_rate": 3.4875085376556493e-06, + "loss": 0.0171, + "mean_token_accuracy": 0.9932544305920601, + "num_tokens": 210648534.0, + "step": 1767 + }, + { + "entropy": 0.6440916359424591, + "epoch": 4.0273738237810095, + "grad_norm": 0.5703125, + "learning_rate": 3.4857779714563305e-06, + "loss": 0.0216, + "mean_token_accuracy": 0.9913944378495216, + "num_tokens": 210767729.0, + "step": 1768 + }, + { + "entropy": 0.6431915760040283, + "epoch": 4.02965497576276, + "grad_norm": 0.462890625, + "learning_rate": 3.4840468457484654e-06, + "loss": 0.016, + "mean_token_accuracy": 0.9944159537553787, + "num_tokens": 210887362.0, + "step": 1769 + }, + { + "entropy": 0.6453786641359329, + "epoch": 4.031936127744511, + "grad_norm": 0.404296875, + "learning_rate": 3.4823151615146093e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.9945005923509598, + "num_tokens": 211006894.0, + "step": 1770 + }, + { + "entropy": 0.6430908516049385, + "epoch": 4.034217279726262, + "grad_norm": 0.53515625, + "learning_rate": 3.480582919737631e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.994571641087532, + "num_tokens": 211126156.0, + "step": 1771 + }, + { + "entropy": 0.6453077718615532, + "epoch": 4.036498431708012, + "grad_norm": 0.498046875, + "learning_rate": 3.478850121400719e-06, + "loss": 0.0172, + "mean_token_accuracy": 0.9941440895199776, + "num_tokens": 211246003.0, + "step": 1772 + }, + { + "entropy": 0.639974944293499, + "epoch": 4.038779583689763, + "grad_norm": 1.2578125, + "learning_rate": 3.477116767487375e-06, + "loss": 0.0172, + "mean_token_accuracy": 0.9945034310221672, + "num_tokens": 211365356.0, + "step": 1773 + }, + { + "entropy": 0.6442905962467194, + "epoch": 4.041060735671514, + "grad_norm": 0.482421875, + "learning_rate": 3.475382858981418e-06, + "loss": 0.0134, + "mean_token_accuracy": 0.9961519464850426, + "num_tokens": 211484528.0, + "step": 1774 + }, + { + "entropy": 0.6525718718767166, + "epoch": 4.0433418876532645, + "grad_norm": 0.458984375, + "learning_rate": 3.473648396866981e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.9942761212587357, + "num_tokens": 211603649.0, + "step": 1775 + }, + { + "entropy": 0.6435666009783745, + "epoch": 4.045623039635016, + "grad_norm": 0.4609375, + "learning_rate": 3.4719133821285108e-06, + "loss": 0.0166, + "mean_token_accuracy": 0.9947159290313721, + "num_tokens": 211723376.0, + "step": 1776 + }, + { + "entropy": 0.6437467113137245, + "epoch": 4.047904191616767, + "grad_norm": 0.49609375, + "learning_rate": 3.470177815750769e-06, + "loss": 0.018, + "mean_token_accuracy": 0.993337020277977, + "num_tokens": 211843731.0, + "step": 1777 + }, + { + "entropy": 0.6418144479393959, + "epoch": 4.050185343598518, + "grad_norm": 0.5, + "learning_rate": 3.4684416987188273e-06, + "loss": 0.0141, + "mean_token_accuracy": 0.9950406476855278, + "num_tokens": 211963717.0, + "step": 1778 + }, + { + "entropy": 0.64313043653965, + "epoch": 4.052466495580268, + "grad_norm": 0.5546875, + "learning_rate": 3.4667050320180755e-06, + "loss": 0.0225, + "mean_token_accuracy": 0.9932253137230873, + "num_tokens": 212083055.0, + "step": 1779 + }, + { + "entropy": 0.6436839029192924, + "epoch": 4.054747647562019, + "grad_norm": 0.63671875, + "learning_rate": 3.4649678166342104e-06, + "loss": 0.0209, + "mean_token_accuracy": 0.9942403435707092, + "num_tokens": 212202112.0, + "step": 1780 + }, + { + "entropy": 0.6452621966600418, + "epoch": 4.05702879954377, + "grad_norm": 0.56640625, + "learning_rate": 3.4632300535532415e-06, + "loss": 0.0201, + "mean_token_accuracy": 0.9937450662255287, + "num_tokens": 212321813.0, + "step": 1781 + }, + { + "entropy": 0.645625077188015, + "epoch": 4.05930995152552, + "grad_norm": 0.51171875, + "learning_rate": 3.46149174376149e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.9948355406522751, + "num_tokens": 212440704.0, + "step": 1782 + }, + { + "entropy": 0.641372948884964, + "epoch": 4.061591103507271, + "grad_norm": 0.46484375, + "learning_rate": 3.459752888245587e-06, + "loss": 0.0119, + "mean_token_accuracy": 0.9964514970779419, + "num_tokens": 212560198.0, + "step": 1783 + }, + { + "entropy": 0.6398207545280457, + "epoch": 4.063872255489022, + "grad_norm": 0.6328125, + "learning_rate": 3.4580134879924732e-06, + "loss": 0.027, + "mean_token_accuracy": 0.9898210614919662, + "num_tokens": 212679442.0, + "step": 1784 + }, + { + "entropy": 0.6410047933459282, + "epoch": 4.066153407470773, + "grad_norm": 0.466796875, + "learning_rate": 3.4562735439894e-06, + "loss": 0.0148, + "mean_token_accuracy": 0.9956777021288872, + "num_tokens": 212799315.0, + "step": 1785 + }, + { + "entropy": 0.6420860216021538, + "epoch": 4.068434559452523, + "grad_norm": 0.66796875, + "learning_rate": 3.4545330572239234e-06, + "loss": 0.0213, + "mean_token_accuracy": 0.9922531023621559, + "num_tokens": 212918391.0, + "step": 1786 + }, + { + "entropy": 0.6411397159099579, + "epoch": 4.070715711434274, + "grad_norm": 0.51171875, + "learning_rate": 3.452792028683912e-06, + "loss": 0.0169, + "mean_token_accuracy": 0.9940186142921448, + "num_tokens": 213037908.0, + "step": 1787 + }, + { + "entropy": 0.6413707062602043, + "epoch": 4.072996863416025, + "grad_norm": 0.5234375, + "learning_rate": 3.4510504593575396e-06, + "loss": 0.0179, + "mean_token_accuracy": 0.994599848985672, + "num_tokens": 213157178.0, + "step": 1788 + }, + { + "entropy": 0.6354794800281525, + "epoch": 4.0752780153977755, + "grad_norm": 0.59765625, + "learning_rate": 3.449308350233287e-06, + "loss": 0.0226, + "mean_token_accuracy": 0.9929919019341469, + "num_tokens": 213277199.0, + "step": 1789 + }, + { + "entropy": 0.6417021527886391, + "epoch": 4.077559167379527, + "grad_norm": 0.6015625, + "learning_rate": 3.447565702299942e-06, + "loss": 0.026, + "mean_token_accuracy": 0.9911521226167679, + "num_tokens": 213396546.0, + "step": 1790 + }, + { + "entropy": 0.6399157345294952, + "epoch": 4.079840319361278, + "grad_norm": 0.5390625, + "learning_rate": 3.445822516546598e-06, + "loss": 0.0246, + "mean_token_accuracy": 0.9938328489661217, + "num_tokens": 213516182.0, + "step": 1791 + }, + { + "entropy": 0.6447793915867805, + "epoch": 4.0821214713430285, + "grad_norm": 0.45703125, + "learning_rate": 3.444078793962653e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9951696172356606, + "num_tokens": 213635323.0, + "step": 1792 + }, + { + "entropy": 0.6427733600139618, + "epoch": 4.084402623324779, + "grad_norm": 0.478515625, + "learning_rate": 3.4423345355378114e-06, + "loss": 0.0195, + "mean_token_accuracy": 0.9936816021800041, + "num_tokens": 213754783.0, + "step": 1793 + }, + { + "entropy": 0.644688606262207, + "epoch": 4.08668377530653, + "grad_norm": 0.451171875, + "learning_rate": 3.440589742262079e-06, + "loss": 0.0147, + "mean_token_accuracy": 0.9966248050332069, + "num_tokens": 213873849.0, + "step": 1794 + }, + { + "entropy": 0.6377925798296928, + "epoch": 4.088964927288281, + "grad_norm": 0.5390625, + "learning_rate": 3.438844415125768e-06, + "loss": 0.0252, + "mean_token_accuracy": 0.9922417625784874, + "num_tokens": 213993262.0, + "step": 1795 + }, + { + "entropy": 0.6375655233860016, + "epoch": 4.091246079270031, + "grad_norm": 0.4765625, + "learning_rate": 3.437098555119493e-06, + "loss": 0.0158, + "mean_token_accuracy": 0.9957069009542465, + "num_tokens": 214112721.0, + "step": 1796 + }, + { + "entropy": 0.6427261680364609, + "epoch": 4.093527231251782, + "grad_norm": 0.578125, + "learning_rate": 3.4353521632341686e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.9929164722561836, + "num_tokens": 214231856.0, + "step": 1797 + }, + { + "entropy": 0.638318382203579, + "epoch": 4.095808383233533, + "grad_norm": 0.431640625, + "learning_rate": 3.4336052404610138e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.994861550629139, + "num_tokens": 214351452.0, + "step": 1798 + }, + { + "entropy": 0.6428381949663162, + "epoch": 4.0980895352152835, + "grad_norm": 0.40234375, + "learning_rate": 3.431857787791549e-06, + "loss": 0.0125, + "mean_token_accuracy": 0.9955770969390869, + "num_tokens": 214470853.0, + "step": 1799 + }, + { + "entropy": 0.6448817327618599, + "epoch": 4.100370687197034, + "grad_norm": 0.53125, + "learning_rate": 3.4301098062175936e-06, + "loss": 0.0193, + "mean_token_accuracy": 0.994481585919857, + "num_tokens": 214590681.0, + "step": 1800 + }, + { + "entropy": 0.6471136435866356, + "epoch": 4.102651839178785, + "grad_norm": 0.58203125, + "learning_rate": 3.4283612967312692e-06, + "loss": 0.0269, + "mean_token_accuracy": 0.9917333126068115, + "num_tokens": 214709345.0, + "step": 1801 + }, + { + "entropy": 0.6451018080115318, + "epoch": 4.104932991160536, + "grad_norm": 0.546875, + "learning_rate": 3.426612260324996e-06, + "loss": 0.0199, + "mean_token_accuracy": 0.9941064491868019, + "num_tokens": 214828384.0, + "step": 1802 + }, + { + "entropy": 0.6430217325687408, + "epoch": 4.107214143142287, + "grad_norm": 0.58984375, + "learning_rate": 3.424862697991491e-06, + "loss": 0.0206, + "mean_token_accuracy": 0.9920637831091881, + "num_tokens": 214947859.0, + "step": 1803 + }, + { + "entropy": 0.6423445641994476, + "epoch": 4.109495295124038, + "grad_norm": 0.44140625, + "learning_rate": 3.4231126107237754e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.9928935840725899, + "num_tokens": 215068927.0, + "step": 1804 + }, + { + "entropy": 0.6379528790712357, + "epoch": 4.111776447105789, + "grad_norm": 0.419921875, + "learning_rate": 3.4213619995151628e-06, + "loss": 0.0121, + "mean_token_accuracy": 0.9963446632027626, + "num_tokens": 215187976.0, + "step": 1805 + }, + { + "entropy": 0.6418527513742447, + "epoch": 4.114057599087539, + "grad_norm": 0.34765625, + "learning_rate": 3.4196108653592662e-06, + "loss": 0.0101, + "mean_token_accuracy": 0.9965877830982208, + "num_tokens": 215307607.0, + "step": 1806 + }, + { + "entropy": 0.6407713741064072, + "epoch": 4.11633875106929, + "grad_norm": 0.4765625, + "learning_rate": 3.417859209249997e-06, + "loss": 0.0101, + "mean_token_accuracy": 0.9960427507758141, + "num_tokens": 215426510.0, + "step": 1807 + }, + { + "entropy": 0.6421719714999199, + "epoch": 4.118619903051041, + "grad_norm": 0.6171875, + "learning_rate": 3.4161070321815605e-06, + "loss": 0.0217, + "mean_token_accuracy": 0.993411973118782, + "num_tokens": 215545627.0, + "step": 1808 + }, + { + "entropy": 0.6457884609699249, + "epoch": 4.120901055032792, + "grad_norm": 0.640625, + "learning_rate": 3.4143543351484585e-06, + "loss": 0.024, + "mean_token_accuracy": 0.9918258413672447, + "num_tokens": 215664837.0, + "step": 1809 + }, + { + "entropy": 0.6413973644375801, + "epoch": 4.123182207014542, + "grad_norm": 0.4921875, + "learning_rate": 3.4126011191454877e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.9954197630286217, + "num_tokens": 215783666.0, + "step": 1810 + }, + { + "entropy": 0.6362199410796165, + "epoch": 4.125463358996293, + "grad_norm": 0.60546875, + "learning_rate": 3.4108473851677408e-06, + "loss": 0.0217, + "mean_token_accuracy": 0.9927454963326454, + "num_tokens": 215902618.0, + "step": 1811 + }, + { + "entropy": 0.6400426402688026, + "epoch": 4.127744510978044, + "grad_norm": 0.5078125, + "learning_rate": 3.4090931342106024e-06, + "loss": 0.0165, + "mean_token_accuracy": 0.9948577433824539, + "num_tokens": 216022748.0, + "step": 1812 + }, + { + "entropy": 0.6464749351143837, + "epoch": 4.1300256629597945, + "grad_norm": 0.4296875, + "learning_rate": 3.4073383672697524e-06, + "loss": 0.0144, + "mean_token_accuracy": 0.9953896403312683, + "num_tokens": 216142336.0, + "step": 1813 + }, + { + "entropy": 0.6443880051374435, + "epoch": 4.132306814941545, + "grad_norm": 0.61328125, + "learning_rate": 3.4055830853411616e-06, + "loss": 0.019, + "mean_token_accuracy": 0.9931292086839676, + "num_tokens": 216261345.0, + "step": 1814 + }, + { + "entropy": 0.643412820994854, + "epoch": 4.134587966923296, + "grad_norm": 0.55859375, + "learning_rate": 3.4038272894210945e-06, + "loss": 0.0115, + "mean_token_accuracy": 0.9974248558282852, + "num_tokens": 216380241.0, + "step": 1815 + }, + { + "entropy": 0.6374855041503906, + "epoch": 4.136869118905047, + "grad_norm": 0.5, + "learning_rate": 3.4020709805061066e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9949627742171288, + "num_tokens": 216499963.0, + "step": 1816 + }, + { + "entropy": 0.6461105495691299, + "epoch": 4.139150270886798, + "grad_norm": 0.58203125, + "learning_rate": 3.4003141595930456e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9953344762325287, + "num_tokens": 216619153.0, + "step": 1817 + }, + { + "entropy": 0.6521079167723656, + "epoch": 4.141431422868549, + "grad_norm": 0.494140625, + "learning_rate": 3.3985568276790487e-06, + "loss": 0.0225, + "mean_token_accuracy": 0.9917076528072357, + "num_tokens": 216739432.0, + "step": 1818 + }, + { + "entropy": 0.6413451507687569, + "epoch": 4.1437125748503, + "grad_norm": 0.51953125, + "learning_rate": 3.3967989857615434e-06, + "loss": 0.0213, + "mean_token_accuracy": 0.992711141705513, + "num_tokens": 216858881.0, + "step": 1819 + }, + { + "entropy": 0.6427788510918617, + "epoch": 4.14599372683205, + "grad_norm": 0.609375, + "learning_rate": 3.3950406348382483e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9933005720376968, + "num_tokens": 216977705.0, + "step": 1820 + }, + { + "entropy": 0.6404507756233215, + "epoch": 4.148274878813801, + "grad_norm": 0.54296875, + "learning_rate": 3.3932817759071666e-06, + "loss": 0.0211, + "mean_token_accuracy": 0.9913292750716209, + "num_tokens": 217096908.0, + "step": 1821 + }, + { + "entropy": 0.6395808458328247, + "epoch": 4.150556030795552, + "grad_norm": 0.5390625, + "learning_rate": 3.3915224099665962e-06, + "loss": 0.0143, + "mean_token_accuracy": 0.9960099905729294, + "num_tokens": 217216673.0, + "step": 1822 + }, + { + "entropy": 0.6416726186871529, + "epoch": 4.1528371827773025, + "grad_norm": 0.458984375, + "learning_rate": 3.389762538015116e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.9946426898241043, + "num_tokens": 217335811.0, + "step": 1823 + }, + { + "entropy": 0.6416124328970909, + "epoch": 4.155118334759053, + "grad_norm": 0.6484375, + "learning_rate": 3.388002161051598e-06, + "loss": 0.0175, + "mean_token_accuracy": 0.9947148337960243, + "num_tokens": 217454888.0, + "step": 1824 + }, + { + "entropy": 0.6419819071888924, + "epoch": 4.157399486740804, + "grad_norm": 0.5390625, + "learning_rate": 3.3862412800751963e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.9942022487521172, + "num_tokens": 217573577.0, + "step": 1825 + }, + { + "entropy": 0.6398373693227768, + "epoch": 4.159680638722555, + "grad_norm": 0.50390625, + "learning_rate": 3.3844798960853533e-06, + "loss": 0.0266, + "mean_token_accuracy": 0.9932426363229752, + "num_tokens": 217692908.0, + "step": 1826 + }, + { + "entropy": 0.6421637386083603, + "epoch": 4.161961790704305, + "grad_norm": 0.58203125, + "learning_rate": 3.382718010081797e-06, + "loss": 0.021, + "mean_token_accuracy": 0.9942246526479721, + "num_tokens": 217812365.0, + "step": 1827 + }, + { + "entropy": 0.6434148326516151, + "epoch": 4.164242942686056, + "grad_norm": 0.59375, + "learning_rate": 3.38095562306454e-06, + "loss": 0.0218, + "mean_token_accuracy": 0.9933365359902382, + "num_tokens": 217931477.0, + "step": 1828 + }, + { + "entropy": 0.6475835144519806, + "epoch": 4.166524094667807, + "grad_norm": 0.494140625, + "learning_rate": 3.3791927360338785e-06, + "loss": 0.0155, + "mean_token_accuracy": 0.995684839785099, + "num_tokens": 218051175.0, + "step": 1829 + }, + { + "entropy": 0.6376415640115738, + "epoch": 4.168805246649558, + "grad_norm": 0.46484375, + "learning_rate": 3.3774293499903934e-06, + "loss": 0.0177, + "mean_token_accuracy": 0.9942801222205162, + "num_tokens": 218169941.0, + "step": 1830 + }, + { + "entropy": 0.6404718682169914, + "epoch": 4.171086398631309, + "grad_norm": 0.7890625, + "learning_rate": 3.3756654659349487e-06, + "loss": 0.0194, + "mean_token_accuracy": 0.9924803525209427, + "num_tokens": 218288852.0, + "step": 1831 + }, + { + "entropy": 0.6374930962920189, + "epoch": 4.17336755061306, + "grad_norm": 0.51953125, + "learning_rate": 3.373901084868691e-06, + "loss": 0.0171, + "mean_token_accuracy": 0.9948865696787834, + "num_tokens": 218408244.0, + "step": 1832 + }, + { + "entropy": 0.6456034481525421, + "epoch": 4.175648702594811, + "grad_norm": 0.43359375, + "learning_rate": 3.372136207793049e-06, + "loss": 0.0161, + "mean_token_accuracy": 0.9953016936779022, + "num_tokens": 218527482.0, + "step": 1833 + }, + { + "entropy": 0.6416222602128983, + "epoch": 4.177929854576561, + "grad_norm": 0.455078125, + "learning_rate": 3.3703708357097333e-06, + "loss": 0.0126, + "mean_token_accuracy": 0.9962608218193054, + "num_tokens": 218646675.0, + "step": 1834 + }, + { + "entropy": 0.6389907673001289, + "epoch": 4.180211006558312, + "grad_norm": 0.45703125, + "learning_rate": 3.3686049696207336e-06, + "loss": 0.0137, + "mean_token_accuracy": 0.9945767968893051, + "num_tokens": 218765896.0, + "step": 1835 + }, + { + "entropy": 0.640201210975647, + "epoch": 4.182492158540063, + "grad_norm": 0.4140625, + "learning_rate": 3.3668386105283226e-06, + "loss": 0.01, + "mean_token_accuracy": 0.9957859963178635, + "num_tokens": 218884959.0, + "step": 1836 + }, + { + "entropy": 0.64270830899477, + "epoch": 4.1847733105218134, + "grad_norm": 0.419921875, + "learning_rate": 3.365071759435051e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.9941525012254715, + "num_tokens": 219004357.0, + "step": 1837 + }, + { + "entropy": 0.6374094560742378, + "epoch": 4.187054462503564, + "grad_norm": 0.59375, + "learning_rate": 3.363304417343749e-06, + "loss": 0.0222, + "mean_token_accuracy": 0.9928470030426979, + "num_tokens": 219124027.0, + "step": 1838 + }, + { + "entropy": 0.643007904291153, + "epoch": 4.189335614485315, + "grad_norm": 0.62890625, + "learning_rate": 3.3615365852575276e-06, + "loss": 0.019, + "mean_token_accuracy": 0.9935889542102814, + "num_tokens": 219244486.0, + "step": 1839 + }, + { + "entropy": 0.6473117396235466, + "epoch": 4.191616766467066, + "grad_norm": 0.55078125, + "learning_rate": 3.359768264179772e-06, + "loss": 0.0166, + "mean_token_accuracy": 0.994306318461895, + "num_tokens": 219364285.0, + "step": 1840 + }, + { + "entropy": 0.6428944990038872, + "epoch": 4.193897918448816, + "grad_norm": 0.421875, + "learning_rate": 3.357999455114148e-06, + "loss": 0.0116, + "mean_token_accuracy": 0.9956967905163765, + "num_tokens": 219483476.0, + "step": 1841 + }, + { + "entropy": 0.6382761225104332, + "epoch": 4.196179070430567, + "grad_norm": 0.390625, + "learning_rate": 3.356230159064599e-06, + "loss": 0.0121, + "mean_token_accuracy": 0.9961008727550507, + "num_tokens": 219602601.0, + "step": 1842 + }, + { + "entropy": 0.6391697824001312, + "epoch": 4.198460222412319, + "grad_norm": 0.466796875, + "learning_rate": 3.3544603770353407e-06, + "loss": 0.014, + "mean_token_accuracy": 0.9953900650143623, + "num_tokens": 219722243.0, + "step": 1843 + }, + { + "entropy": 0.6406760662794113, + "epoch": 4.200741374394069, + "grad_norm": 0.494140625, + "learning_rate": 3.352690110030869e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9950715079903603, + "num_tokens": 219841980.0, + "step": 1844 + }, + { + "entropy": 0.6439218148589134, + "epoch": 4.20302252637582, + "grad_norm": 0.609375, + "learning_rate": 3.350919359055953e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.994478702545166, + "num_tokens": 219961516.0, + "step": 1845 + }, + { + "entropy": 0.6409249752759933, + "epoch": 4.205303678357571, + "grad_norm": 0.486328125, + "learning_rate": 3.3491481251156355e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.9952225312590599, + "num_tokens": 220081230.0, + "step": 1846 + }, + { + "entropy": 0.6457728147506714, + "epoch": 4.2075848303393215, + "grad_norm": 0.427734375, + "learning_rate": 3.347376409215236e-06, + "loss": 0.0153, + "mean_token_accuracy": 0.9948369711637497, + "num_tokens": 220200911.0, + "step": 1847 + }, + { + "entropy": 0.6422413438558578, + "epoch": 4.209865982321072, + "grad_norm": 0.66015625, + "learning_rate": 3.345604212360346e-06, + "loss": 0.0266, + "mean_token_accuracy": 0.9928175806999207, + "num_tokens": 220319841.0, + "step": 1848 + }, + { + "entropy": 0.6416943520307541, + "epoch": 4.212147134302823, + "grad_norm": 0.451171875, + "learning_rate": 3.3438315355568295e-06, + "loss": 0.016, + "mean_token_accuracy": 0.9951535388827324, + "num_tokens": 220438685.0, + "step": 1849 + }, + { + "entropy": 0.6425519958138466, + "epoch": 4.214428286284574, + "grad_norm": 0.451171875, + "learning_rate": 3.3420583798108253e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.9944318234920502, + "num_tokens": 220557718.0, + "step": 1850 + }, + { + "entropy": 0.6411484554409981, + "epoch": 4.216709438266324, + "grad_norm": 0.4453125, + "learning_rate": 3.34028474612874e-06, + "loss": 0.0165, + "mean_token_accuracy": 0.9943792447447777, + "num_tokens": 220676852.0, + "step": 1851 + }, + { + "entropy": 0.6405615881085396, + "epoch": 4.218990590248075, + "grad_norm": 0.59375, + "learning_rate": 3.338510635517256e-06, + "loss": 0.0243, + "mean_token_accuracy": 0.9934446662664413, + "num_tokens": 220796547.0, + "step": 1852 + }, + { + "entropy": 0.6449724063277245, + "epoch": 4.221271742229826, + "grad_norm": 0.466796875, + "learning_rate": 3.3367360489833236e-06, + "loss": 0.0148, + "mean_token_accuracy": 0.9959066137671471, + "num_tokens": 220916792.0, + "step": 1853 + }, + { + "entropy": 0.6381378397345543, + "epoch": 4.2235528942115765, + "grad_norm": 0.462890625, + "learning_rate": 3.3349609875341626e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.995077982544899, + "num_tokens": 221036259.0, + "step": 1854 + }, + { + "entropy": 0.6364856958389282, + "epoch": 4.225834046193327, + "grad_norm": 0.5234375, + "learning_rate": 3.3331854521772656e-06, + "loss": 0.0173, + "mean_token_accuracy": 0.9947091639041901, + "num_tokens": 221155336.0, + "step": 1855 + }, + { + "entropy": 0.6419543251395226, + "epoch": 4.228115198175079, + "grad_norm": 0.52734375, + "learning_rate": 3.3314094439203903e-06, + "loss": 0.0189, + "mean_token_accuracy": 0.9940764456987381, + "num_tokens": 221274333.0, + "step": 1856 + }, + { + "entropy": 0.6369626522064209, + "epoch": 4.23039635015683, + "grad_norm": 0.5625, + "learning_rate": 3.3296329637715662e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.9957297071814537, + "num_tokens": 221392983.0, + "step": 1857 + }, + { + "entropy": 0.6385047137737274, + "epoch": 4.23267750213858, + "grad_norm": 0.5234375, + "learning_rate": 3.3278560127390892e-06, + "loss": 0.0147, + "mean_token_accuracy": 0.9956211000680923, + "num_tokens": 221512451.0, + "step": 1858 + }, + { + "entropy": 0.6393530890345573, + "epoch": 4.234958654120331, + "grad_norm": 0.45703125, + "learning_rate": 3.32607859183152e-06, + "loss": 0.0113, + "mean_token_accuracy": 0.9953261986374855, + "num_tokens": 221630887.0, + "step": 1859 + }, + { + "entropy": 0.6344076991081238, + "epoch": 4.237239806102082, + "grad_norm": 0.60546875, + "learning_rate": 3.3243007020576917e-06, + "loss": 0.0197, + "mean_token_accuracy": 0.9938429072499275, + "num_tokens": 221750035.0, + "step": 1860 + }, + { + "entropy": 0.6424268037080765, + "epoch": 4.2395209580838324, + "grad_norm": 0.455078125, + "learning_rate": 3.322522344426698e-06, + "loss": 0.0111, + "mean_token_accuracy": 0.9964053779840469, + "num_tokens": 221868892.0, + "step": 1861 + }, + { + "entropy": 0.6402264833450317, + "epoch": 4.241802110065583, + "grad_norm": 0.484375, + "learning_rate": 3.320743519947901e-06, + "loss": 0.0181, + "mean_token_accuracy": 0.9948954731225967, + "num_tokens": 221988456.0, + "step": 1862 + }, + { + "entropy": 0.6403216868638992, + "epoch": 4.244083262047334, + "grad_norm": 0.6015625, + "learning_rate": 3.318964229630927e-06, + "loss": 0.0248, + "mean_token_accuracy": 0.9930480718612671, + "num_tokens": 222107567.0, + "step": 1863 + }, + { + "entropy": 0.6397815570235252, + "epoch": 4.246364414029085, + "grad_norm": 0.578125, + "learning_rate": 3.3171844744856675e-06, + "loss": 0.0165, + "mean_token_accuracy": 0.9958015605807304, + "num_tokens": 222226640.0, + "step": 1864 + }, + { + "entropy": 0.6428464725613594, + "epoch": 4.248645566010835, + "grad_norm": 0.6015625, + "learning_rate": 3.3154042555222758e-06, + "loss": 0.0216, + "mean_token_accuracy": 0.9932858496904373, + "num_tokens": 222346366.0, + "step": 1865 + }, + { + "entropy": 0.642001561820507, + "epoch": 4.250926717992586, + "grad_norm": 0.64453125, + "learning_rate": 3.3136235737511715e-06, + "loss": 0.029, + "mean_token_accuracy": 0.9923763498663902, + "num_tokens": 222465234.0, + "step": 1866 + }, + { + "entropy": 0.6360239386558533, + "epoch": 4.253207869974337, + "grad_norm": 0.435546875, + "learning_rate": 3.3118424301830343e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.9953946024179459, + "num_tokens": 222584422.0, + "step": 1867 + }, + { + "entropy": 0.6420237943530083, + "epoch": 4.2554890219560875, + "grad_norm": 0.625, + "learning_rate": 3.310060825828807e-06, + "loss": 0.0248, + "mean_token_accuracy": 0.9927022308111191, + "num_tokens": 222703474.0, + "step": 1868 + }, + { + "entropy": 0.6447739601135254, + "epoch": 4.257770173937839, + "grad_norm": 0.5234375, + "learning_rate": 3.3082787616996938e-06, + "loss": 0.0184, + "mean_token_accuracy": 0.9934853091835976, + "num_tokens": 222823530.0, + "step": 1869 + }, + { + "entropy": 0.6382445991039276, + "epoch": 4.26005132591959, + "grad_norm": 0.609375, + "learning_rate": 3.3064962388071586e-06, + "loss": 0.0203, + "mean_token_accuracy": 0.9936426654458046, + "num_tokens": 222942844.0, + "step": 1870 + }, + { + "entropy": 0.6413829103112221, + "epoch": 4.2623324779013405, + "grad_norm": 0.470703125, + "learning_rate": 3.3047132581629297e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.9967142641544342, + "num_tokens": 223062864.0, + "step": 1871 + }, + { + "entropy": 0.6449387893080711, + "epoch": 4.264613629883091, + "grad_norm": 0.44140625, + "learning_rate": 3.3029298207789907e-06, + "loss": 0.0179, + "mean_token_accuracy": 0.9947159141302109, + "num_tokens": 223183076.0, + "step": 1872 + }, + { + "entropy": 0.6428952738642693, + "epoch": 4.266894781864842, + "grad_norm": 0.474609375, + "learning_rate": 3.301145927667586e-06, + "loss": 0.0172, + "mean_token_accuracy": 0.9944512993097305, + "num_tokens": 223302533.0, + "step": 1873 + }, + { + "entropy": 0.6376186534762383, + "epoch": 4.269175933846593, + "grad_norm": 0.412109375, + "learning_rate": 3.2993615798412204e-06, + "loss": 0.0123, + "mean_token_accuracy": 0.9964757561683655, + "num_tokens": 223421803.0, + "step": 1874 + }, + { + "entropy": 0.6422068104147911, + "epoch": 4.271457085828343, + "grad_norm": 0.455078125, + "learning_rate": 3.297576778312654e-06, + "loss": 0.0184, + "mean_token_accuracy": 0.9931000992655754, + "num_tokens": 223541309.0, + "step": 1875 + }, + { + "entropy": 0.647542804479599, + "epoch": 4.273738237810094, + "grad_norm": 0.6171875, + "learning_rate": 3.295791524094906e-06, + "loss": 0.0226, + "mean_token_accuracy": 0.9947569891810417, + "num_tokens": 223660994.0, + "step": 1876 + }, + { + "entropy": 0.6400193870067596, + "epoch": 4.276019389791845, + "grad_norm": 0.466796875, + "learning_rate": 3.294005818201252e-06, + "loss": 0.0128, + "mean_token_accuracy": 0.9957887902855873, + "num_tokens": 223779917.0, + "step": 1877 + }, + { + "entropy": 0.6337326914072037, + "epoch": 4.2783005417735955, + "grad_norm": 0.48046875, + "learning_rate": 3.2922196616452253e-06, + "loss": 0.02, + "mean_token_accuracy": 0.9938401281833649, + "num_tokens": 223898957.0, + "step": 1878 + }, + { + "entropy": 0.6403861194849014, + "epoch": 4.280581693755346, + "grad_norm": 0.486328125, + "learning_rate": 3.2904330554406126e-06, + "loss": 0.0199, + "mean_token_accuracy": 0.9949310347437859, + "num_tokens": 224018655.0, + "step": 1879 + }, + { + "entropy": 0.6404683440923691, + "epoch": 4.282862845737097, + "grad_norm": 0.4375, + "learning_rate": 3.288646000601457e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.9957966208457947, + "num_tokens": 224138136.0, + "step": 1880 + }, + { + "entropy": 0.6399213746190071, + "epoch": 4.285143997718848, + "grad_norm": 0.51171875, + "learning_rate": 3.286858498142057e-06, + "loss": 0.0158, + "mean_token_accuracy": 0.9932239726185799, + "num_tokens": 224257529.0, + "step": 1881 + }, + { + "entropy": 0.6427848190069199, + "epoch": 4.287425149700598, + "grad_norm": 0.53515625, + "learning_rate": 3.285070549076965e-06, + "loss": 0.0198, + "mean_token_accuracy": 0.9937142804265022, + "num_tokens": 224376912.0, + "step": 1882 + }, + { + "entropy": 0.6406867280602455, + "epoch": 4.289706301682349, + "grad_norm": 0.4609375, + "learning_rate": 3.283282154420985e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.993935763835907, + "num_tokens": 224496570.0, + "step": 1883 + }, + { + "entropy": 0.636949211359024, + "epoch": 4.291987453664101, + "grad_norm": 0.54296875, + "learning_rate": 3.2814933151891766e-06, + "loss": 0.0201, + "mean_token_accuracy": 0.9933280348777771, + "num_tokens": 224615821.0, + "step": 1884 + }, + { + "entropy": 0.6424317061901093, + "epoch": 4.2942686056458514, + "grad_norm": 0.76953125, + "learning_rate": 3.2797040323968493e-06, + "loss": 0.0265, + "mean_token_accuracy": 0.9907650724053383, + "num_tokens": 224735238.0, + "step": 1885 + }, + { + "entropy": 0.6430623978376389, + "epoch": 4.296549757627602, + "grad_norm": 0.58984375, + "learning_rate": 3.277914307059566e-06, + "loss": 0.0147, + "mean_token_accuracy": 0.9950076788663864, + "num_tokens": 224854753.0, + "step": 1886 + }, + { + "entropy": 0.6370247453451157, + "epoch": 4.298830909609353, + "grad_norm": 0.376953125, + "learning_rate": 3.276124140193141e-06, + "loss": 0.0145, + "mean_token_accuracy": 0.9950675591826439, + "num_tokens": 224974972.0, + "step": 1887 + }, + { + "entropy": 0.6412686482071877, + "epoch": 4.301112061591104, + "grad_norm": 0.51953125, + "learning_rate": 3.274333532813637e-06, + "loss": 0.0203, + "mean_token_accuracy": 0.9930486977100372, + "num_tokens": 225094180.0, + "step": 1888 + }, + { + "entropy": 0.6393831819295883, + "epoch": 4.303393213572854, + "grad_norm": 0.4140625, + "learning_rate": 3.272542485937369e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.9941622242331505, + "num_tokens": 225213567.0, + "step": 1889 + }, + { + "entropy": 0.6498856022953987, + "epoch": 4.305674365554605, + "grad_norm": 0.51953125, + "learning_rate": 3.2707510005809005e-06, + "loss": 0.0172, + "mean_token_accuracy": 0.9950681105256081, + "num_tokens": 225333611.0, + "step": 1890 + }, + { + "entropy": 0.6482260301709175, + "epoch": 4.307955517536356, + "grad_norm": 0.5703125, + "learning_rate": 3.2689590777610443e-06, + "loss": 0.0166, + "mean_token_accuracy": 0.9927352741360664, + "num_tokens": 225453167.0, + "step": 1891 + }, + { + "entropy": 0.6431052088737488, + "epoch": 4.3102366695181065, + "grad_norm": 0.52734375, + "learning_rate": 3.267166718494861e-06, + "loss": 0.0211, + "mean_token_accuracy": 0.9917781203985214, + "num_tokens": 225572112.0, + "step": 1892 + }, + { + "entropy": 0.6444010734558105, + "epoch": 4.312517821499857, + "grad_norm": 0.40234375, + "learning_rate": 3.265373923799658e-06, + "loss": 0.0113, + "mean_token_accuracy": 0.9966996908187866, + "num_tokens": 225691798.0, + "step": 1893 + }, + { + "entropy": 0.643227256834507, + "epoch": 4.314798973481608, + "grad_norm": 0.53515625, + "learning_rate": 3.263580694692992e-06, + "loss": 0.0178, + "mean_token_accuracy": 0.9944680333137512, + "num_tokens": 225811682.0, + "step": 1894 + }, + { + "entropy": 0.6419089958071709, + "epoch": 4.317080125463359, + "grad_norm": 0.515625, + "learning_rate": 3.261787032192666e-06, + "loss": 0.0202, + "mean_token_accuracy": 0.9940001964569092, + "num_tokens": 225930476.0, + "step": 1895 + }, + { + "entropy": 0.6377283781766891, + "epoch": 4.319361277445109, + "grad_norm": 0.53515625, + "learning_rate": 3.259992937316727e-06, + "loss": 0.0194, + "mean_token_accuracy": 0.9937692731618881, + "num_tokens": 226048672.0, + "step": 1896 + }, + { + "entropy": 0.6436439529061317, + "epoch": 4.321642429426861, + "grad_norm": 0.44921875, + "learning_rate": 3.258198411083469e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9951334372162819, + "num_tokens": 226167355.0, + "step": 1897 + }, + { + "entropy": 0.6431718915700912, + "epoch": 4.323923581408612, + "grad_norm": 0.458984375, + "learning_rate": 3.2564034545114308e-06, + "loss": 0.0127, + "mean_token_accuracy": 0.9953879714012146, + "num_tokens": 226287221.0, + "step": 1898 + }, + { + "entropy": 0.6460432037711143, + "epoch": 4.326204733390362, + "grad_norm": 0.431640625, + "learning_rate": 3.2546080686193947e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.9958002492785454, + "num_tokens": 226406837.0, + "step": 1899 + }, + { + "entropy": 0.6407450735569, + "epoch": 4.328485885372113, + "grad_norm": 0.5078125, + "learning_rate": 3.2528122544263873e-06, + "loss": 0.0135, + "mean_token_accuracy": 0.9955217465758324, + "num_tokens": 226525973.0, + "step": 1900 + }, + { + "entropy": 0.6384291723370552, + "epoch": 4.330767037353864, + "grad_norm": 0.49609375, + "learning_rate": 3.251016012951678e-06, + "loss": 0.0123, + "mean_token_accuracy": 0.9958822503685951, + "num_tokens": 226645382.0, + "step": 1901 + }, + { + "entropy": 0.6401782259345055, + "epoch": 4.3330481893356145, + "grad_norm": 0.45703125, + "learning_rate": 3.2492193452147774e-06, + "loss": 0.016, + "mean_token_accuracy": 0.9949276894330978, + "num_tokens": 226764097.0, + "step": 1902 + }, + { + "entropy": 0.6432746052742004, + "epoch": 4.335329341317365, + "grad_norm": 0.515625, + "learning_rate": 3.247422252235442e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9952229112386703, + "num_tokens": 226883630.0, + "step": 1903 + }, + { + "entropy": 0.6391656696796417, + "epoch": 4.337610493299116, + "grad_norm": 0.52734375, + "learning_rate": 3.245624735033665e-06, + "loss": 0.0163, + "mean_token_accuracy": 0.9947796612977982, + "num_tokens": 227003023.0, + "step": 1904 + }, + { + "entropy": 0.6418261751532555, + "epoch": 4.339891645280867, + "grad_norm": 0.55078125, + "learning_rate": 3.2438267946296836e-06, + "loss": 0.0181, + "mean_token_accuracy": 0.9939739406108856, + "num_tokens": 227122424.0, + "step": 1905 + }, + { + "entropy": 0.6397288963198662, + "epoch": 4.342172797262617, + "grad_norm": 0.41796875, + "learning_rate": 3.242028432043974e-06, + "loss": 0.0135, + "mean_token_accuracy": 0.9954008013010025, + "num_tokens": 227242575.0, + "step": 1906 + }, + { + "entropy": 0.6469406336545944, + "epoch": 4.344453949244368, + "grad_norm": 0.5390625, + "learning_rate": 3.2402296482972513e-06, + "loss": 0.0149, + "mean_token_accuracy": 0.9950236976146698, + "num_tokens": 227362676.0, + "step": 1907 + }, + { + "entropy": 0.6419104561209679, + "epoch": 4.346735101226119, + "grad_norm": 0.54296875, + "learning_rate": 3.238430444410471e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9950080290436745, + "num_tokens": 227482604.0, + "step": 1908 + }, + { + "entropy": 0.6440920382738113, + "epoch": 4.3490162532078696, + "grad_norm": 0.45703125, + "learning_rate": 3.2366308214048262e-06, + "loss": 0.0161, + "mean_token_accuracy": 0.994262233376503, + "num_tokens": 227601832.0, + "step": 1909 + }, + { + "entropy": 0.6346658691763878, + "epoch": 4.351297405189621, + "grad_norm": 0.6171875, + "learning_rate": 3.2348307803017493e-06, + "loss": 0.021, + "mean_token_accuracy": 0.9935380592942238, + "num_tokens": 227721032.0, + "step": 1910 + }, + { + "entropy": 0.6388025134801865, + "epoch": 4.353578557171372, + "grad_norm": 0.44140625, + "learning_rate": 3.2330303221229078e-06, + "loss": 0.0157, + "mean_token_accuracy": 0.994829073548317, + "num_tokens": 227840076.0, + "step": 1911 + }, + { + "entropy": 0.643476776778698, + "epoch": 4.355859709153123, + "grad_norm": 0.365234375, + "learning_rate": 3.231229447890206e-06, + "loss": 0.0127, + "mean_token_accuracy": 0.9969058260321617, + "num_tokens": 227959359.0, + "step": 1912 + }, + { + "entropy": 0.6404720544815063, + "epoch": 4.358140861134873, + "grad_norm": 0.53125, + "learning_rate": 3.229428158625787e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.9947412014007568, + "num_tokens": 228079032.0, + "step": 1913 + }, + { + "entropy": 0.6400147825479507, + "epoch": 4.360422013116624, + "grad_norm": 0.482421875, + "learning_rate": 3.2276264553520275e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.9941817000508308, + "num_tokens": 228197799.0, + "step": 1914 + }, + { + "entropy": 0.6381105184555054, + "epoch": 4.362703165098375, + "grad_norm": 0.796875, + "learning_rate": 3.2258243390915397e-06, + "loss": 0.0315, + "mean_token_accuracy": 0.992643691599369, + "num_tokens": 228317582.0, + "step": 1915 + }, + { + "entropy": 0.6436234638094902, + "epoch": 4.3649843170801255, + "grad_norm": 0.57421875, + "learning_rate": 3.2240218108671683e-06, + "loss": 0.0149, + "mean_token_accuracy": 0.9957699477672577, + "num_tokens": 228436806.0, + "step": 1916 + }, + { + "entropy": 0.6401450037956238, + "epoch": 4.367265469061876, + "grad_norm": 0.5390625, + "learning_rate": 3.2222188717019965e-06, + "loss": 0.0195, + "mean_token_accuracy": 0.9929714947938919, + "num_tokens": 228555808.0, + "step": 1917 + }, + { + "entropy": 0.6425024271011353, + "epoch": 4.369546621043627, + "grad_norm": 0.6328125, + "learning_rate": 3.220415522619335e-06, + "loss": 0.0225, + "mean_token_accuracy": 0.9943019896745682, + "num_tokens": 228675394.0, + "step": 1918 + }, + { + "entropy": 0.6352293714880943, + "epoch": 4.371827773025378, + "grad_norm": 0.58203125, + "learning_rate": 3.218611764642732e-06, + "loss": 0.0224, + "mean_token_accuracy": 0.994585707783699, + "num_tokens": 228794787.0, + "step": 1919 + }, + { + "entropy": 0.6376413553953171, + "epoch": 4.374108925007128, + "grad_norm": 0.484375, + "learning_rate": 3.2168075987959633e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.9950112998485565, + "num_tokens": 228913712.0, + "step": 1920 + }, + { + "entropy": 0.6372517943382263, + "epoch": 4.376390076988879, + "grad_norm": 0.5703125, + "learning_rate": 3.2150030261030414e-06, + "loss": 0.0207, + "mean_token_accuracy": 0.9927583262324333, + "num_tokens": 229032881.0, + "step": 1921 + }, + { + "entropy": 0.6423334330320358, + "epoch": 4.37867122897063, + "grad_norm": 0.373046875, + "learning_rate": 3.2131980475882053e-06, + "loss": 0.0082, + "mean_token_accuracy": 0.9973317235708237, + "num_tokens": 229152130.0, + "step": 1922 + }, + { + "entropy": 0.6445911973714828, + "epoch": 4.380952380952381, + "grad_norm": 0.609375, + "learning_rate": 3.2113926642759256e-06, + "loss": 0.0268, + "mean_token_accuracy": 0.9924745634198189, + "num_tokens": 229270648.0, + "step": 1923 + }, + { + "entropy": 0.6393702477216721, + "epoch": 4.383233532934132, + "grad_norm": 0.60546875, + "learning_rate": 3.2095868771909037e-06, + "loss": 0.0184, + "mean_token_accuracy": 0.9941354095935822, + "num_tokens": 229389641.0, + "step": 1924 + }, + { + "entropy": 0.6469630971550941, + "epoch": 4.385514684915883, + "grad_norm": 0.435546875, + "learning_rate": 3.2077806873580696e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.995250403881073, + "num_tokens": 229508828.0, + "step": 1925 + }, + { + "entropy": 0.6408749148249626, + "epoch": 4.3877958368976335, + "grad_norm": 0.423828125, + "learning_rate": 3.205974095802582e-06, + "loss": 0.0124, + "mean_token_accuracy": 0.995446152985096, + "num_tokens": 229627378.0, + "step": 1926 + }, + { + "entropy": 0.6369495391845703, + "epoch": 4.390076988879384, + "grad_norm": 0.490234375, + "learning_rate": 3.204167103549827e-06, + "loss": 0.0137, + "mean_token_accuracy": 0.9952741339802742, + "num_tokens": 229746625.0, + "step": 1927 + }, + { + "entropy": 0.6444288417696953, + "epoch": 4.392358140861135, + "grad_norm": 0.5078125, + "learning_rate": 3.2023597116254175e-06, + "loss": 0.0163, + "mean_token_accuracy": 0.9944415614008904, + "num_tokens": 229865540.0, + "step": 1928 + }, + { + "entropy": 0.6399930641055107, + "epoch": 4.394639292842886, + "grad_norm": 0.5234375, + "learning_rate": 3.2005519210551955e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.99361751973629, + "num_tokens": 229984693.0, + "step": 1929 + }, + { + "entropy": 0.6406848505139351, + "epoch": 4.396920444824636, + "grad_norm": 0.52734375, + "learning_rate": 3.1987437328652287e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.9937151670455933, + "num_tokens": 230104062.0, + "step": 1930 + }, + { + "entropy": 0.6487103030085564, + "epoch": 4.399201596806387, + "grad_norm": 0.546875, + "learning_rate": 3.196935148081808e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.9927753731608391, + "num_tokens": 230223656.0, + "step": 1931 + }, + { + "entropy": 0.6405986323952675, + "epoch": 4.401482748788138, + "grad_norm": 0.4140625, + "learning_rate": 3.1951261677314526e-06, + "loss": 0.0132, + "mean_token_accuracy": 0.9956740513443947, + "num_tokens": 230343535.0, + "step": 1932 + }, + { + "entropy": 0.6449386328458786, + "epoch": 4.4037639007698886, + "grad_norm": 0.5078125, + "learning_rate": 3.1933167928409046e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.9935710355639458, + "num_tokens": 230463516.0, + "step": 1933 + }, + { + "entropy": 0.6429858207702637, + "epoch": 4.406045052751639, + "grad_norm": 0.361328125, + "learning_rate": 3.1915070244371295e-06, + "loss": 0.0124, + "mean_token_accuracy": 0.996049553155899, + "num_tokens": 230583348.0, + "step": 1934 + }, + { + "entropy": 0.6439896300435066, + "epoch": 4.40832620473339, + "grad_norm": 0.51171875, + "learning_rate": 3.1896968635473174e-06, + "loss": 0.0199, + "mean_token_accuracy": 0.9938567653298378, + "num_tokens": 230703059.0, + "step": 1935 + }, + { + "entropy": 0.6472795829176903, + "epoch": 4.410607356715142, + "grad_norm": 0.58203125, + "learning_rate": 3.187886311198881e-06, + "loss": 0.0173, + "mean_token_accuracy": 0.9930974766612053, + "num_tokens": 230822730.0, + "step": 1936 + }, + { + "entropy": 0.6384473666548729, + "epoch": 4.412888508696892, + "grad_norm": 0.5390625, + "learning_rate": 3.1860753684194536e-06, + "loss": 0.019, + "mean_token_accuracy": 0.9936178550124168, + "num_tokens": 230942435.0, + "step": 1937 + }, + { + "entropy": 0.644935816526413, + "epoch": 4.415169660678643, + "grad_norm": 0.5703125, + "learning_rate": 3.1842640362368932e-06, + "loss": 0.0219, + "mean_token_accuracy": 0.9934541136026382, + "num_tokens": 231062570.0, + "step": 1938 + }, + { + "entropy": 0.6387219503521919, + "epoch": 4.417450812660394, + "grad_norm": 0.45703125, + "learning_rate": 3.182452315679276e-06, + "loss": 0.0159, + "mean_token_accuracy": 0.9939060807228088, + "num_tokens": 231182559.0, + "step": 1939 + }, + { + "entropy": 0.6462222933769226, + "epoch": 4.4197319646421445, + "grad_norm": 0.53125, + "learning_rate": 3.1806402077748987e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9948486760258675, + "num_tokens": 231302171.0, + "step": 1940 + }, + { + "entropy": 0.6469761207699776, + "epoch": 4.422013116623895, + "grad_norm": 0.41796875, + "learning_rate": 3.178827713552281e-06, + "loss": 0.0124, + "mean_token_accuracy": 0.9956322610378265, + "num_tokens": 231421663.0, + "step": 1941 + }, + { + "entropy": 0.6378902345895767, + "epoch": 4.424294268605646, + "grad_norm": 0.51953125, + "learning_rate": 3.177014834040158e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.9941515922546387, + "num_tokens": 231541078.0, + "step": 1942 + }, + { + "entropy": 0.639821782708168, + "epoch": 4.426575420587397, + "grad_norm": 0.49609375, + "learning_rate": 3.1752015702674855e-06, + "loss": 0.0205, + "mean_token_accuracy": 0.9948368221521378, + "num_tokens": 231660278.0, + "step": 1943 + }, + { + "entropy": 0.6430630013346672, + "epoch": 4.428856572569147, + "grad_norm": 0.447265625, + "learning_rate": 3.173387923263437e-06, + "loss": 0.0168, + "mean_token_accuracy": 0.9950626119971275, + "num_tokens": 231779622.0, + "step": 1944 + }, + { + "entropy": 0.6441057473421097, + "epoch": 4.431137724550898, + "grad_norm": 0.5703125, + "learning_rate": 3.1715738940574032e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9942259341478348, + "num_tokens": 231899142.0, + "step": 1945 + }, + { + "entropy": 0.6411804035305977, + "epoch": 4.433418876532649, + "grad_norm": 0.486328125, + "learning_rate": 3.1697594836789924e-06, + "loss": 0.0169, + "mean_token_accuracy": 0.9951049834489822, + "num_tokens": 232018846.0, + "step": 1946 + }, + { + "entropy": 0.63901586830616, + "epoch": 4.4357000285143995, + "grad_norm": 0.49609375, + "learning_rate": 3.167944693158029e-06, + "loss": 0.0157, + "mean_token_accuracy": 0.9956228137016296, + "num_tokens": 232137820.0, + "step": 1947 + }, + { + "entropy": 0.6390811875462532, + "epoch": 4.43798118049615, + "grad_norm": 0.447265625, + "learning_rate": 3.166129523524553e-06, + "loss": 0.0124, + "mean_token_accuracy": 0.9958387315273285, + "num_tokens": 232256884.0, + "step": 1948 + }, + { + "entropy": 0.6428522765636444, + "epoch": 4.440262332477902, + "grad_norm": 0.53515625, + "learning_rate": 3.1643139758088194e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.9939460605382919, + "num_tokens": 232376791.0, + "step": 1949 + }, + { + "entropy": 0.641300767660141, + "epoch": 4.4425434844596525, + "grad_norm": 0.578125, + "learning_rate": 3.1624980510412984e-06, + "loss": 0.0142, + "mean_token_accuracy": 0.9940467700362206, + "num_tokens": 232496134.0, + "step": 1950 + }, + { + "entropy": 0.6417373344302177, + "epoch": 4.444824636441403, + "grad_norm": 0.56640625, + "learning_rate": 3.160681750252674e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9944719448685646, + "num_tokens": 232615174.0, + "step": 1951 + }, + { + "entropy": 0.6408861428499222, + "epoch": 4.447105788423154, + "grad_norm": 0.42578125, + "learning_rate": 3.1588650744738418e-06, + "loss": 0.0148, + "mean_token_accuracy": 0.9947429075837135, + "num_tokens": 232734562.0, + "step": 1952 + }, + { + "entropy": 0.637002095580101, + "epoch": 4.449386940404905, + "grad_norm": 0.484375, + "learning_rate": 3.1570480247359147e-06, + "loss": 0.018, + "mean_token_accuracy": 0.9936731234192848, + "num_tokens": 232854055.0, + "step": 1953 + }, + { + "entropy": 0.6422733664512634, + "epoch": 4.451668092386655, + "grad_norm": 0.671875, + "learning_rate": 3.155230602070213e-06, + "loss": 0.0205, + "mean_token_accuracy": 0.9940493851900101, + "num_tokens": 232972912.0, + "step": 1954 + }, + { + "entropy": 0.6343877837061882, + "epoch": 4.453949244368406, + "grad_norm": 0.40625, + "learning_rate": 3.153412807508271e-06, + "loss": 0.0131, + "mean_token_accuracy": 0.9958498701453209, + "num_tokens": 233091013.0, + "step": 1955 + }, + { + "entropy": 0.6441372036933899, + "epoch": 4.456230396350157, + "grad_norm": 0.51171875, + "learning_rate": 3.1515946420818343e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9940150678157806, + "num_tokens": 233210407.0, + "step": 1956 + }, + { + "entropy": 0.6382331550121307, + "epoch": 4.4585115483319075, + "grad_norm": 0.474609375, + "learning_rate": 3.1497761068228585e-06, + "loss": 0.0115, + "mean_token_accuracy": 0.9957957416772842, + "num_tokens": 233330036.0, + "step": 1957 + }, + { + "entropy": 0.6421244516968727, + "epoch": 4.460792700313658, + "grad_norm": 0.49609375, + "learning_rate": 3.1479572027635085e-06, + "loss": 0.0153, + "mean_token_accuracy": 0.9942096918821335, + "num_tokens": 233449089.0, + "step": 1958 + }, + { + "entropy": 0.6441273540258408, + "epoch": 4.463073852295409, + "grad_norm": 0.609375, + "learning_rate": 3.1461379309361594e-06, + "loss": 0.0204, + "mean_token_accuracy": 0.9962288215756416, + "num_tokens": 233568512.0, + "step": 1959 + }, + { + "entropy": 0.6363359838724136, + "epoch": 4.46535500427716, + "grad_norm": 0.51171875, + "learning_rate": 3.144318292373395e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9935462549328804, + "num_tokens": 233687919.0, + "step": 1960 + }, + { + "entropy": 0.636901468038559, + "epoch": 4.46763615625891, + "grad_norm": 0.56640625, + "learning_rate": 3.142498288108007e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.9946739599108696, + "num_tokens": 233807722.0, + "step": 1961 + }, + { + "entropy": 0.6323037222027779, + "epoch": 4.469917308240661, + "grad_norm": 0.3671875, + "learning_rate": 3.1406779191729954e-06, + "loss": 0.012, + "mean_token_accuracy": 0.9969158694148064, + "num_tokens": 233926671.0, + "step": 1962 + }, + { + "entropy": 0.6416372731328011, + "epoch": 4.472198460222412, + "grad_norm": 0.6640625, + "learning_rate": 3.1388571866015645e-06, + "loss": 0.0275, + "mean_token_accuracy": 0.992642305791378, + "num_tokens": 234045880.0, + "step": 1963 + }, + { + "entropy": 0.6395493298768997, + "epoch": 4.4744796122041635, + "grad_norm": 0.474609375, + "learning_rate": 3.1370360914271286e-06, + "loss": 0.0144, + "mean_token_accuracy": 0.9958846718072891, + "num_tokens": 234165027.0, + "step": 1964 + }, + { + "entropy": 0.641558974981308, + "epoch": 4.476760764185914, + "grad_norm": 0.58203125, + "learning_rate": 3.1352146346833057e-06, + "loss": 0.0219, + "mean_token_accuracy": 0.9932300224900246, + "num_tokens": 234284664.0, + "step": 1965 + }, + { + "entropy": 0.6508661583065987, + "epoch": 4.479041916167665, + "grad_norm": 0.5859375, + "learning_rate": 3.133392817403919e-06, + "loss": 0.0179, + "mean_token_accuracy": 0.9940534308552742, + "num_tokens": 234404131.0, + "step": 1966 + }, + { + "entropy": 0.6424180343747139, + "epoch": 4.481323068149416, + "grad_norm": 0.4765625, + "learning_rate": 3.131570640622998e-06, + "loss": 0.0193, + "mean_token_accuracy": 0.9954101517796516, + "num_tokens": 234523392.0, + "step": 1967 + }, + { + "entropy": 0.6448946893215179, + "epoch": 4.483604220131166, + "grad_norm": 0.62109375, + "learning_rate": 3.1297481053747737e-06, + "loss": 0.0252, + "mean_token_accuracy": 0.9926745146512985, + "num_tokens": 234643508.0, + "step": 1968 + }, + { + "entropy": 0.6333435401320457, + "epoch": 4.485885372112917, + "grad_norm": 0.65625, + "learning_rate": 3.127925212693682e-06, + "loss": 0.0202, + "mean_token_accuracy": 0.9934757798910141, + "num_tokens": 234762385.0, + "step": 1969 + }, + { + "entropy": 0.6384085193276405, + "epoch": 4.488166524094668, + "grad_norm": 0.515625, + "learning_rate": 3.1261019636143636e-06, + "loss": 0.0161, + "mean_token_accuracy": 0.9937704205513, + "num_tokens": 234881514.0, + "step": 1970 + }, + { + "entropy": 0.645080491900444, + "epoch": 4.4904476760764185, + "grad_norm": 0.4296875, + "learning_rate": 3.124278359171657e-06, + "loss": 0.012, + "mean_token_accuracy": 0.9968867376446724, + "num_tokens": 235001279.0, + "step": 1971 + }, + { + "entropy": 0.6443276703357697, + "epoch": 4.492728828058169, + "grad_norm": 0.5390625, + "learning_rate": 3.122454400400606e-06, + "loss": 0.0143, + "mean_token_accuracy": 0.9952632561326027, + "num_tokens": 235120242.0, + "step": 1972 + }, + { + "entropy": 0.638325534760952, + "epoch": 4.49500998003992, + "grad_norm": 0.39453125, + "learning_rate": 3.1206300883364547e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.9968599677085876, + "num_tokens": 235240272.0, + "step": 1973 + }, + { + "entropy": 0.6433271691203117, + "epoch": 4.497291132021671, + "grad_norm": 0.53515625, + "learning_rate": 3.1188054240146463e-06, + "loss": 0.0192, + "mean_token_accuracy": 0.9938552156090736, + "num_tokens": 235359963.0, + "step": 1974 + }, + { + "entropy": 0.641676053404808, + "epoch": 4.499572284003421, + "grad_norm": 0.54296875, + "learning_rate": 3.1169804084708267e-06, + "loss": 0.0169, + "mean_token_accuracy": 0.9944954439997673, + "num_tokens": 235479427.0, + "step": 1975 + }, + { + "entropy": 0.6433752179145813, + "epoch": 4.501853435985172, + "grad_norm": 0.5859375, + "learning_rate": 3.1151550427408383e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.9946691244840622, + "num_tokens": 235599032.0, + "step": 1976 + }, + { + "entropy": 0.6459911987185478, + "epoch": 4.504134587966924, + "grad_norm": 0.53125, + "learning_rate": 3.1133293278607228e-06, + "loss": 0.0148, + "mean_token_accuracy": 0.9957546889781952, + "num_tokens": 235718433.0, + "step": 1977 + }, + { + "entropy": 0.6383399218320847, + "epoch": 4.506415739948674, + "grad_norm": 0.427734375, + "learning_rate": 3.1115032648667224e-06, + "loss": 0.0148, + "mean_token_accuracy": 0.9948806315660477, + "num_tokens": 235838171.0, + "step": 1978 + }, + { + "entropy": 0.6414690762758255, + "epoch": 4.508696891930425, + "grad_norm": 0.482421875, + "learning_rate": 3.1096768547952743e-06, + "loss": 0.0145, + "mean_token_accuracy": 0.9952015280723572, + "num_tokens": 235957624.0, + "step": 1979 + }, + { + "entropy": 0.6388564631342888, + "epoch": 4.510978043912176, + "grad_norm": 0.51953125, + "learning_rate": 3.1078500986830134e-06, + "loss": 0.018, + "mean_token_accuracy": 0.9945461973547935, + "num_tokens": 236077148.0, + "step": 1980 + }, + { + "epoch": 4.510978043912176, + "eval_entropy": 0.6399998757775746, + "eval_loss": 0.020875511690974236, + "eval_mean_token_accuracy": 0.9935461006237074, + "eval_num_tokens": 236077148.0, + "eval_runtime": 177.4948, + "eval_samples_per_second": 47.241, + "eval_steps_per_second": 1.482, + "step": 1980 + }, + { + "entropy": 0.6415435671806335, + "epoch": 4.5132591958939265, + "grad_norm": 0.4453125, + "learning_rate": 3.1060229975667716e-06, + "loss": 0.0112, + "mean_token_accuracy": 0.9948644191026688, + "num_tokens": 236196477.0, + "step": 1981 + }, + { + "entropy": 0.6373633742332458, + "epoch": 4.515540347875677, + "grad_norm": 0.640625, + "learning_rate": 3.104195552483576e-06, + "loss": 0.0232, + "mean_token_accuracy": 0.9920546486973763, + "num_tokens": 236315645.0, + "step": 1982 + }, + { + "entropy": 0.6366719007492065, + "epoch": 4.517821499857428, + "grad_norm": 0.474609375, + "learning_rate": 3.102367764470649e-06, + "loss": 0.0153, + "mean_token_accuracy": 0.9956617802381516, + "num_tokens": 236434915.0, + "step": 1983 + }, + { + "entropy": 0.6383824944496155, + "epoch": 4.520102651839179, + "grad_norm": 0.58203125, + "learning_rate": 3.1005396345654087e-06, + "loss": 0.0193, + "mean_token_accuracy": 0.9938947632908821, + "num_tokens": 236555262.0, + "step": 1984 + }, + { + "entropy": 0.6402171105146408, + "epoch": 4.522383803820929, + "grad_norm": 0.62109375, + "learning_rate": 3.0987111638054657e-06, + "loss": 0.0252, + "mean_token_accuracy": 0.9912980049848557, + "num_tokens": 236674375.0, + "step": 1985 + }, + { + "entropy": 0.6384939104318619, + "epoch": 4.52466495580268, + "grad_norm": 0.375, + "learning_rate": 3.0968823532286246e-06, + "loss": 0.0087, + "mean_token_accuracy": 0.9967661872506142, + "num_tokens": 236794165.0, + "step": 1986 + }, + { + "entropy": 0.640986405313015, + "epoch": 4.526946107784431, + "grad_norm": 0.42578125, + "learning_rate": 3.095053203872883e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.9963825196027756, + "num_tokens": 236913798.0, + "step": 1987 + }, + { + "entropy": 0.6401104629039764, + "epoch": 4.529227259766182, + "grad_norm": 0.58203125, + "learning_rate": 3.0932237167764306e-06, + "loss": 0.0206, + "mean_token_accuracy": 0.9931648820638657, + "num_tokens": 237033177.0, + "step": 1988 + }, + { + "entropy": 0.6406176760792732, + "epoch": 4.531508411747932, + "grad_norm": 0.6796875, + "learning_rate": 3.0913938929776493e-06, + "loss": 0.0203, + "mean_token_accuracy": 0.99339210242033, + "num_tokens": 237152235.0, + "step": 1989 + }, + { + "entropy": 0.6465351656079292, + "epoch": 4.533789563729684, + "grad_norm": 0.5625, + "learning_rate": 3.0895637335151117e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.9952415302395821, + "num_tokens": 237271671.0, + "step": 1990 + }, + { + "entropy": 0.6388420611619949, + "epoch": 4.536070715711435, + "grad_norm": 0.5, + "learning_rate": 3.0877332394275806e-06, + "loss": 0.0171, + "mean_token_accuracy": 0.9941336885094643, + "num_tokens": 237390092.0, + "step": 1991 + }, + { + "entropy": 0.6395936384797096, + "epoch": 4.538351867693185, + "grad_norm": 0.56640625, + "learning_rate": 3.08590241175401e-06, + "loss": 0.0193, + "mean_token_accuracy": 0.992883175611496, + "num_tokens": 237509320.0, + "step": 1992 + }, + { + "entropy": 0.6411499455571175, + "epoch": 4.540633019674936, + "grad_norm": 0.44140625, + "learning_rate": 3.0840712515335412e-06, + "loss": 0.0145, + "mean_token_accuracy": 0.9944086372852325, + "num_tokens": 237628215.0, + "step": 1993 + }, + { + "entropy": 0.6446524932980537, + "epoch": 4.542914171656687, + "grad_norm": 0.546875, + "learning_rate": 3.0822397598055065e-06, + "loss": 0.0207, + "mean_token_accuracy": 0.9936841130256653, + "num_tokens": 237748267.0, + "step": 1994 + }, + { + "entropy": 0.642828181385994, + "epoch": 4.5451953236384375, + "grad_norm": 0.640625, + "learning_rate": 3.080407937609424e-06, + "loss": 0.023, + "mean_token_accuracy": 0.9934860840439796, + "num_tokens": 237868085.0, + "step": 1995 + }, + { + "entropy": 0.6357817277312279, + "epoch": 4.547476475620188, + "grad_norm": 0.5078125, + "learning_rate": 3.0785757859850025e-06, + "loss": 0.014, + "mean_token_accuracy": 0.9939573705196381, + "num_tokens": 237987449.0, + "step": 1996 + }, + { + "entropy": 0.6397433504462242, + "epoch": 4.549757627601939, + "grad_norm": 0.5625, + "learning_rate": 3.0767433059721338e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.9946674257516861, + "num_tokens": 238106517.0, + "step": 1997 + }, + { + "entropy": 0.6387136876583099, + "epoch": 4.55203877958369, + "grad_norm": 0.54296875, + "learning_rate": 3.074910498610899e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9945537894964218, + "num_tokens": 238226306.0, + "step": 1998 + }, + { + "entropy": 0.6365977451205254, + "epoch": 4.55431993156544, + "grad_norm": 0.546875, + "learning_rate": 3.0730773649415647e-06, + "loss": 0.0213, + "mean_token_accuracy": 0.9939752146601677, + "num_tokens": 238345161.0, + "step": 1999 + }, + { + "entropy": 0.6376422718167305, + "epoch": 4.556601083547191, + "grad_norm": 0.412109375, + "learning_rate": 3.0712439060045818e-06, + "loss": 0.0127, + "mean_token_accuracy": 0.9964746385812759, + "num_tokens": 238464203.0, + "step": 2000 + }, + { + "entropy": 0.6402837410569191, + "epoch": 4.558882235528942, + "grad_norm": 0.482421875, + "learning_rate": 3.069410122840585e-06, + "loss": 0.0158, + "mean_token_accuracy": 0.9940552115440369, + "num_tokens": 238583702.0, + "step": 2001 + }, + { + "entropy": 0.6466026231646538, + "epoch": 4.5611633875106925, + "grad_norm": 0.65625, + "learning_rate": 3.0675760164903972e-06, + "loss": 0.022, + "mean_token_accuracy": 0.9930732548236847, + "num_tokens": 238702843.0, + "step": 2002 + }, + { + "entropy": 0.637811616063118, + "epoch": 4.563444539492444, + "grad_norm": 0.578125, + "learning_rate": 3.065741587995019e-06, + "loss": 0.0119, + "mean_token_accuracy": 0.9955395460128784, + "num_tokens": 238822149.0, + "step": 2003 + }, + { + "entropy": 0.6363209709525108, + "epoch": 4.565725691474195, + "grad_norm": 0.6484375, + "learning_rate": 3.0639068383956373e-06, + "loss": 0.019, + "mean_token_accuracy": 0.9933129101991653, + "num_tokens": 238941092.0, + "step": 2004 + }, + { + "entropy": 0.6423340439796448, + "epoch": 4.5680068434559455, + "grad_norm": 0.49609375, + "learning_rate": 3.062071768733621e-06, + "loss": 0.0222, + "mean_token_accuracy": 0.9940783381462097, + "num_tokens": 239060572.0, + "step": 2005 + }, + { + "entropy": 0.6368811950087547, + "epoch": 4.570287995437696, + "grad_norm": 0.4375, + "learning_rate": 3.0602363800505198e-06, + "loss": 0.0129, + "mean_token_accuracy": 0.9961346313357353, + "num_tokens": 239180461.0, + "step": 2006 + }, + { + "entropy": 0.6400297656655312, + "epoch": 4.572569147419447, + "grad_norm": 0.59765625, + "learning_rate": 3.0584006733880656e-06, + "loss": 0.0158, + "mean_token_accuracy": 0.9941554293036461, + "num_tokens": 239299532.0, + "step": 2007 + }, + { + "entropy": 0.6353910192847252, + "epoch": 4.574850299401198, + "grad_norm": 0.5, + "learning_rate": 3.0565646497881697e-06, + "loss": 0.0131, + "mean_token_accuracy": 0.996129535138607, + "num_tokens": 239418700.0, + "step": 2008 + }, + { + "entropy": 0.638245515525341, + "epoch": 4.577131451382948, + "grad_norm": 0.44140625, + "learning_rate": 3.0547283102929228e-06, + "loss": 0.0109, + "mean_token_accuracy": 0.9965691566467285, + "num_tokens": 239537896.0, + "step": 2009 + }, + { + "entropy": 0.6429686099290848, + "epoch": 4.579412603364699, + "grad_norm": 0.45703125, + "learning_rate": 3.0528916559445967e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9952444136142731, + "num_tokens": 239657471.0, + "step": 2010 + }, + { + "entropy": 0.6414675191044807, + "epoch": 4.58169375534645, + "grad_norm": 0.50390625, + "learning_rate": 3.05105468778564e-06, + "loss": 0.014, + "mean_token_accuracy": 0.995491273701191, + "num_tokens": 239777505.0, + "step": 2011 + }, + { + "entropy": 0.6451347395777702, + "epoch": 4.583974907328201, + "grad_norm": 0.609375, + "learning_rate": 3.049217406858681e-06, + "loss": 0.0199, + "mean_token_accuracy": 0.9943966865539551, + "num_tokens": 239896535.0, + "step": 2012 + }, + { + "entropy": 0.641023725271225, + "epoch": 4.586256059309951, + "grad_norm": 0.59765625, + "learning_rate": 3.047379814206526e-06, + "loss": 0.0195, + "mean_token_accuracy": 0.9936529025435448, + "num_tokens": 240015724.0, + "step": 2013 + }, + { + "entropy": 0.6397752985358238, + "epoch": 4.588537211291702, + "grad_norm": 0.50390625, + "learning_rate": 3.0455419108721556e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.9943569004535675, + "num_tokens": 240134790.0, + "step": 2014 + }, + { + "entropy": 0.6436570659279823, + "epoch": 4.590818363273453, + "grad_norm": 0.482421875, + "learning_rate": 3.043703697898728e-06, + "loss": 0.0143, + "mean_token_accuracy": 0.996186651289463, + "num_tokens": 240254477.0, + "step": 2015 + }, + { + "entropy": 0.6397695243358612, + "epoch": 4.593099515255204, + "grad_norm": 0.45703125, + "learning_rate": 3.041865176329579e-06, + "loss": 0.0155, + "mean_token_accuracy": 0.9947513043880463, + "num_tokens": 240374170.0, + "step": 2016 + }, + { + "entropy": 0.6421918645501137, + "epoch": 4.595380667236955, + "grad_norm": 0.53515625, + "learning_rate": 3.040026347208217e-06, + "loss": 0.0203, + "mean_token_accuracy": 0.9943166077136993, + "num_tokens": 240494060.0, + "step": 2017 + }, + { + "entropy": 0.6429021432995796, + "epoch": 4.597661819218706, + "grad_norm": 0.453125, + "learning_rate": 3.0381872115783256e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.9937751740217209, + "num_tokens": 240613913.0, + "step": 2018 + }, + { + "entropy": 0.6359845250844955, + "epoch": 4.5999429712004565, + "grad_norm": 0.765625, + "learning_rate": 3.0363477704837633e-06, + "loss": 0.0157, + "mean_token_accuracy": 0.9949750304222107, + "num_tokens": 240732949.0, + "step": 2019 + }, + { + "entropy": 0.6441051363945007, + "epoch": 4.602224123182207, + "grad_norm": 0.671875, + "learning_rate": 3.034508024968561e-06, + "loss": 0.0268, + "mean_token_accuracy": 0.9913163483142853, + "num_tokens": 240852716.0, + "step": 2020 + }, + { + "entropy": 0.6449790522456169, + "epoch": 4.604505275163958, + "grad_norm": 0.515625, + "learning_rate": 3.032667976076923e-06, + "loss": 0.0132, + "mean_token_accuracy": 0.9945819079875946, + "num_tokens": 240972162.0, + "step": 2021 + }, + { + "entropy": 0.6356042250990868, + "epoch": 4.606786427145709, + "grad_norm": 0.55078125, + "learning_rate": 3.0308276248532244e-06, + "loss": 0.0194, + "mean_token_accuracy": 0.9944297149777412, + "num_tokens": 241091315.0, + "step": 2022 + }, + { + "entropy": 0.6442453414201736, + "epoch": 4.609067579127459, + "grad_norm": 0.40625, + "learning_rate": 3.0289869723420144e-06, + "loss": 0.0118, + "mean_token_accuracy": 0.9959685429930687, + "num_tokens": 241210794.0, + "step": 2023 + }, + { + "entropy": 0.6421187445521355, + "epoch": 4.61134873110921, + "grad_norm": 0.4609375, + "learning_rate": 3.027146019588012e-06, + "loss": 0.0149, + "mean_token_accuracy": 0.9944043084979057, + "num_tokens": 241330503.0, + "step": 2024 + }, + { + "entropy": 0.6402030810713768, + "epoch": 4.613629883090961, + "grad_norm": 0.458984375, + "learning_rate": 3.025304767636105e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.9933184385299683, + "num_tokens": 241450624.0, + "step": 2025 + }, + { + "entropy": 0.639320932328701, + "epoch": 4.6159110350727115, + "grad_norm": 0.4609375, + "learning_rate": 3.0234632175313537e-06, + "loss": 0.0149, + "mean_token_accuracy": 0.9953695684671402, + "num_tokens": 241569718.0, + "step": 2026 + }, + { + "entropy": 0.6430572494864464, + "epoch": 4.618192187054462, + "grad_norm": 0.478515625, + "learning_rate": 3.0216213703189856e-06, + "loss": 0.0135, + "mean_token_accuracy": 0.9944694116711617, + "num_tokens": 241688970.0, + "step": 2027 + }, + { + "entropy": 0.6403233855962753, + "epoch": 4.620473339036213, + "grad_norm": 0.421875, + "learning_rate": 3.019779227044398e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.9959559664130211, + "num_tokens": 241809046.0, + "step": 2028 + }, + { + "entropy": 0.6424550116062164, + "epoch": 4.6227544910179645, + "grad_norm": 0.400390625, + "learning_rate": 3.0179367887531567e-06, + "loss": 0.0169, + "mean_token_accuracy": 0.996355302631855, + "num_tokens": 241927935.0, + "step": 2029 + }, + { + "entropy": 0.6434010714292526, + "epoch": 4.625035642999714, + "grad_norm": 0.455078125, + "learning_rate": 3.016094056490993e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.9958974421024323, + "num_tokens": 242047252.0, + "step": 2030 + }, + { + "entropy": 0.6459401324391365, + "epoch": 4.627316794981466, + "grad_norm": 0.41796875, + "learning_rate": 3.0142510313038057e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.9950544685125351, + "num_tokens": 242167254.0, + "step": 2031 + }, + { + "entropy": 0.6382547169923782, + "epoch": 4.629597946963217, + "grad_norm": 0.57421875, + "learning_rate": 3.012407714237662e-06, + "loss": 0.02, + "mean_token_accuracy": 0.9964631199836731, + "num_tokens": 242286697.0, + "step": 2032 + }, + { + "entropy": 0.6393710970878601, + "epoch": 4.631879098944967, + "grad_norm": 0.462890625, + "learning_rate": 3.010564106338791e-06, + "loss": 0.0188, + "mean_token_accuracy": 0.9930997788906097, + "num_tokens": 242406717.0, + "step": 2033 + }, + { + "entropy": 0.640256978571415, + "epoch": 4.634160250926718, + "grad_norm": 0.3984375, + "learning_rate": 3.0087202086535915e-06, + "loss": 0.0153, + "mean_token_accuracy": 0.9951852485537529, + "num_tokens": 242527102.0, + "step": 2034 + }, + { + "entropy": 0.6422373354434967, + "epoch": 4.636441402908469, + "grad_norm": 0.5, + "learning_rate": 3.006876022228622e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.9946187436580658, + "num_tokens": 242646581.0, + "step": 2035 + }, + { + "entropy": 0.6400672346353531, + "epoch": 4.63872255489022, + "grad_norm": 0.59765625, + "learning_rate": 3.0050315481106074e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9941101595759392, + "num_tokens": 242765469.0, + "step": 2036 + }, + { + "entropy": 0.634016789495945, + "epoch": 4.64100370687197, + "grad_norm": 0.53125, + "learning_rate": 3.0031867873464372e-06, + "loss": 0.0171, + "mean_token_accuracy": 0.9941720217466354, + "num_tokens": 242884428.0, + "step": 2037 + }, + { + "entropy": 0.637068010866642, + "epoch": 4.643284858853721, + "grad_norm": 0.453125, + "learning_rate": 3.00134174098316e-06, + "loss": 0.0162, + "mean_token_accuracy": 0.9955308958888054, + "num_tokens": 243002335.0, + "step": 2038 + }, + { + "entropy": 0.6373956501483917, + "epoch": 4.645566010835472, + "grad_norm": 0.6796875, + "learning_rate": 2.999496410067989e-06, + "loss": 0.0171, + "mean_token_accuracy": 0.995015561580658, + "num_tokens": 243122843.0, + "step": 2039 + }, + { + "entropy": 0.6447825208306313, + "epoch": 4.647847162817222, + "grad_norm": 0.6328125, + "learning_rate": 2.9976507956482996e-06, + "loss": 0.0222, + "mean_token_accuracy": 0.9928542450070381, + "num_tokens": 243242338.0, + "step": 2040 + }, + { + "entropy": 0.6423397958278656, + "epoch": 4.650128314798973, + "grad_norm": 0.7109375, + "learning_rate": 2.9958048987716266e-06, + "loss": 0.0211, + "mean_token_accuracy": 0.9924467280507088, + "num_tokens": 243361265.0, + "step": 2041 + }, + { + "entropy": 0.646359272301197, + "epoch": 4.652409466780725, + "grad_norm": 0.5625, + "learning_rate": 2.993958720485664e-06, + "loss": 0.0231, + "mean_token_accuracy": 0.9904635548591614, + "num_tokens": 243480543.0, + "step": 2042 + }, + { + "entropy": 0.6423735022544861, + "epoch": 4.654690618762475, + "grad_norm": 0.470703125, + "learning_rate": 2.9921122618382687e-06, + "loss": 0.0112, + "mean_token_accuracy": 0.9961130991578102, + "num_tokens": 243600375.0, + "step": 2043 + }, + { + "entropy": 0.6456871554255486, + "epoch": 4.656971770744226, + "grad_norm": 0.5078125, + "learning_rate": 2.9902655238774537e-06, + "loss": 0.0192, + "mean_token_accuracy": 0.9935755804181099, + "num_tokens": 243719508.0, + "step": 2044 + }, + { + "entropy": 0.6430426388978958, + "epoch": 4.659252922725977, + "grad_norm": 0.51171875, + "learning_rate": 2.988418507651392e-06, + "loss": 0.0239, + "mean_token_accuracy": 0.9932248219847679, + "num_tokens": 243839081.0, + "step": 2045 + }, + { + "entropy": 0.6445721685886383, + "epoch": 4.661534074707728, + "grad_norm": 0.56640625, + "learning_rate": 2.9865712142084145e-06, + "loss": 0.0192, + "mean_token_accuracy": 0.9939185455441475, + "num_tokens": 243958877.0, + "step": 2046 + }, + { + "entropy": 0.6429309397935867, + "epoch": 4.663815226689478, + "grad_norm": 0.48828125, + "learning_rate": 2.98472364459701e-06, + "loss": 0.0132, + "mean_token_accuracy": 0.9949887841939926, + "num_tokens": 244078602.0, + "step": 2047 + }, + { + "entropy": 0.6383395045995712, + "epoch": 4.666096378671229, + "grad_norm": 0.48046875, + "learning_rate": 2.982875799865823e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.9952163398265839, + "num_tokens": 244197998.0, + "step": 2048 + }, + { + "entropy": 0.644562155008316, + "epoch": 4.66837753065298, + "grad_norm": 0.400390625, + "learning_rate": 2.9810276810636535e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.9941458404064178, + "num_tokens": 244317051.0, + "step": 2049 + }, + { + "entropy": 0.6407277137041092, + "epoch": 4.6706586826347305, + "grad_norm": 0.5546875, + "learning_rate": 2.97917928923946e-06, + "loss": 0.0216, + "mean_token_accuracy": 0.9940282180905342, + "num_tokens": 244436769.0, + "step": 2050 + }, + { + "entropy": 0.6409889310598373, + "epoch": 4.672939834616481, + "grad_norm": 0.53125, + "learning_rate": 2.977330625442352e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.993845209479332, + "num_tokens": 244556547.0, + "step": 2051 + }, + { + "entropy": 0.6441834270954132, + "epoch": 4.675220986598232, + "grad_norm": 0.50390625, + "learning_rate": 2.9754816907215963e-06, + "loss": 0.0139, + "mean_token_accuracy": 0.9949487969279289, + "num_tokens": 244675651.0, + "step": 2052 + }, + { + "entropy": 0.64098110049963, + "epoch": 4.677502138579983, + "grad_norm": 0.578125, + "learning_rate": 2.9736324861266125e-06, + "loss": 0.0211, + "mean_token_accuracy": 0.9938168227672577, + "num_tokens": 244795066.0, + "step": 2053 + }, + { + "entropy": 0.6421234682202339, + "epoch": 4.679783290561733, + "grad_norm": 0.4765625, + "learning_rate": 2.9717830127069734e-06, + "loss": 0.0163, + "mean_token_accuracy": 0.9948196113109589, + "num_tokens": 244914104.0, + "step": 2054 + }, + { + "entropy": 0.6414186283946037, + "epoch": 4.682064442543484, + "grad_norm": 0.47265625, + "learning_rate": 2.969933271512404e-06, + "loss": 0.0147, + "mean_token_accuracy": 0.9944244548678398, + "num_tokens": 245033388.0, + "step": 2055 + }, + { + "entropy": 0.6415694504976273, + "epoch": 4.684345594525235, + "grad_norm": 0.5703125, + "learning_rate": 2.9680832635927824e-06, + "loss": 0.0207, + "mean_token_accuracy": 0.9940778464078903, + "num_tokens": 245152306.0, + "step": 2056 + }, + { + "entropy": 0.6401252970099449, + "epoch": 4.686626746506986, + "grad_norm": 0.625, + "learning_rate": 2.9662329899981375e-06, + "loss": 0.0198, + "mean_token_accuracy": 0.9938645958900452, + "num_tokens": 245271883.0, + "step": 2057 + }, + { + "entropy": 0.6370800882577896, + "epoch": 4.688907898488737, + "grad_norm": 0.44140625, + "learning_rate": 2.964382451778648e-06, + "loss": 0.01, + "mean_token_accuracy": 0.996836818754673, + "num_tokens": 245391626.0, + "step": 2058 + }, + { + "entropy": 0.6377317681908607, + "epoch": 4.691189050470488, + "grad_norm": 0.474609375, + "learning_rate": 2.9625316499846444e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.9940750002861023, + "num_tokens": 245510635.0, + "step": 2059 + }, + { + "entropy": 0.647210143506527, + "epoch": 4.693470202452239, + "grad_norm": 0.68359375, + "learning_rate": 2.9606805856666053e-06, + "loss": 0.0235, + "mean_token_accuracy": 0.9934429153800011, + "num_tokens": 245630336.0, + "step": 2060 + }, + { + "entropy": 0.6464722901582718, + "epoch": 4.695751354433989, + "grad_norm": 0.54296875, + "learning_rate": 2.95882925987516e-06, + "loss": 0.0178, + "mean_token_accuracy": 0.9949778765439987, + "num_tokens": 245749744.0, + "step": 2061 + }, + { + "entropy": 0.6459446996450424, + "epoch": 4.69803250641574, + "grad_norm": 0.447265625, + "learning_rate": 2.9569776736610855e-06, + "loss": 0.0177, + "mean_token_accuracy": 0.9952825605869293, + "num_tokens": 245869076.0, + "step": 2062 + }, + { + "entropy": 0.6392257586121559, + "epoch": 4.700313658397491, + "grad_norm": 0.56640625, + "learning_rate": 2.9551258280753046e-06, + "loss": 0.0233, + "mean_token_accuracy": 0.991569958627224, + "num_tokens": 245988603.0, + "step": 2063 + }, + { + "entropy": 0.6393361538648605, + "epoch": 4.702594810379241, + "grad_norm": 0.5546875, + "learning_rate": 2.953273724168891e-06, + "loss": 0.0142, + "mean_token_accuracy": 0.9949479028582573, + "num_tokens": 246108003.0, + "step": 2064 + }, + { + "entropy": 0.6409130245447159, + "epoch": 4.704875962360992, + "grad_norm": 0.68359375, + "learning_rate": 2.9514213629930614e-06, + "loss": 0.0286, + "mean_token_accuracy": 0.9902507364749908, + "num_tokens": 246226747.0, + "step": 2065 + }, + { + "entropy": 0.6400060728192329, + "epoch": 4.707157114342743, + "grad_norm": 0.44140625, + "learning_rate": 2.949568745599182e-06, + "loss": 0.0103, + "mean_token_accuracy": 0.9964773207902908, + "num_tokens": 246345803.0, + "step": 2066 + }, + { + "entropy": 0.640043742954731, + "epoch": 4.709438266324494, + "grad_norm": 0.478515625, + "learning_rate": 2.9477158730387615e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9948458820581436, + "num_tokens": 246464797.0, + "step": 2067 + }, + { + "entropy": 0.6415300816297531, + "epoch": 4.711719418306244, + "grad_norm": 0.458984375, + "learning_rate": 2.945862746363455e-06, + "loss": 0.0171, + "mean_token_accuracy": 0.9952059760689735, + "num_tokens": 246584560.0, + "step": 2068 + }, + { + "entropy": 0.6452323794364929, + "epoch": 4.714000570287995, + "grad_norm": 0.625, + "learning_rate": 2.944009366625061e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.9929584339261055, + "num_tokens": 246703637.0, + "step": 2069 + }, + { + "entropy": 0.6398139074444771, + "epoch": 4.716281722269747, + "grad_norm": 0.56640625, + "learning_rate": 2.942155734875523e-06, + "loss": 0.0219, + "mean_token_accuracy": 0.993562214076519, + "num_tokens": 246822983.0, + "step": 2070 + }, + { + "entropy": 0.6418680846691132, + "epoch": 4.718562874251497, + "grad_norm": 0.498046875, + "learning_rate": 2.9403018521669256e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.9941180124878883, + "num_tokens": 246942364.0, + "step": 2071 + }, + { + "entropy": 0.6397334262728691, + "epoch": 4.720844026233248, + "grad_norm": 0.404296875, + "learning_rate": 2.938447719551498e-06, + "loss": 0.0104, + "mean_token_accuracy": 0.9967002496123314, + "num_tokens": 247061399.0, + "step": 2072 + }, + { + "entropy": 0.6470726653933525, + "epoch": 4.723125178214999, + "grad_norm": 0.47265625, + "learning_rate": 2.9365933380816092e-06, + "loss": 0.0216, + "mean_token_accuracy": 0.9945366159081459, + "num_tokens": 247180839.0, + "step": 2073 + }, + { + "entropy": 0.6434314250946045, + "epoch": 4.7254063301967495, + "grad_norm": 0.58203125, + "learning_rate": 2.93473870880977e-06, + "loss": 0.0182, + "mean_token_accuracy": 0.9939085319638252, + "num_tokens": 247300343.0, + "step": 2074 + }, + { + "entropy": 0.6451361775398254, + "epoch": 4.7276874821785, + "grad_norm": 0.671875, + "learning_rate": 2.932883832788633e-06, + "loss": 0.0244, + "mean_token_accuracy": 0.9935993626713753, + "num_tokens": 247419867.0, + "step": 2075 + }, + { + "entropy": 0.6434604972600937, + "epoch": 4.729968634160251, + "grad_norm": 0.5546875, + "learning_rate": 2.9310287110709895e-06, + "loss": 0.0223, + "mean_token_accuracy": 0.9936047270894051, + "num_tokens": 247539386.0, + "step": 2076 + }, + { + "entropy": 0.6450932621955872, + "epoch": 4.732249786142002, + "grad_norm": 0.609375, + "learning_rate": 2.9291733447097714e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.9950853660702705, + "num_tokens": 247658500.0, + "step": 2077 + }, + { + "entropy": 0.6433617323637009, + "epoch": 4.734530938123752, + "grad_norm": 0.58984375, + "learning_rate": 2.927317734758047e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.9941390454769135, + "num_tokens": 247777841.0, + "step": 2078 + }, + { + "entropy": 0.6433444768190384, + "epoch": 4.736812090105503, + "grad_norm": 0.52734375, + "learning_rate": 2.925461882269027e-06, + "loss": 0.0193, + "mean_token_accuracy": 0.9931460544466972, + "num_tokens": 247897619.0, + "step": 2079 + }, + { + "entropy": 0.6423624530434608, + "epoch": 4.739093242087254, + "grad_norm": 0.5546875, + "learning_rate": 2.9236057882960567e-06, + "loss": 0.0234, + "mean_token_accuracy": 0.9922545850276947, + "num_tokens": 248017432.0, + "step": 2080 + }, + { + "entropy": 0.645497553050518, + "epoch": 4.7413743940690045, + "grad_norm": 0.447265625, + "learning_rate": 2.921749453892618e-06, + "loss": 0.0169, + "mean_token_accuracy": 0.9947901144623756, + "num_tokens": 248137006.0, + "step": 2081 + }, + { + "entropy": 0.6403241902589798, + "epoch": 4.743655546050755, + "grad_norm": 0.59375, + "learning_rate": 2.919892880112332e-06, + "loss": 0.0166, + "mean_token_accuracy": 0.9931026548147202, + "num_tokens": 248256237.0, + "step": 2082 + }, + { + "entropy": 0.6389338672161102, + "epoch": 4.745936698032507, + "grad_norm": 0.5234375, + "learning_rate": 2.9180360680089542e-06, + "loss": 0.0158, + "mean_token_accuracy": 0.9947144016623497, + "num_tokens": 248375203.0, + "step": 2083 + }, + { + "entropy": 0.6431425586342812, + "epoch": 4.748217850014258, + "grad_norm": 0.54296875, + "learning_rate": 2.9161790186363746e-06, + "loss": 0.0165, + "mean_token_accuracy": 0.9954144805669785, + "num_tokens": 248494434.0, + "step": 2084 + }, + { + "entropy": 0.6411675661802292, + "epoch": 4.750499001996008, + "grad_norm": 0.55859375, + "learning_rate": 2.9143217330486186e-06, + "loss": 0.0213, + "mean_token_accuracy": 0.9940498992800713, + "num_tokens": 248613492.0, + "step": 2085 + }, + { + "entropy": 0.6382928490638733, + "epoch": 4.752780153977759, + "grad_norm": 0.4453125, + "learning_rate": 2.9124642122998453e-06, + "loss": 0.0134, + "mean_token_accuracy": 0.9959108829498291, + "num_tokens": 248733476.0, + "step": 2086 + }, + { + "entropy": 0.6489552557468414, + "epoch": 4.75506130595951, + "grad_norm": 0.53125, + "learning_rate": 2.9106064574443477e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.995123103260994, + "num_tokens": 248852583.0, + "step": 2087 + }, + { + "entropy": 0.6457589119672775, + "epoch": 4.75734245794126, + "grad_norm": 0.5078125, + "learning_rate": 2.9087484695365523e-06, + "loss": 0.0204, + "mean_token_accuracy": 0.9935207441449165, + "num_tokens": 248971775.0, + "step": 2088 + }, + { + "entropy": 0.6411176100373268, + "epoch": 4.759623609923011, + "grad_norm": 0.56640625, + "learning_rate": 2.906890249631017e-06, + "loss": 0.0179, + "mean_token_accuracy": 0.9944418147206306, + "num_tokens": 249090833.0, + "step": 2089 + }, + { + "entropy": 0.6467028111219406, + "epoch": 4.761904761904762, + "grad_norm": 0.484375, + "learning_rate": 2.905031798782431e-06, + "loss": 0.0133, + "mean_token_accuracy": 0.9965471550822258, + "num_tokens": 249210310.0, + "step": 2090 + }, + { + "entropy": 0.6436480730772018, + "epoch": 4.764185913886513, + "grad_norm": 0.453125, + "learning_rate": 2.903173118045616e-06, + "loss": 0.0184, + "mean_token_accuracy": 0.9961269572377205, + "num_tokens": 249329675.0, + "step": 2091 + }, + { + "entropy": 0.6447379291057587, + "epoch": 4.766467065868263, + "grad_norm": 0.498046875, + "learning_rate": 2.901314208475522e-06, + "loss": 0.0162, + "mean_token_accuracy": 0.995042622089386, + "num_tokens": 249449011.0, + "step": 2092 + }, + { + "entropy": 0.6413252726197243, + "epoch": 4.768748217850014, + "grad_norm": 0.482421875, + "learning_rate": 2.8994550711272317e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.9941132366657257, + "num_tokens": 249568260.0, + "step": 2093 + }, + { + "entropy": 0.638678528368473, + "epoch": 4.771029369831765, + "grad_norm": 0.458984375, + "learning_rate": 2.897595707055954e-06, + "loss": 0.0143, + "mean_token_accuracy": 0.9957961067557335, + "num_tokens": 249687507.0, + "step": 2094 + }, + { + "entropy": 0.6414436176419258, + "epoch": 4.7733105218135155, + "grad_norm": 0.474609375, + "learning_rate": 2.8957361173170297e-06, + "loss": 0.0107, + "mean_token_accuracy": 0.9958446472883224, + "num_tokens": 249806448.0, + "step": 2095 + }, + { + "entropy": 0.6425389125943184, + "epoch": 4.775591673795267, + "grad_norm": 0.55859375, + "learning_rate": 2.893876302965925e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.9925805479288101, + "num_tokens": 249925811.0, + "step": 2096 + }, + { + "entropy": 0.6418862491846085, + "epoch": 4.777872825777018, + "grad_norm": 0.58984375, + "learning_rate": 2.8920162650582344e-06, + "loss": 0.0182, + "mean_token_accuracy": 0.9941305220127106, + "num_tokens": 250046644.0, + "step": 2097 + }, + { + "entropy": 0.6426485180854797, + "epoch": 4.7801539777587685, + "grad_norm": 0.6015625, + "learning_rate": 2.8901560046496797e-06, + "loss": 0.021, + "mean_token_accuracy": 0.9936600103974342, + "num_tokens": 250165815.0, + "step": 2098 + }, + { + "entropy": 0.6417804807424545, + "epoch": 4.782435129740519, + "grad_norm": 0.546875, + "learning_rate": 2.8882955227961098e-06, + "loss": 0.0219, + "mean_token_accuracy": 0.9929041489958763, + "num_tokens": 250285027.0, + "step": 2099 + }, + { + "entropy": 0.6484295427799225, + "epoch": 4.78471628172227, + "grad_norm": 0.66796875, + "learning_rate": 2.886434820553497e-06, + "loss": 0.0199, + "mean_token_accuracy": 0.9925272762775421, + "num_tokens": 250404676.0, + "step": 2100 + }, + { + "entropy": 0.6403294950723648, + "epoch": 4.786997433704021, + "grad_norm": 0.51171875, + "learning_rate": 2.884573898977941e-06, + "loss": 0.0173, + "mean_token_accuracy": 0.9935643598437309, + "num_tokens": 250523640.0, + "step": 2101 + }, + { + "entropy": 0.6457153111696243, + "epoch": 4.789278585685771, + "grad_norm": 0.50390625, + "learning_rate": 2.882712759125664e-06, + "loss": 0.0172, + "mean_token_accuracy": 0.9954805970191956, + "num_tokens": 250643400.0, + "step": 2102 + }, + { + "entropy": 0.6418323665857315, + "epoch": 4.791559737667522, + "grad_norm": 0.484375, + "learning_rate": 2.8808514020530127e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9952157139778137, + "num_tokens": 250762566.0, + "step": 2103 + }, + { + "entropy": 0.6434327438473701, + "epoch": 4.793840889649273, + "grad_norm": 0.462890625, + "learning_rate": 2.8789898288164595e-06, + "loss": 0.0137, + "mean_token_accuracy": 0.995516374707222, + "num_tokens": 250881658.0, + "step": 2104 + }, + { + "entropy": 0.6419427394866943, + "epoch": 4.7961220416310235, + "grad_norm": 0.65234375, + "learning_rate": 2.8771280404725953e-06, + "loss": 0.0216, + "mean_token_accuracy": 0.9933141320943832, + "num_tokens": 251000313.0, + "step": 2105 + }, + { + "entropy": 0.6384585797786713, + "epoch": 4.798403193612774, + "grad_norm": 0.54296875, + "learning_rate": 2.8752660380781367e-06, + "loss": 0.0198, + "mean_token_accuracy": 0.9939695447683334, + "num_tokens": 251119814.0, + "step": 2106 + }, + { + "entropy": 0.6400193199515343, + "epoch": 4.800684345594525, + "grad_norm": 0.427734375, + "learning_rate": 2.8734038226899198e-06, + "loss": 0.014, + "mean_token_accuracy": 0.9949992895126343, + "num_tokens": 251238508.0, + "step": 2107 + }, + { + "entropy": 0.6414906457066536, + "epoch": 4.802965497576276, + "grad_norm": 0.515625, + "learning_rate": 2.8715413953649012e-06, + "loss": 0.0205, + "mean_token_accuracy": 0.9930652305483818, + "num_tokens": 251358226.0, + "step": 2108 + }, + { + "entropy": 0.6438077390193939, + "epoch": 4.805246649558027, + "grad_norm": 0.54296875, + "learning_rate": 2.8696787571601597e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9949614778161049, + "num_tokens": 251477472.0, + "step": 2109 + }, + { + "entropy": 0.6446219086647034, + "epoch": 4.807527801539777, + "grad_norm": 0.4453125, + "learning_rate": 2.8678159091328926e-06, + "loss": 0.0184, + "mean_token_accuracy": 0.9939121082425117, + "num_tokens": 251596983.0, + "step": 2110 + }, + { + "entropy": 0.6373369097709656, + "epoch": 4.809808953521529, + "grad_norm": 0.52734375, + "learning_rate": 2.865952852340417e-06, + "loss": 0.0214, + "mean_token_accuracy": 0.9944693893194199, + "num_tokens": 251716300.0, + "step": 2111 + }, + { + "entropy": 0.6415681019425392, + "epoch": 4.812090105503279, + "grad_norm": 0.48046875, + "learning_rate": 2.864089587840167e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.9950059279799461, + "num_tokens": 251834871.0, + "step": 2112 + }, + { + "entropy": 0.6417759358882904, + "epoch": 4.81437125748503, + "grad_norm": 0.6015625, + "learning_rate": 2.862226116689696e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9936555027961731, + "num_tokens": 251953973.0, + "step": 2113 + }, + { + "entropy": 0.6395019665360451, + "epoch": 4.816652409466781, + "grad_norm": 0.5, + "learning_rate": 2.8603624399466732e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.9945844113826752, + "num_tokens": 252074299.0, + "step": 2114 + }, + { + "entropy": 0.6470831483602524, + "epoch": 4.818933561448532, + "grad_norm": 0.73828125, + "learning_rate": 2.858498558668888e-06, + "loss": 0.0222, + "mean_token_accuracy": 0.9925031736493111, + "num_tokens": 252193785.0, + "step": 2115 + }, + { + "entropy": 0.6438267305493355, + "epoch": 4.821214713430282, + "grad_norm": 0.53515625, + "learning_rate": 2.856634473914242e-06, + "loss": 0.0204, + "mean_token_accuracy": 0.9943032339215279, + "num_tokens": 252313833.0, + "step": 2116 + }, + { + "entropy": 0.6395959407091141, + "epoch": 4.823495865412033, + "grad_norm": 0.38671875, + "learning_rate": 2.854770186740753e-06, + "loss": 0.014, + "mean_token_accuracy": 0.9950704872608185, + "num_tokens": 252434016.0, + "step": 2117 + }, + { + "entropy": 0.6398417875170708, + "epoch": 4.825777017393784, + "grad_norm": 0.474609375, + "learning_rate": 2.8529056982065557e-06, + "loss": 0.0188, + "mean_token_accuracy": 0.9933321699500084, + "num_tokens": 252553084.0, + "step": 2118 + }, + { + "entropy": 0.6368174478411674, + "epoch": 4.8280581693755344, + "grad_norm": 0.482421875, + "learning_rate": 2.8510410093698966e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9946967735886574, + "num_tokens": 252671893.0, + "step": 2119 + }, + { + "entropy": 0.6448867619037628, + "epoch": 4.830339321357285, + "grad_norm": 0.482421875, + "learning_rate": 2.849176121289138e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.9945550709962845, + "num_tokens": 252792198.0, + "step": 2120 + }, + { + "entropy": 0.6425801515579224, + "epoch": 4.832620473339036, + "grad_norm": 0.494140625, + "learning_rate": 2.8473110350227536e-06, + "loss": 0.018, + "mean_token_accuracy": 0.9934907928109169, + "num_tokens": 252911933.0, + "step": 2121 + }, + { + "entropy": 0.6439590305089951, + "epoch": 4.8349016253207875, + "grad_norm": 0.5703125, + "learning_rate": 2.845445751629331e-06, + "loss": 0.0184, + "mean_token_accuracy": 0.9941969960927963, + "num_tokens": 253032360.0, + "step": 2122 + }, + { + "entropy": 0.6429594829678535, + "epoch": 4.837182777302537, + "grad_norm": 0.50390625, + "learning_rate": 2.843580272167569e-06, + "loss": 0.0148, + "mean_token_accuracy": 0.9938461855053902, + "num_tokens": 253151997.0, + "step": 2123 + }, + { + "entropy": 0.6423943042755127, + "epoch": 4.839463929284289, + "grad_norm": 0.578125, + "learning_rate": 2.8417145976962773e-06, + "loss": 0.0141, + "mean_token_accuracy": 0.9949891343712807, + "num_tokens": 253270795.0, + "step": 2124 + }, + { + "entropy": 0.6409913152456284, + "epoch": 4.84174508126604, + "grad_norm": 0.7109375, + "learning_rate": 2.8398487292743772e-06, + "loss": 0.024, + "mean_token_accuracy": 0.9915321692824364, + "num_tokens": 253389785.0, + "step": 2125 + }, + { + "entropy": 0.6378040984272957, + "epoch": 4.84402623324779, + "grad_norm": 0.447265625, + "learning_rate": 2.8379826679609e-06, + "loss": 0.0148, + "mean_token_accuracy": 0.9957094863057137, + "num_tokens": 253508534.0, + "step": 2126 + }, + { + "entropy": 0.633640855550766, + "epoch": 4.846307385229541, + "grad_norm": 0.5859375, + "learning_rate": 2.836116414814985e-06, + "loss": 0.0181, + "mean_token_accuracy": 0.9944384098052979, + "num_tokens": 253627384.0, + "step": 2127 + }, + { + "entropy": 0.6400596424937248, + "epoch": 4.848588537211292, + "grad_norm": 0.52734375, + "learning_rate": 2.8342499708958827e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.994577944278717, + "num_tokens": 253746789.0, + "step": 2128 + }, + { + "entropy": 0.642471544444561, + "epoch": 4.8508696891930425, + "grad_norm": 0.4921875, + "learning_rate": 2.8323833372629485e-06, + "loss": 0.0133, + "mean_token_accuracy": 0.9958815798163414, + "num_tokens": 253866762.0, + "step": 2129 + }, + { + "entropy": 0.6395443677902222, + "epoch": 4.853150841174793, + "grad_norm": 0.4453125, + "learning_rate": 2.8305165149756496e-06, + "loss": 0.0141, + "mean_token_accuracy": 0.994530625641346, + "num_tokens": 253986149.0, + "step": 2130 + }, + { + "entropy": 0.6348731443285942, + "epoch": 4.855431993156544, + "grad_norm": 0.609375, + "learning_rate": 2.828649505093558e-06, + "loss": 0.0195, + "mean_token_accuracy": 0.9941309988498688, + "num_tokens": 254105506.0, + "step": 2131 + }, + { + "entropy": 0.6376974880695343, + "epoch": 4.857713145138295, + "grad_norm": 0.51171875, + "learning_rate": 2.826782308676351e-06, + "loss": 0.0209, + "mean_token_accuracy": 0.9931429103016853, + "num_tokens": 254224327.0, + "step": 2132 + }, + { + "entropy": 0.6398631185293198, + "epoch": 4.859994297120045, + "grad_norm": 0.5, + "learning_rate": 2.824914926783815e-06, + "loss": 0.0153, + "mean_token_accuracy": 0.9941440671682358, + "num_tokens": 254343885.0, + "step": 2133 + }, + { + "entropy": 0.6477411687374115, + "epoch": 4.862275449101796, + "grad_norm": 0.5859375, + "learning_rate": 2.82304736047584e-06, + "loss": 0.0161, + "mean_token_accuracy": 0.9945659264922142, + "num_tokens": 254463220.0, + "step": 2134 + }, + { + "entropy": 0.6448121219873428, + "epoch": 4.864556601083547, + "grad_norm": 0.5234375, + "learning_rate": 2.821179610812419e-06, + "loss": 0.0169, + "mean_token_accuracy": 0.9940013587474823, + "num_tokens": 254583072.0, + "step": 2135 + }, + { + "entropy": 0.6415137425065041, + "epoch": 4.8668377530652975, + "grad_norm": 0.53515625, + "learning_rate": 2.819311678853652e-06, + "loss": 0.0241, + "mean_token_accuracy": 0.9911446273326874, + "num_tokens": 254703100.0, + "step": 2136 + }, + { + "entropy": 0.6353111788630486, + "epoch": 4.869118905047049, + "grad_norm": 0.5390625, + "learning_rate": 2.8174435656597403e-06, + "loss": 0.0166, + "mean_token_accuracy": 0.9938672631978989, + "num_tokens": 254822638.0, + "step": 2137 + }, + { + "entropy": 0.6402861401438713, + "epoch": 4.8714000570288, + "grad_norm": 0.40234375, + "learning_rate": 2.8155752722909896e-06, + "loss": 0.0155, + "mean_token_accuracy": 0.9956029430031776, + "num_tokens": 254941949.0, + "step": 2138 + }, + { + "entropy": 0.6361711695790291, + "epoch": 4.873681209010551, + "grad_norm": 0.5625, + "learning_rate": 2.8137067998078073e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.994372770190239, + "num_tokens": 255060791.0, + "step": 2139 + }, + { + "entropy": 0.6344102695584297, + "epoch": 4.875962360992301, + "grad_norm": 0.6015625, + "learning_rate": 2.8118381492707004e-06, + "loss": 0.0217, + "mean_token_accuracy": 0.993817962706089, + "num_tokens": 255179653.0, + "step": 2140 + }, + { + "entropy": 0.6411167606711388, + "epoch": 4.878243512974052, + "grad_norm": 0.52734375, + "learning_rate": 2.8099693217402807e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9944401234388351, + "num_tokens": 255299698.0, + "step": 2141 + }, + { + "entropy": 0.6384032294154167, + "epoch": 4.880524664955803, + "grad_norm": 0.77734375, + "learning_rate": 2.808100318277258e-06, + "loss": 0.0292, + "mean_token_accuracy": 0.991107352077961, + "num_tokens": 255419250.0, + "step": 2142 + }, + { + "entropy": 0.643834225833416, + "epoch": 4.8828058169375534, + "grad_norm": 0.51953125, + "learning_rate": 2.806231139942443e-06, + "loss": 0.0225, + "mean_token_accuracy": 0.9942874759435654, + "num_tokens": 255539575.0, + "step": 2143 + }, + { + "entropy": 0.6405397355556488, + "epoch": 4.885086968919304, + "grad_norm": 0.66015625, + "learning_rate": 2.8043617877967456e-06, + "loss": 0.0159, + "mean_token_accuracy": 0.9944500178098679, + "num_tokens": 255658600.0, + "step": 2144 + }, + { + "entropy": 0.6457274034619331, + "epoch": 4.887368120901055, + "grad_norm": 0.63671875, + "learning_rate": 2.8024922629011727e-06, + "loss": 0.0205, + "mean_token_accuracy": 0.9941845238208771, + "num_tokens": 255778362.0, + "step": 2145 + }, + { + "entropy": 0.6409778818488121, + "epoch": 4.889649272882806, + "grad_norm": 0.474609375, + "learning_rate": 2.800622566316831e-06, + "loss": 0.0158, + "mean_token_accuracy": 0.9946346133947372, + "num_tokens": 255897533.0, + "step": 2146 + }, + { + "entropy": 0.6360712349414825, + "epoch": 4.891930424864556, + "grad_norm": 0.4296875, + "learning_rate": 2.798752699104925e-06, + "loss": 0.012, + "mean_token_accuracy": 0.9948798269033432, + "num_tokens": 256016415.0, + "step": 2147 + }, + { + "entropy": 0.6422446966171265, + "epoch": 4.894211576846307, + "grad_norm": 0.734375, + "learning_rate": 2.7968826623267542e-06, + "loss": 0.0246, + "mean_token_accuracy": 0.9915911927819252, + "num_tokens": 256135701.0, + "step": 2148 + }, + { + "entropy": 0.6381501331925392, + "epoch": 4.896492728828058, + "grad_norm": 0.71484375, + "learning_rate": 2.7950124570437163e-06, + "loss": 0.0241, + "mean_token_accuracy": 0.9941871985793114, + "num_tokens": 256255071.0, + "step": 2149 + }, + { + "entropy": 0.6395769193768501, + "epoch": 4.898773880809809, + "grad_norm": 0.51953125, + "learning_rate": 2.793142084317303e-06, + "loss": 0.0216, + "mean_token_accuracy": 0.9931281358003616, + "num_tokens": 256374024.0, + "step": 2150 + }, + { + "entropy": 0.6401519551873207, + "epoch": 4.90105503279156, + "grad_norm": 0.421875, + "learning_rate": 2.7912715452091014e-06, + "loss": 0.0097, + "mean_token_accuracy": 0.9969333186745644, + "num_tokens": 256493812.0, + "step": 2151 + }, + { + "entropy": 0.6369927078485489, + "epoch": 4.903336184773311, + "grad_norm": 0.7109375, + "learning_rate": 2.789400840780795e-06, + "loss": 0.0244, + "mean_token_accuracy": 0.9924291595816612, + "num_tokens": 256612840.0, + "step": 2152 + }, + { + "entropy": 0.6365250498056412, + "epoch": 4.9056173367550615, + "grad_norm": 0.56640625, + "learning_rate": 2.7875299720941577e-06, + "loss": 0.0225, + "mean_token_accuracy": 0.9946802780032158, + "num_tokens": 256732841.0, + "step": 2153 + }, + { + "entropy": 0.6422205567359924, + "epoch": 4.907898488736812, + "grad_norm": 0.68359375, + "learning_rate": 2.785658940211059e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.993583969771862, + "num_tokens": 256851991.0, + "step": 2154 + }, + { + "entropy": 0.6411515548825264, + "epoch": 4.910179640718563, + "grad_norm": 0.55078125, + "learning_rate": 2.7837877461934616e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9952225983142853, + "num_tokens": 256970763.0, + "step": 2155 + }, + { + "entropy": 0.6409463733434677, + "epoch": 4.912460792700314, + "grad_norm": 0.52734375, + "learning_rate": 2.7819163911034175e-06, + "loss": 0.0195, + "mean_token_accuracy": 0.9940475299954414, + "num_tokens": 257090105.0, + "step": 2156 + }, + { + "entropy": 0.6383676752448082, + "epoch": 4.914741944682064, + "grad_norm": 0.46484375, + "learning_rate": 2.7800448760030724e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.9934883117675781, + "num_tokens": 257209550.0, + "step": 2157 + }, + { + "entropy": 0.6358331367373466, + "epoch": 4.917023096663815, + "grad_norm": 0.50390625, + "learning_rate": 2.7781732019546625e-06, + "loss": 0.016, + "mean_token_accuracy": 0.9952009320259094, + "num_tokens": 257328917.0, + "step": 2158 + }, + { + "entropy": 0.6352043002843857, + "epoch": 4.919304248645566, + "grad_norm": 0.56640625, + "learning_rate": 2.776301370020513e-06, + "loss": 0.019, + "mean_token_accuracy": 0.9948874190449715, + "num_tokens": 257447353.0, + "step": 2159 + }, + { + "entropy": 0.6393407806754112, + "epoch": 4.9215854006273165, + "grad_norm": 0.4453125, + "learning_rate": 2.7744293812630412e-06, + "loss": 0.0184, + "mean_token_accuracy": 0.992662288248539, + "num_tokens": 257566616.0, + "step": 2160 + }, + { + "entropy": 0.6411138698458672, + "epoch": 4.923866552609067, + "grad_norm": 0.455078125, + "learning_rate": 2.77255723674475e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.9950527995824814, + "num_tokens": 257686209.0, + "step": 2161 + }, + { + "entropy": 0.6357845664024353, + "epoch": 4.926147704590818, + "grad_norm": 0.5703125, + "learning_rate": 2.770684937528233e-06, + "loss": 0.0236, + "mean_token_accuracy": 0.9917392060160637, + "num_tokens": 257805551.0, + "step": 2162 + }, + { + "entropy": 0.6451032012701035, + "epoch": 4.92842885657257, + "grad_norm": 0.51171875, + "learning_rate": 2.7688124846761716e-06, + "loss": 0.0158, + "mean_token_accuracy": 0.9949991405010223, + "num_tokens": 257924667.0, + "step": 2163 + }, + { + "entropy": 0.6439765244722366, + "epoch": 4.93071000855432, + "grad_norm": 0.55078125, + "learning_rate": 2.766939879251333e-06, + "loss": 0.0225, + "mean_token_accuracy": 0.9931012094020844, + "num_tokens": 258044512.0, + "step": 2164 + }, + { + "entropy": 0.6384392976760864, + "epoch": 4.932991160536071, + "grad_norm": 0.478515625, + "learning_rate": 2.7650671223165726e-06, + "loss": 0.0189, + "mean_token_accuracy": 0.9951884523034096, + "num_tokens": 258163571.0, + "step": 2165 + }, + { + "entropy": 0.6388044282793999, + "epoch": 4.935272312517822, + "grad_norm": 0.640625, + "learning_rate": 2.7631942149348313e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.995187371969223, + "num_tokens": 258283017.0, + "step": 2166 + }, + { + "entropy": 0.6406887322664261, + "epoch": 4.937553464499572, + "grad_norm": 0.5703125, + "learning_rate": 2.761321158169134e-06, + "loss": 0.0231, + "mean_token_accuracy": 0.9936549142003059, + "num_tokens": 258402453.0, + "step": 2167 + }, + { + "entropy": 0.6440480947494507, + "epoch": 4.939834616481323, + "grad_norm": 0.51171875, + "learning_rate": 2.759447953082593e-06, + "loss": 0.015, + "mean_token_accuracy": 0.9943573176860809, + "num_tokens": 258521281.0, + "step": 2168 + }, + { + "entropy": 0.644825704395771, + "epoch": 4.942115768463074, + "grad_norm": 0.384765625, + "learning_rate": 2.757574600738402e-06, + "loss": 0.0137, + "mean_token_accuracy": 0.9945191517472267, + "num_tokens": 258640067.0, + "step": 2169 + }, + { + "entropy": 0.6368236392736435, + "epoch": 4.944396920444825, + "grad_norm": 0.55078125, + "learning_rate": 2.755701102199841e-06, + "loss": 0.0188, + "mean_token_accuracy": 0.994302049279213, + "num_tokens": 258759363.0, + "step": 2170 + }, + { + "entropy": 0.638024814426899, + "epoch": 4.946678072426575, + "grad_norm": 0.5546875, + "learning_rate": 2.7538274585302707e-06, + "loss": 0.0129, + "mean_token_accuracy": 0.994593121111393, + "num_tokens": 258878864.0, + "step": 2171 + }, + { + "entropy": 0.6402984336018562, + "epoch": 4.948959224408326, + "grad_norm": 0.443359375, + "learning_rate": 2.751953670793135e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9963576421141624, + "num_tokens": 258998070.0, + "step": 2172 + }, + { + "entropy": 0.6457531899213791, + "epoch": 4.951240376390077, + "grad_norm": 0.474609375, + "learning_rate": 2.7500797400519595e-06, + "loss": 0.0135, + "mean_token_accuracy": 0.9956438913941383, + "num_tokens": 259117310.0, + "step": 2173 + }, + { + "entropy": 0.6415762081742287, + "epoch": 4.9535215283718275, + "grad_norm": 0.443359375, + "learning_rate": 2.7482056673703526e-06, + "loss": 0.0143, + "mean_token_accuracy": 0.9954273700714111, + "num_tokens": 259236865.0, + "step": 2174 + }, + { + "entropy": 0.6414747163653374, + "epoch": 4.955802680353578, + "grad_norm": 0.5234375, + "learning_rate": 2.746331453812e-06, + "loss": 0.0165, + "mean_token_accuracy": 0.9947013631463051, + "num_tokens": 259356005.0, + "step": 2175 + }, + { + "entropy": 0.6460990384221077, + "epoch": 4.95808383233533, + "grad_norm": 0.419921875, + "learning_rate": 2.74445710044067e-06, + "loss": 0.0148, + "mean_token_accuracy": 0.9944871366024017, + "num_tokens": 259475818.0, + "step": 2176 + }, + { + "entropy": 0.6412938460707664, + "epoch": 4.9603649843170805, + "grad_norm": 0.51171875, + "learning_rate": 2.7425826083202096e-06, + "loss": 0.018, + "mean_token_accuracy": 0.9950723350048065, + "num_tokens": 259594949.0, + "step": 2177 + }, + { + "entropy": 0.6431998237967491, + "epoch": 4.962646136298831, + "grad_norm": 0.498046875, + "learning_rate": 2.740707978514543e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.9949868991971016, + "num_tokens": 259714682.0, + "step": 2178 + }, + { + "entropy": 0.6391648277640343, + "epoch": 4.964927288280582, + "grad_norm": 0.48046875, + "learning_rate": 2.738833212087676e-06, + "loss": 0.0169, + "mean_token_accuracy": 0.9947412088513374, + "num_tokens": 259834264.0, + "step": 2179 + }, + { + "entropy": 0.6433719024062157, + "epoch": 4.967208440262333, + "grad_norm": 0.61328125, + "learning_rate": 2.736958310103688e-06, + "loss": 0.0161, + "mean_token_accuracy": 0.9938696324825287, + "num_tokens": 259953739.0, + "step": 2180 + }, + { + "entropy": 0.6372500136494637, + "epoch": 4.969489592244083, + "grad_norm": 0.5078125, + "learning_rate": 2.735083273626738e-06, + "loss": 0.0236, + "mean_token_accuracy": 0.9918456822633743, + "num_tokens": 260072517.0, + "step": 2181 + }, + { + "entropy": 0.6425298228859901, + "epoch": 4.971770744225834, + "grad_norm": 0.40625, + "learning_rate": 2.7332081037210607e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.995473101735115, + "num_tokens": 260192400.0, + "step": 2182 + }, + { + "entropy": 0.6380714923143387, + "epoch": 4.974051896207585, + "grad_norm": 0.396484375, + "learning_rate": 2.7313328014509653e-06, + "loss": 0.0139, + "mean_token_accuracy": 0.994620144367218, + "num_tokens": 260311515.0, + "step": 2183 + }, + { + "entropy": 0.6387203335762024, + "epoch": 4.9763330481893355, + "grad_norm": 0.4140625, + "learning_rate": 2.729457367880838e-06, + "loss": 0.0117, + "mean_token_accuracy": 0.9959128275513649, + "num_tokens": 260430939.0, + "step": 2184 + }, + { + "entropy": 0.6389815285801888, + "epoch": 4.978614200171086, + "grad_norm": 0.4765625, + "learning_rate": 2.727581804075139e-06, + "loss": 0.0206, + "mean_token_accuracy": 0.9933414831757545, + "num_tokens": 260549605.0, + "step": 2185 + }, + { + "entropy": 0.639744833111763, + "epoch": 4.980895352152837, + "grad_norm": 0.478515625, + "learning_rate": 2.7257061110984005e-06, + "loss": 0.0172, + "mean_token_accuracy": 0.9944678470492363, + "num_tokens": 260668846.0, + "step": 2186 + }, + { + "entropy": 0.6400456205010414, + "epoch": 4.983176504134588, + "grad_norm": 0.4375, + "learning_rate": 2.7238302900152327e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9954462125897408, + "num_tokens": 260787905.0, + "step": 2187 + }, + { + "entropy": 0.6389146223664284, + "epoch": 4.985457656116338, + "grad_norm": 0.62890625, + "learning_rate": 2.7219543418903115e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.994327187538147, + "num_tokens": 260906558.0, + "step": 2188 + }, + { + "entropy": 0.6396173015236855, + "epoch": 4.98773880809809, + "grad_norm": 0.5234375, + "learning_rate": 2.720078267788392e-06, + "loss": 0.0173, + "mean_token_accuracy": 0.9956508576869965, + "num_tokens": 261025595.0, + "step": 2189 + }, + { + "entropy": 0.6408304870128632, + "epoch": 4.99001996007984, + "grad_norm": 0.7578125, + "learning_rate": 2.718202068774296e-06, + "loss": 0.0296, + "mean_token_accuracy": 0.9916857704520226, + "num_tokens": 261145910.0, + "step": 2190 + }, + { + "entropy": 0.636724554002285, + "epoch": 4.992301112061591, + "grad_norm": 0.4453125, + "learning_rate": 2.7163257459129184e-06, + "loss": 0.0171, + "mean_token_accuracy": 0.9938696399331093, + "num_tokens": 261265385.0, + "step": 2191 + }, + { + "entropy": 0.6401726752519608, + "epoch": 4.994582264043342, + "grad_norm": 0.4765625, + "learning_rate": 2.7144493002692242e-06, + "loss": 0.0144, + "mean_token_accuracy": 0.9952089264988899, + "num_tokens": 261384851.0, + "step": 2192 + }, + { + "entropy": 0.6423357054591179, + "epoch": 4.996863416025093, + "grad_norm": 0.6875, + "learning_rate": 2.7125727329082474e-06, + "loss": 0.0296, + "mean_token_accuracy": 0.9910811558365822, + "num_tokens": 261504015.0, + "step": 2193 + }, + { + "entropy": 0.6411290988326073, + "epoch": 4.999144568006844, + "grad_norm": 0.427734375, + "learning_rate": 2.7106960448950904e-06, + "loss": 0.0143, + "mean_token_accuracy": 0.9958030730485916, + "num_tokens": 261623407.0, + "step": 2194 + }, + { + "entropy": 0.6388970812161764, + "epoch": 5.0, + "grad_norm": 0.71875, + "learning_rate": 2.7088192372949267e-06, + "loss": 0.0105, + "mean_token_accuracy": 0.9953737656275431, + "num_tokens": 261667390.0, + "step": 2195 + }, + { + "entropy": 0.6453615576028824, + "epoch": 5.002281151981751, + "grad_norm": 0.515625, + "learning_rate": 2.7069423111729948e-06, + "loss": 0.0212, + "mean_token_accuracy": 0.9919743165373802, + "num_tokens": 261786862.0, + "step": 2196 + }, + { + "entropy": 0.6398195251822472, + "epoch": 5.004562303963501, + "grad_norm": 0.55078125, + "learning_rate": 2.705065267594602e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.9942323714494705, + "num_tokens": 261906701.0, + "step": 2197 + }, + { + "entropy": 0.6387388855218887, + "epoch": 5.006843455945252, + "grad_norm": 0.482421875, + "learning_rate": 2.703188107625123e-06, + "loss": 0.016, + "mean_token_accuracy": 0.9949055016040802, + "num_tokens": 262025674.0, + "step": 2198 + }, + { + "entropy": 0.6403909400105476, + "epoch": 5.009124607927003, + "grad_norm": 0.5859375, + "learning_rate": 2.701310832329996e-06, + "loss": 0.0195, + "mean_token_accuracy": 0.9937449246644974, + "num_tokens": 262145018.0, + "step": 2199 + }, + { + "entropy": 0.6385556906461716, + "epoch": 5.011405759908754, + "grad_norm": 0.53515625, + "learning_rate": 2.6994334427747276e-06, + "loss": 0.0205, + "mean_token_accuracy": 0.9946856275200844, + "num_tokens": 262264207.0, + "step": 2200 + }, + { + "epoch": 5.011405759908754, + "eval_entropy": 0.641756937984278, + "eval_loss": 0.02066929079592228, + "eval_mean_token_accuracy": 0.993495723140557, + "eval_num_tokens": 262264207.0, + "eval_runtime": 177.7082, + "eval_samples_per_second": 47.184, + "eval_steps_per_second": 1.48, + "step": 2200 + }, + { + "entropy": 0.6430558636784554, + "epoch": 5.013686911890504, + "grad_norm": 0.54296875, + "learning_rate": 2.6975559400248876e-06, + "loss": 0.0214, + "mean_token_accuracy": 0.9951166734099388, + "num_tokens": 262384386.0, + "step": 2201 + }, + { + "entropy": 0.6416927129030228, + "epoch": 5.015968063872256, + "grad_norm": 0.6171875, + "learning_rate": 2.6956783251461093e-06, + "loss": 0.0209, + "mean_token_accuracy": 0.9943560734391212, + "num_tokens": 262504032.0, + "step": 2202 + }, + { + "entropy": 0.6378616541624069, + "epoch": 5.018249215854007, + "grad_norm": 0.59375, + "learning_rate": 2.6938005992040923e-06, + "loss": 0.0232, + "mean_token_accuracy": 0.9944946467876434, + "num_tokens": 262623398.0, + "step": 2203 + }, + { + "entropy": 0.6410540044307709, + "epoch": 5.020530367835757, + "grad_norm": 0.48046875, + "learning_rate": 2.6919227632645963e-06, + "loss": 0.0143, + "mean_token_accuracy": 0.9957112222909927, + "num_tokens": 262742538.0, + "step": 2204 + }, + { + "entropy": 0.6428038030862808, + "epoch": 5.022811519817508, + "grad_norm": 0.51953125, + "learning_rate": 2.690044818393444e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.994317963719368, + "num_tokens": 262861356.0, + "step": 2205 + }, + { + "entropy": 0.6409348621964455, + "epoch": 5.025092671799259, + "grad_norm": 0.546875, + "learning_rate": 2.688166765656523e-06, + "loss": 0.0246, + "mean_token_accuracy": 0.993347205221653, + "num_tokens": 262980865.0, + "step": 2206 + }, + { + "entropy": 0.6402907595038414, + "epoch": 5.0273738237810095, + "grad_norm": 0.5078125, + "learning_rate": 2.686288606119778e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9938965439796448, + "num_tokens": 263099934.0, + "step": 2207 + }, + { + "entropy": 0.6403698325157166, + "epoch": 5.02965497576276, + "grad_norm": 0.4375, + "learning_rate": 2.6844103408492165e-06, + "loss": 0.0117, + "mean_token_accuracy": 0.9959322512149811, + "num_tokens": 263219071.0, + "step": 2208 + }, + { + "entropy": 0.6396448016166687, + "epoch": 5.031936127744511, + "grad_norm": 0.5234375, + "learning_rate": 2.682531970910906e-06, + "loss": 0.0214, + "mean_token_accuracy": 0.9922057762742043, + "num_tokens": 263338813.0, + "step": 2209 + }, + { + "entropy": 0.6433576419949532, + "epoch": 5.034217279726262, + "grad_norm": 0.4296875, + "learning_rate": 2.6806534973709723e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.9947349950671196, + "num_tokens": 263458757.0, + "step": 2210 + }, + { + "entropy": 0.6400060728192329, + "epoch": 5.036498431708012, + "grad_norm": 0.498046875, + "learning_rate": 2.6787749212956023e-06, + "loss": 0.0179, + "mean_token_accuracy": 0.9941414222121239, + "num_tokens": 263577589.0, + "step": 2211 + }, + { + "entropy": 0.6349309086799622, + "epoch": 5.038779583689763, + "grad_norm": 0.58203125, + "learning_rate": 2.676896243751037e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.9925337433815002, + "num_tokens": 263697202.0, + "step": 2212 + }, + { + "entropy": 0.6400552541017532, + "epoch": 5.041060735671514, + "grad_norm": 0.5703125, + "learning_rate": 2.6750174658035793e-06, + "loss": 0.0177, + "mean_token_accuracy": 0.9933901280164719, + "num_tokens": 263816835.0, + "step": 2213 + }, + { + "entropy": 0.6422484070062637, + "epoch": 5.0433418876532645, + "grad_norm": 0.53125, + "learning_rate": 2.673138588519587e-06, + "loss": 0.0142, + "mean_token_accuracy": 0.995157927274704, + "num_tokens": 263936191.0, + "step": 2214 + }, + { + "entropy": 0.6384558975696564, + "epoch": 5.045623039635016, + "grad_norm": 0.50390625, + "learning_rate": 2.671259612965475e-06, + "loss": 0.0129, + "mean_token_accuracy": 0.9958095103502274, + "num_tokens": 264055184.0, + "step": 2215 + }, + { + "entropy": 0.6395909935235977, + "epoch": 5.047904191616767, + "grad_norm": 0.55859375, + "learning_rate": 2.6693805402077123e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9924144148826599, + "num_tokens": 264174343.0, + "step": 2216 + }, + { + "entropy": 0.6479180827736855, + "epoch": 5.050185343598518, + "grad_norm": 0.59375, + "learning_rate": 2.6675013713128252e-06, + "loss": 0.0172, + "mean_token_accuracy": 0.9945342615246773, + "num_tokens": 264294282.0, + "step": 2217 + }, + { + "entropy": 0.636158399283886, + "epoch": 5.052466495580268, + "grad_norm": 0.396484375, + "learning_rate": 2.665622107347393e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.9949643835425377, + "num_tokens": 264413601.0, + "step": 2218 + }, + { + "entropy": 0.6379253938794136, + "epoch": 5.054747647562019, + "grad_norm": 0.40625, + "learning_rate": 2.6637427493780503e-06, + "loss": 0.0135, + "mean_token_accuracy": 0.995668850839138, + "num_tokens": 264532354.0, + "step": 2219 + }, + { + "entropy": 0.6379868164658546, + "epoch": 5.05702879954377, + "grad_norm": 0.5546875, + "learning_rate": 2.6618632984714843e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.9935676231980324, + "num_tokens": 264652239.0, + "step": 2220 + }, + { + "entropy": 0.6406855434179306, + "epoch": 5.05930995152552, + "grad_norm": 0.6640625, + "learning_rate": 2.6599837556944353e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.9933048114180565, + "num_tokens": 264771006.0, + "step": 2221 + }, + { + "entropy": 0.6369809955358505, + "epoch": 5.061591103507271, + "grad_norm": 0.42578125, + "learning_rate": 2.658104122113695e-06, + "loss": 0.0155, + "mean_token_accuracy": 0.9950523152947426, + "num_tokens": 264890134.0, + "step": 2222 + }, + { + "entropy": 0.6410464867949486, + "epoch": 5.063872255489022, + "grad_norm": 0.45703125, + "learning_rate": 2.6562243987961066e-06, + "loss": 0.0169, + "mean_token_accuracy": 0.9950700849294662, + "num_tokens": 265009959.0, + "step": 2223 + }, + { + "entropy": 0.641699843108654, + "epoch": 5.066153407470773, + "grad_norm": 0.431640625, + "learning_rate": 2.6543445868085665e-06, + "loss": 0.0142, + "mean_token_accuracy": 0.9954672530293465, + "num_tokens": 265129698.0, + "step": 2224 + }, + { + "entropy": 0.6398424208164215, + "epoch": 5.068434559452523, + "grad_norm": 0.53515625, + "learning_rate": 2.652464687218018e-06, + "loss": 0.0159, + "mean_token_accuracy": 0.99419916421175, + "num_tokens": 265248933.0, + "step": 2225 + }, + { + "entropy": 0.6402295008301735, + "epoch": 5.070715711434274, + "grad_norm": 0.5, + "learning_rate": 2.6505847010914575e-06, + "loss": 0.0172, + "mean_token_accuracy": 0.9940444603562355, + "num_tokens": 265368007.0, + "step": 2226 + }, + { + "entropy": 0.6408577710390091, + "epoch": 5.072996863416025, + "grad_norm": 0.470703125, + "learning_rate": 2.6487046294959275e-06, + "loss": 0.0137, + "mean_token_accuracy": 0.994990237057209, + "num_tokens": 265487370.0, + "step": 2227 + }, + { + "entropy": 0.6362419724464417, + "epoch": 5.0752780153977755, + "grad_norm": 0.44140625, + "learning_rate": 2.64682447349852e-06, + "loss": 0.0126, + "mean_token_accuracy": 0.9956077411770821, + "num_tokens": 265605972.0, + "step": 2228 + }, + { + "entropy": 0.638429582118988, + "epoch": 5.077559167379527, + "grad_norm": 0.49609375, + "learning_rate": 2.6449442341663755e-06, + "loss": 0.0166, + "mean_token_accuracy": 0.9952346086502075, + "num_tokens": 265725535.0, + "step": 2229 + }, + { + "entropy": 0.6361569091677666, + "epoch": 5.079840319361278, + "grad_norm": 0.59765625, + "learning_rate": 2.643063912566683e-06, + "loss": 0.0232, + "mean_token_accuracy": 0.9924784302711487, + "num_tokens": 265844479.0, + "step": 2230 + }, + { + "entropy": 0.642608180642128, + "epoch": 5.0821214713430285, + "grad_norm": 0.42578125, + "learning_rate": 2.641183509766675e-06, + "loss": 0.013, + "mean_token_accuracy": 0.9965517073869705, + "num_tokens": 265963990.0, + "step": 2231 + }, + { + "entropy": 0.6351031064987183, + "epoch": 5.084402623324779, + "grad_norm": 0.57421875, + "learning_rate": 2.639303026833632e-06, + "loss": 0.0144, + "mean_token_accuracy": 0.9940142035484314, + "num_tokens": 266083448.0, + "step": 2232 + }, + { + "entropy": 0.6386179476976395, + "epoch": 5.08668377530653, + "grad_norm": 0.451171875, + "learning_rate": 2.6374224648348815e-06, + "loss": 0.0137, + "mean_token_accuracy": 0.9956497251987457, + "num_tokens": 266203106.0, + "step": 2233 + }, + { + "entropy": 0.6377930343151093, + "epoch": 5.088964927288281, + "grad_norm": 0.4921875, + "learning_rate": 2.6355418248377928e-06, + "loss": 0.0102, + "mean_token_accuracy": 0.9971481189131737, + "num_tokens": 266323099.0, + "step": 2234 + }, + { + "entropy": 0.641746923327446, + "epoch": 5.091246079270031, + "grad_norm": 0.484375, + "learning_rate": 2.633661107909781e-06, + "loss": 0.0144, + "mean_token_accuracy": 0.994440346956253, + "num_tokens": 266442485.0, + "step": 2235 + }, + { + "entropy": 0.6405847892165184, + "epoch": 5.093527231251782, + "grad_norm": 0.466796875, + "learning_rate": 2.6317803151183053e-06, + "loss": 0.0211, + "mean_token_accuracy": 0.9939571768045425, + "num_tokens": 266561763.0, + "step": 2236 + }, + { + "entropy": 0.6389883682131767, + "epoch": 5.095808383233533, + "grad_norm": 0.447265625, + "learning_rate": 2.629899447530866e-06, + "loss": 0.0143, + "mean_token_accuracy": 0.9949461072683334, + "num_tokens": 266682592.0, + "step": 2237 + }, + { + "entropy": 0.640551783144474, + "epoch": 5.0980895352152835, + "grad_norm": 0.5078125, + "learning_rate": 2.6280185062150084e-06, + "loss": 0.0161, + "mean_token_accuracy": 0.9944885596632957, + "num_tokens": 266801989.0, + "step": 2238 + }, + { + "entropy": 0.6343742236495018, + "epoch": 5.100370687197034, + "grad_norm": 0.45703125, + "learning_rate": 2.6261374922383176e-06, + "loss": 0.0121, + "mean_token_accuracy": 0.9964650645852089, + "num_tokens": 266921564.0, + "step": 2239 + }, + { + "entropy": 0.639545351266861, + "epoch": 5.102651839178785, + "grad_norm": 0.56640625, + "learning_rate": 2.6242564066684217e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9939071238040924, + "num_tokens": 267040526.0, + "step": 2240 + }, + { + "entropy": 0.64347043633461, + "epoch": 5.104932991160536, + "grad_norm": 0.439453125, + "learning_rate": 2.6223752505729884e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.9940275773406029, + "num_tokens": 267160452.0, + "step": 2241 + }, + { + "entropy": 0.6338881626725197, + "epoch": 5.107214143142287, + "grad_norm": 0.443359375, + "learning_rate": 2.6204940250197253e-06, + "loss": 0.0157, + "mean_token_accuracy": 0.995152197778225, + "num_tokens": 267279877.0, + "step": 2242 + }, + { + "entropy": 0.6360463500022888, + "epoch": 5.109495295124038, + "grad_norm": 0.66015625, + "learning_rate": 2.61861273107638e-06, + "loss": 0.0223, + "mean_token_accuracy": 0.9920280650258064, + "num_tokens": 267399540.0, + "step": 2243 + }, + { + "entropy": 0.6403748020529747, + "epoch": 5.111776447105789, + "grad_norm": 0.44921875, + "learning_rate": 2.6167313698107385e-06, + "loss": 0.0195, + "mean_token_accuracy": 0.9946764558553696, + "num_tokens": 267518808.0, + "step": 2244 + }, + { + "entropy": 0.6394345164299011, + "epoch": 5.114057599087539, + "grad_norm": 0.5, + "learning_rate": 2.6148499422906243e-06, + "loss": 0.0195, + "mean_token_accuracy": 0.9939241409301758, + "num_tokens": 267638299.0, + "step": 2245 + }, + { + "entropy": 0.6345800012350082, + "epoch": 5.11633875106929, + "grad_norm": 0.5625, + "learning_rate": 2.6129684495839013e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.9944344758987427, + "num_tokens": 267757534.0, + "step": 2246 + }, + { + "entropy": 0.6395940184593201, + "epoch": 5.118619903051041, + "grad_norm": 0.458984375, + "learning_rate": 2.611086892758467e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.9943375065922737, + "num_tokens": 267876633.0, + "step": 2247 + }, + { + "entropy": 0.6403714716434479, + "epoch": 5.120901055032792, + "grad_norm": 0.490234375, + "learning_rate": 2.6092052728822564e-06, + "loss": 0.0171, + "mean_token_accuracy": 0.9937417581677437, + "num_tokens": 267995916.0, + "step": 2248 + }, + { + "entropy": 0.6357563436031342, + "epoch": 5.123182207014542, + "grad_norm": 0.53125, + "learning_rate": 2.607323591023242e-06, + "loss": 0.0173, + "mean_token_accuracy": 0.994338370859623, + "num_tokens": 268115267.0, + "step": 2249 + }, + { + "entropy": 0.6401359289884567, + "epoch": 5.125463358996293, + "grad_norm": 0.6015625, + "learning_rate": 2.605441848249428e-06, + "loss": 0.0203, + "mean_token_accuracy": 0.9944301024079323, + "num_tokens": 268234870.0, + "step": 2250 + }, + { + "entropy": 0.6332013383507729, + "epoch": 5.127744510978044, + "grad_norm": 0.486328125, + "learning_rate": 2.6035600456288573e-06, + "loss": 0.0166, + "mean_token_accuracy": 0.9948499724268913, + "num_tokens": 268353620.0, + "step": 2251 + }, + { + "entropy": 0.6389177218079567, + "epoch": 5.1300256629597945, + "grad_norm": 0.4921875, + "learning_rate": 2.6016781842296044e-06, + "loss": 0.0169, + "mean_token_accuracy": 0.9936813786625862, + "num_tokens": 268472391.0, + "step": 2252 + }, + { + "entropy": 0.640631303191185, + "epoch": 5.132306814941545, + "grad_norm": 0.5234375, + "learning_rate": 2.599796265119777e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9943917393684387, + "num_tokens": 268591734.0, + "step": 2253 + }, + { + "entropy": 0.6418650820851326, + "epoch": 5.134587966923296, + "grad_norm": 0.51953125, + "learning_rate": 2.597914289367516e-06, + "loss": 0.013, + "mean_token_accuracy": 0.9956641718745232, + "num_tokens": 268711820.0, + "step": 2254 + }, + { + "entropy": 0.6369451433420181, + "epoch": 5.136869118905047, + "grad_norm": 0.5546875, + "learning_rate": 2.596032258040994e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.9930550307035446, + "num_tokens": 268831104.0, + "step": 2255 + }, + { + "entropy": 0.6395971179008484, + "epoch": 5.139150270886798, + "grad_norm": 0.54296875, + "learning_rate": 2.594150172208417e-06, + "loss": 0.0205, + "mean_token_accuracy": 0.9931889697909355, + "num_tokens": 268950259.0, + "step": 2256 + }, + { + "entropy": 0.6396232321858406, + "epoch": 5.141431422868549, + "grad_norm": 0.474609375, + "learning_rate": 2.59226803293802e-06, + "loss": 0.0127, + "mean_token_accuracy": 0.9950950145721436, + "num_tokens": 269069313.0, + "step": 2257 + }, + { + "entropy": 0.6342134475708008, + "epoch": 5.1437125748503, + "grad_norm": 0.484375, + "learning_rate": 2.5903858412980688e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.9942809715867043, + "num_tokens": 269188292.0, + "step": 2258 + }, + { + "entropy": 0.6362833976745605, + "epoch": 5.14599372683205, + "grad_norm": 0.6328125, + "learning_rate": 2.5885035983568584e-06, + "loss": 0.0258, + "mean_token_accuracy": 0.9927891865372658, + "num_tokens": 269308206.0, + "step": 2259 + }, + { + "entropy": 0.639234147965908, + "epoch": 5.148274878813801, + "grad_norm": 0.482421875, + "learning_rate": 2.5866213051827148e-06, + "loss": 0.016, + "mean_token_accuracy": 0.9946079701185226, + "num_tokens": 269427674.0, + "step": 2260 + }, + { + "entropy": 0.6387187391519547, + "epoch": 5.150556030795552, + "grad_norm": 0.486328125, + "learning_rate": 2.5847389628439905e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.9953724071383476, + "num_tokens": 269547295.0, + "step": 2261 + }, + { + "entropy": 0.6392261832952499, + "epoch": 5.1528371827773025, + "grad_norm": 0.53125, + "learning_rate": 2.5828565724090672e-06, + "loss": 0.0172, + "mean_token_accuracy": 0.9936652705073357, + "num_tokens": 269667417.0, + "step": 2262 + }, + { + "entropy": 0.6398108378052711, + "epoch": 5.155118334759053, + "grad_norm": 0.447265625, + "learning_rate": 2.5809741349463526e-06, + "loss": 0.0098, + "mean_token_accuracy": 0.9962107166647911, + "num_tokens": 269786507.0, + "step": 2263 + }, + { + "entropy": 0.6403687447309494, + "epoch": 5.157399486740804, + "grad_norm": 0.43359375, + "learning_rate": 2.579091651524282e-06, + "loss": 0.0136, + "mean_token_accuracy": 0.9943887963891029, + "num_tokens": 269905874.0, + "step": 2264 + }, + { + "entropy": 0.6373026594519615, + "epoch": 5.159680638722555, + "grad_norm": 0.55078125, + "learning_rate": 2.5772091232113176e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.9944737702608109, + "num_tokens": 270025701.0, + "step": 2265 + }, + { + "entropy": 0.6395878568291664, + "epoch": 5.161961790704305, + "grad_norm": 0.54296875, + "learning_rate": 2.575326551075945e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.9948684126138687, + "num_tokens": 270145011.0, + "step": 2266 + }, + { + "entropy": 0.6411764025688171, + "epoch": 5.164242942686056, + "grad_norm": 0.50390625, + "learning_rate": 2.5734439361866762e-06, + "loss": 0.0197, + "mean_token_accuracy": 0.9955326542258263, + "num_tokens": 270265372.0, + "step": 2267 + }, + { + "entropy": 0.6335322558879852, + "epoch": 5.166524094667807, + "grad_norm": 0.427734375, + "learning_rate": 2.571561279612047e-06, + "loss": 0.0114, + "mean_token_accuracy": 0.9957868754863739, + "num_tokens": 270384811.0, + "step": 2268 + }, + { + "entropy": 0.6376103013753891, + "epoch": 5.168805246649558, + "grad_norm": 0.5859375, + "learning_rate": 2.5696785824206177e-06, + "loss": 0.0212, + "mean_token_accuracy": 0.9946294948458672, + "num_tokens": 270503849.0, + "step": 2269 + }, + { + "entropy": 0.6355725303292274, + "epoch": 5.171086398631309, + "grad_norm": 0.5078125, + "learning_rate": 2.5677958456809703e-06, + "loss": 0.0219, + "mean_token_accuracy": 0.9933493509888649, + "num_tokens": 270622448.0, + "step": 2270 + }, + { + "entropy": 0.6420644596219063, + "epoch": 5.17336755061306, + "grad_norm": 0.53515625, + "learning_rate": 2.5659130704617092e-06, + "loss": 0.0214, + "mean_token_accuracy": 0.9929642081260681, + "num_tokens": 270742207.0, + "step": 2271 + }, + { + "entropy": 0.635376051068306, + "epoch": 5.175648702594811, + "grad_norm": 0.61328125, + "learning_rate": 2.5640302578314614e-06, + "loss": 0.021, + "mean_token_accuracy": 0.9928009733557701, + "num_tokens": 270861560.0, + "step": 2272 + }, + { + "entropy": 0.6427772864699364, + "epoch": 5.177929854576561, + "grad_norm": 0.53125, + "learning_rate": 2.562147408858876e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.9941305890679359, + "num_tokens": 270980980.0, + "step": 2273 + }, + { + "entropy": 0.6425194144248962, + "epoch": 5.180211006558312, + "grad_norm": 0.66015625, + "learning_rate": 2.5602645246126207e-06, + "loss": 0.0251, + "mean_token_accuracy": 0.9925933480262756, + "num_tokens": 271100374.0, + "step": 2274 + }, + { + "entropy": 0.6377389207482338, + "epoch": 5.182492158540063, + "grad_norm": 0.59765625, + "learning_rate": 2.5583816061613847e-06, + "loss": 0.0222, + "mean_token_accuracy": 0.9937486872076988, + "num_tokens": 271219750.0, + "step": 2275 + }, + { + "entropy": 0.6383015960454941, + "epoch": 5.1847733105218134, + "grad_norm": 0.54296875, + "learning_rate": 2.5564986545738767e-06, + "loss": 0.016, + "mean_token_accuracy": 0.9938366562128067, + "num_tokens": 271339482.0, + "step": 2276 + }, + { + "entropy": 0.6376020088791847, + "epoch": 5.187054462503564, + "grad_norm": 0.515625, + "learning_rate": 2.554615670918823e-06, + "loss": 0.0147, + "mean_token_accuracy": 0.9963782727718353, + "num_tokens": 271458112.0, + "step": 2277 + }, + { + "entropy": 0.6395053416490555, + "epoch": 5.189335614485315, + "grad_norm": 0.50390625, + "learning_rate": 2.552732656264969e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9960042834281921, + "num_tokens": 271577952.0, + "step": 2278 + }, + { + "entropy": 0.6401731818914413, + "epoch": 5.191616766467066, + "grad_norm": 0.640625, + "learning_rate": 2.5508496116810766e-06, + "loss": 0.0276, + "mean_token_accuracy": 0.9936056211590767, + "num_tokens": 271697161.0, + "step": 2279 + }, + { + "entropy": 0.6384700909256935, + "epoch": 5.193897918448816, + "grad_norm": 0.396484375, + "learning_rate": 2.548966538235927e-06, + "loss": 0.0128, + "mean_token_accuracy": 0.9959568604826927, + "num_tokens": 271816292.0, + "step": 2280 + }, + { + "entropy": 0.6380911096930504, + "epoch": 5.196179070430567, + "grad_norm": 0.5, + "learning_rate": 2.547083436998316e-06, + "loss": 0.0162, + "mean_token_accuracy": 0.9962186440825462, + "num_tokens": 271935208.0, + "step": 2281 + }, + { + "entropy": 0.6406601518392563, + "epoch": 5.198460222412319, + "grad_norm": 0.474609375, + "learning_rate": 2.5452003090370543e-06, + "loss": 0.0161, + "mean_token_accuracy": 0.9948535338044167, + "num_tokens": 272054366.0, + "step": 2282 + }, + { + "entropy": 0.6364331319928169, + "epoch": 5.200741374394069, + "grad_norm": 0.46875, + "learning_rate": 2.5433171554209694e-06, + "loss": 0.0121, + "mean_token_accuracy": 0.9966221898794174, + "num_tokens": 272173562.0, + "step": 2283 + }, + { + "entropy": 0.6382191702723503, + "epoch": 5.20302252637582, + "grad_norm": 0.44921875, + "learning_rate": 2.5414339772189045e-06, + "loss": 0.0103, + "mean_token_accuracy": 0.9961083754897118, + "num_tokens": 272293244.0, + "step": 2284 + }, + { + "entropy": 0.639429934322834, + "epoch": 5.205303678357571, + "grad_norm": 0.423828125, + "learning_rate": 2.5395507754997135e-06, + "loss": 0.0153, + "mean_token_accuracy": 0.9963374733924866, + "num_tokens": 272413261.0, + "step": 2285 + }, + { + "entropy": 0.6411982700228691, + "epoch": 5.2075848303393215, + "grad_norm": 0.56640625, + "learning_rate": 2.5376675513322665e-06, + "loss": 0.0181, + "mean_token_accuracy": 0.9951925948262215, + "num_tokens": 272532217.0, + "step": 2286 + }, + { + "entropy": 0.6443229466676712, + "epoch": 5.209865982321072, + "grad_norm": 0.51171875, + "learning_rate": 2.535784305785443e-06, + "loss": 0.0169, + "mean_token_accuracy": 0.9946814849972725, + "num_tokens": 272651721.0, + "step": 2287 + }, + { + "entropy": 0.6400162875652313, + "epoch": 5.212147134302823, + "grad_norm": 0.76171875, + "learning_rate": 2.5339010399281394e-06, + "loss": 0.0256, + "mean_token_accuracy": 0.9923993945121765, + "num_tokens": 272771413.0, + "step": 2288 + }, + { + "entropy": 0.6350873932242393, + "epoch": 5.214428286284574, + "grad_norm": 0.423828125, + "learning_rate": 2.53201775482926e-06, + "loss": 0.018, + "mean_token_accuracy": 0.9954353347420692, + "num_tokens": 272890591.0, + "step": 2289 + }, + { + "entropy": 0.6379029676318169, + "epoch": 5.216709438266324, + "grad_norm": 0.4375, + "learning_rate": 2.530134451557722e-06, + "loss": 0.012, + "mean_token_accuracy": 0.9965425357222557, + "num_tokens": 273010070.0, + "step": 2290 + }, + { + "entropy": 0.646380215883255, + "epoch": 5.218990590248075, + "grad_norm": 0.50390625, + "learning_rate": 2.52825113118245e-06, + "loss": 0.0169, + "mean_token_accuracy": 0.9942713156342506, + "num_tokens": 273129699.0, + "step": 2291 + }, + { + "entropy": 0.6449098587036133, + "epoch": 5.221271742229826, + "grad_norm": 0.51953125, + "learning_rate": 2.5263677947723813e-06, + "loss": 0.0236, + "mean_token_accuracy": 0.9932804554700851, + "num_tokens": 273249021.0, + "step": 2292 + }, + { + "entropy": 0.639538861811161, + "epoch": 5.2235528942115765, + "grad_norm": 0.54296875, + "learning_rate": 2.5244844433964615e-06, + "loss": 0.0201, + "mean_token_accuracy": 0.9946067482233047, + "num_tokens": 273368887.0, + "step": 2293 + }, + { + "entropy": 0.6329449117183685, + "epoch": 5.225834046193327, + "grad_norm": 0.48828125, + "learning_rate": 2.522601078123645e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9946059137582779, + "num_tokens": 273488604.0, + "step": 2294 + }, + { + "entropy": 0.6353581622242928, + "epoch": 5.228115198175079, + "grad_norm": 0.470703125, + "learning_rate": 2.5207177000228916e-06, + "loss": 0.0184, + "mean_token_accuracy": 0.9952534809708595, + "num_tokens": 273607331.0, + "step": 2295 + }, + { + "entropy": 0.6402522996068001, + "epoch": 5.23039635015683, + "grad_norm": 0.39453125, + "learning_rate": 2.5188343101631717e-06, + "loss": 0.0131, + "mean_token_accuracy": 0.9959250837564468, + "num_tokens": 273726663.0, + "step": 2296 + }, + { + "entropy": 0.6396752893924713, + "epoch": 5.23267750213858, + "grad_norm": 0.494140625, + "learning_rate": 2.516950909613462e-06, + "loss": 0.011, + "mean_token_accuracy": 0.9948886260390282, + "num_tokens": 273846199.0, + "step": 2297 + }, + { + "entropy": 0.6407026648521423, + "epoch": 5.234958654120331, + "grad_norm": 0.51171875, + "learning_rate": 2.5150674994427427e-06, + "loss": 0.0165, + "mean_token_accuracy": 0.9955054000020027, + "num_tokens": 273965285.0, + "step": 2298 + }, + { + "entropy": 0.6397402733564377, + "epoch": 5.237239806102082, + "grad_norm": 0.451171875, + "learning_rate": 2.5131840807200015e-06, + "loss": 0.013, + "mean_token_accuracy": 0.9961011484265327, + "num_tokens": 274086349.0, + "step": 2299 + }, + { + "entropy": 0.6363022476434708, + "epoch": 5.2395209580838324, + "grad_norm": 0.58203125, + "learning_rate": 2.511300654514231e-06, + "loss": 0.0207, + "mean_token_accuracy": 0.9935476928949356, + "num_tokens": 274205190.0, + "step": 2300 + }, + { + "entropy": 0.6386421769857407, + "epoch": 5.241802110065583, + "grad_norm": 0.408203125, + "learning_rate": 2.5094172218944276e-06, + "loss": 0.0117, + "mean_token_accuracy": 0.9970376417040825, + "num_tokens": 274325001.0, + "step": 2301 + }, + { + "entropy": 0.6367201283574104, + "epoch": 5.244083262047334, + "grad_norm": 0.46484375, + "learning_rate": 2.5075337839295903e-06, + "loss": 0.0136, + "mean_token_accuracy": 0.9951942339539528, + "num_tokens": 274443965.0, + "step": 2302 + }, + { + "entropy": 0.6347163766622543, + "epoch": 5.246364414029085, + "grad_norm": 0.54296875, + "learning_rate": 2.5056503416887222e-06, + "loss": 0.022, + "mean_token_accuracy": 0.9940134510397911, + "num_tokens": 274563301.0, + "step": 2303 + }, + { + "entropy": 0.6439990550279617, + "epoch": 5.248645566010835, + "grad_norm": 0.46484375, + "learning_rate": 2.5037668962408295e-06, + "loss": 0.0197, + "mean_token_accuracy": 0.9947071969509125, + "num_tokens": 274682811.0, + "step": 2304 + }, + { + "entropy": 0.6363752409815788, + "epoch": 5.250926717992586, + "grad_norm": 0.55078125, + "learning_rate": 2.5018834486549198e-06, + "loss": 0.0218, + "mean_token_accuracy": 0.9933529272675514, + "num_tokens": 274802363.0, + "step": 2305 + }, + { + "entropy": 0.6366180777549744, + "epoch": 5.253207869974337, + "grad_norm": 0.447265625, + "learning_rate": 2.5e-06, + "loss": 0.0134, + "mean_token_accuracy": 0.9951946660876274, + "num_tokens": 274921895.0, + "step": 2306 + }, + { + "entropy": 0.6379761770367622, + "epoch": 5.2554890219560875, + "grad_norm": 0.384765625, + "learning_rate": 2.4981165513450807e-06, + "loss": 0.0106, + "mean_token_accuracy": 0.9971502870321274, + "num_tokens": 275040539.0, + "step": 2307 + }, + { + "entropy": 0.6387635543942451, + "epoch": 5.257770173937839, + "grad_norm": 0.48046875, + "learning_rate": 2.4962331037591705e-06, + "loss": 0.0114, + "mean_token_accuracy": 0.9961639121174812, + "num_tokens": 275160024.0, + "step": 2308 + }, + { + "entropy": 0.6394033208489418, + "epoch": 5.26005132591959, + "grad_norm": 0.58203125, + "learning_rate": 2.494349658311279e-06, + "loss": 0.0168, + "mean_token_accuracy": 0.9946499317884445, + "num_tokens": 275278712.0, + "step": 2309 + }, + { + "entropy": 0.638643242418766, + "epoch": 5.2623324779013405, + "grad_norm": 0.494140625, + "learning_rate": 2.492466216070411e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9954987615346909, + "num_tokens": 275398949.0, + "step": 2310 + }, + { + "entropy": 0.6383827850222588, + "epoch": 5.264613629883091, + "grad_norm": 0.7421875, + "learning_rate": 2.4905827781055733e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.9929229617118835, + "num_tokens": 275517709.0, + "step": 2311 + }, + { + "entropy": 0.6404964625835419, + "epoch": 5.266894781864842, + "grad_norm": 0.484375, + "learning_rate": 2.4886993454857696e-06, + "loss": 0.0192, + "mean_token_accuracy": 0.9947107955813408, + "num_tokens": 275637634.0, + "step": 2312 + }, + { + "entropy": 0.6334439814090729, + "epoch": 5.269175933846593, + "grad_norm": 0.53515625, + "learning_rate": 2.486815919279999e-06, + "loss": 0.0158, + "mean_token_accuracy": 0.9946572333574295, + "num_tokens": 275756617.0, + "step": 2313 + }, + { + "entropy": 0.6385932937264442, + "epoch": 5.271457085828343, + "grad_norm": 0.52734375, + "learning_rate": 2.4849325005572573e-06, + "loss": 0.0148, + "mean_token_accuracy": 0.9948386028409004, + "num_tokens": 275875611.0, + "step": 2314 + }, + { + "entropy": 0.6406770497560501, + "epoch": 5.273738237810094, + "grad_norm": 0.419921875, + "learning_rate": 2.483049090386539e-06, + "loss": 0.0107, + "mean_token_accuracy": 0.9969892501831055, + "num_tokens": 275994588.0, + "step": 2315 + }, + { + "entropy": 0.6394596695899963, + "epoch": 5.276019389791845, + "grad_norm": 0.6015625, + "learning_rate": 2.4811656898368287e-06, + "loss": 0.022, + "mean_token_accuracy": 0.992605485022068, + "num_tokens": 276113917.0, + "step": 2316 + }, + { + "entropy": 0.6403463333845139, + "epoch": 5.2783005417735955, + "grad_norm": 0.58984375, + "learning_rate": 2.4792822999771092e-06, + "loss": 0.0117, + "mean_token_accuracy": 0.9945822358131409, + "num_tokens": 276232990.0, + "step": 2317 + }, + { + "entropy": 0.6376551985740662, + "epoch": 5.280581693755346, + "grad_norm": 0.5078125, + "learning_rate": 2.477398921876356e-06, + "loss": 0.0197, + "mean_token_accuracy": 0.9931498542428017, + "num_tokens": 276351987.0, + "step": 2318 + }, + { + "entropy": 0.6418017670512199, + "epoch": 5.282862845737097, + "grad_norm": 0.46484375, + "learning_rate": 2.475515556603539e-06, + "loss": 0.0127, + "mean_token_accuracy": 0.9959842786192894, + "num_tokens": 276471198.0, + "step": 2319 + }, + { + "entropy": 0.6406931728124619, + "epoch": 5.285143997718848, + "grad_norm": 0.5546875, + "learning_rate": 2.47363220522762e-06, + "loss": 0.0188, + "mean_token_accuracy": 0.9930545911192894, + "num_tokens": 276590664.0, + "step": 2320 + }, + { + "entropy": 0.6346911862492561, + "epoch": 5.287425149700598, + "grad_norm": 0.51171875, + "learning_rate": 2.4717488688175513e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.9956279322504997, + "num_tokens": 276709459.0, + "step": 2321 + }, + { + "entropy": 0.6397387087345123, + "epoch": 5.289706301682349, + "grad_norm": 0.4453125, + "learning_rate": 2.469865548442279e-06, + "loss": 0.0137, + "mean_token_accuracy": 0.9963891804218292, + "num_tokens": 276829029.0, + "step": 2322 + }, + { + "entropy": 0.6375634372234344, + "epoch": 5.291987453664101, + "grad_norm": 0.3984375, + "learning_rate": 2.4679822451707404e-06, + "loss": 0.0115, + "mean_token_accuracy": 0.9966582730412483, + "num_tokens": 276948133.0, + "step": 2323 + }, + { + "entropy": 0.6377409547567368, + "epoch": 5.2942686056458514, + "grad_norm": 0.46484375, + "learning_rate": 2.4660989600718606e-06, + "loss": 0.0113, + "mean_token_accuracy": 0.9962807074189186, + "num_tokens": 277067685.0, + "step": 2324 + }, + { + "entropy": 0.6380213126540184, + "epoch": 5.296549757627602, + "grad_norm": 0.515625, + "learning_rate": 2.4642156942145577e-06, + "loss": 0.0137, + "mean_token_accuracy": 0.994778461754322, + "num_tokens": 277186906.0, + "step": 2325 + }, + { + "entropy": 0.6368103697896004, + "epoch": 5.298830909609353, + "grad_norm": 0.60546875, + "learning_rate": 2.4623324486677352e-06, + "loss": 0.0226, + "mean_token_accuracy": 0.9943451434373856, + "num_tokens": 277306274.0, + "step": 2326 + }, + { + "entropy": 0.6412858739495277, + "epoch": 5.301112061591104, + "grad_norm": 0.63671875, + "learning_rate": 2.4604492245002873e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9941505417227745, + "num_tokens": 277425565.0, + "step": 2327 + }, + { + "entropy": 0.6364952027797699, + "epoch": 5.303393213572854, + "grad_norm": 0.412109375, + "learning_rate": 2.4585660227810963e-06, + "loss": 0.0115, + "mean_token_accuracy": 0.9961299747228622, + "num_tokens": 277545210.0, + "step": 2328 + }, + { + "entropy": 0.638011746108532, + "epoch": 5.305674365554605, + "grad_norm": 0.53125, + "learning_rate": 2.4566828445790306e-06, + "loss": 0.0195, + "mean_token_accuracy": 0.9922657161951065, + "num_tokens": 277663763.0, + "step": 2329 + }, + { + "entropy": 0.6387683153152466, + "epoch": 5.307955517536356, + "grad_norm": 0.53125, + "learning_rate": 2.454799690962946e-06, + "loss": 0.0143, + "mean_token_accuracy": 0.9962317645549774, + "num_tokens": 277782863.0, + "step": 2330 + }, + { + "entropy": 0.6414927393198013, + "epoch": 5.3102366695181065, + "grad_norm": 0.578125, + "learning_rate": 2.4529165630016855e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.9955480545759201, + "num_tokens": 277901767.0, + "step": 2331 + }, + { + "entropy": 0.636423796415329, + "epoch": 5.312517821499857, + "grad_norm": 0.52734375, + "learning_rate": 2.4510334617640733e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9937384203076363, + "num_tokens": 278020857.0, + "step": 2332 + }, + { + "entropy": 0.6411848068237305, + "epoch": 5.314798973481608, + "grad_norm": 0.51171875, + "learning_rate": 2.4491503883189242e-06, + "loss": 0.013, + "mean_token_accuracy": 0.9961826875805855, + "num_tokens": 278140364.0, + "step": 2333 + }, + { + "entropy": 0.6364300400018692, + "epoch": 5.317080125463359, + "grad_norm": 0.640625, + "learning_rate": 2.447267343735032e-06, + "loss": 0.0196, + "mean_token_accuracy": 0.9947511181235313, + "num_tokens": 278259437.0, + "step": 2334 + }, + { + "entropy": 0.6390166878700256, + "epoch": 5.319361277445109, + "grad_norm": 0.6171875, + "learning_rate": 2.4453843290811772e-06, + "loss": 0.023, + "mean_token_accuracy": 0.9934451207518578, + "num_tokens": 278378842.0, + "step": 2335 + }, + { + "entropy": 0.6324476972222328, + "epoch": 5.321642429426861, + "grad_norm": 0.52734375, + "learning_rate": 2.4435013454261246e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.9935642406344414, + "num_tokens": 278497516.0, + "step": 2336 + }, + { + "entropy": 0.6408939883112907, + "epoch": 5.323923581408612, + "grad_norm": 0.66796875, + "learning_rate": 2.4416183938386157e-06, + "loss": 0.0215, + "mean_token_accuracy": 0.9923098832368851, + "num_tokens": 278617250.0, + "step": 2337 + }, + { + "entropy": 0.6332000866532326, + "epoch": 5.326204733390362, + "grad_norm": 0.63671875, + "learning_rate": 2.4397354753873797e-06, + "loss": 0.0236, + "mean_token_accuracy": 0.9923102483153343, + "num_tokens": 278736777.0, + "step": 2338 + }, + { + "entropy": 0.6385438144207001, + "epoch": 5.328485885372113, + "grad_norm": 0.515625, + "learning_rate": 2.4378525911411246e-06, + "loss": 0.0159, + "mean_token_accuracy": 0.9945679903030396, + "num_tokens": 278855730.0, + "step": 2339 + }, + { + "entropy": 0.6349846869707108, + "epoch": 5.330767037353864, + "grad_norm": 0.6171875, + "learning_rate": 2.435969742168539e-06, + "loss": 0.0192, + "mean_token_accuracy": 0.9935977458953857, + "num_tokens": 278974867.0, + "step": 2340 + }, + { + "entropy": 0.6346960961818695, + "epoch": 5.3330481893356145, + "grad_norm": 0.51953125, + "learning_rate": 2.4340869295382924e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.9948094710707664, + "num_tokens": 279094223.0, + "step": 2341 + }, + { + "entropy": 0.6397505924105644, + "epoch": 5.335329341317365, + "grad_norm": 0.578125, + "learning_rate": 2.432204154319031e-06, + "loss": 0.02, + "mean_token_accuracy": 0.9926820322871208, + "num_tokens": 279214152.0, + "step": 2342 + }, + { + "entropy": 0.637494184076786, + "epoch": 5.337610493299116, + "grad_norm": 0.64453125, + "learning_rate": 2.4303214175793827e-06, + "loss": 0.0241, + "mean_token_accuracy": 0.9949726387858391, + "num_tokens": 279333509.0, + "step": 2343 + }, + { + "entropy": 0.6365151181817055, + "epoch": 5.339891645280867, + "grad_norm": 0.515625, + "learning_rate": 2.4284387203879536e-06, + "loss": 0.0148, + "mean_token_accuracy": 0.9947508424520493, + "num_tokens": 279453827.0, + "step": 2344 + }, + { + "entropy": 0.6411127969622612, + "epoch": 5.342172797262617, + "grad_norm": 0.60546875, + "learning_rate": 2.426556063813324e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.9934843555092812, + "num_tokens": 279573963.0, + "step": 2345 + }, + { + "entropy": 0.6369736194610596, + "epoch": 5.344453949244368, + "grad_norm": 0.609375, + "learning_rate": 2.4246734489240554e-06, + "loss": 0.0225, + "mean_token_accuracy": 0.9925227388739586, + "num_tokens": 279692956.0, + "step": 2346 + }, + { + "entropy": 0.6406808793544769, + "epoch": 5.346735101226119, + "grad_norm": 0.546875, + "learning_rate": 2.4227908767886837e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.993464358150959, + "num_tokens": 279813150.0, + "step": 2347 + }, + { + "entropy": 0.6337300166487694, + "epoch": 5.3490162532078696, + "grad_norm": 0.51953125, + "learning_rate": 2.420908348475719e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.9946438744664192, + "num_tokens": 279932152.0, + "step": 2348 + }, + { + "entropy": 0.6374392509460449, + "epoch": 5.351297405189621, + "grad_norm": 0.44140625, + "learning_rate": 2.4190258650536483e-06, + "loss": 0.0135, + "mean_token_accuracy": 0.9955737963318825, + "num_tokens": 280051447.0, + "step": 2349 + }, + { + "entropy": 0.6398628652095795, + "epoch": 5.353578557171372, + "grad_norm": 0.48828125, + "learning_rate": 2.417143427590933e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.994400791823864, + "num_tokens": 280171362.0, + "step": 2350 + }, + { + "entropy": 0.6358270198106766, + "epoch": 5.355859709153123, + "grad_norm": 0.451171875, + "learning_rate": 2.4152610371560095e-06, + "loss": 0.0134, + "mean_token_accuracy": 0.995156928896904, + "num_tokens": 280290798.0, + "step": 2351 + }, + { + "entropy": 0.6421844884753227, + "epoch": 5.358140861134873, + "grad_norm": 0.412109375, + "learning_rate": 2.413378694817286e-06, + "loss": 0.0161, + "mean_token_accuracy": 0.9951684921979904, + "num_tokens": 280410634.0, + "step": 2352 + }, + { + "entropy": 0.6373702734708786, + "epoch": 5.360422013116624, + "grad_norm": 0.40625, + "learning_rate": 2.411496401643142e-06, + "loss": 0.0133, + "mean_token_accuracy": 0.9957727342844009, + "num_tokens": 280530167.0, + "step": 2353 + }, + { + "entropy": 0.6376002058386803, + "epoch": 5.362703165098375, + "grad_norm": 0.48828125, + "learning_rate": 2.409614158701932e-06, + "loss": 0.013, + "mean_token_accuracy": 0.996869370341301, + "num_tokens": 280649796.0, + "step": 2354 + }, + { + "entropy": 0.6368831843137741, + "epoch": 5.3649843170801255, + "grad_norm": 0.6328125, + "learning_rate": 2.407731967061981e-06, + "loss": 0.0168, + "mean_token_accuracy": 0.994355745613575, + "num_tokens": 280769410.0, + "step": 2355 + }, + { + "entropy": 0.6377579942345619, + "epoch": 5.367265469061876, + "grad_norm": 0.5546875, + "learning_rate": 2.4058498277915835e-06, + "loss": 0.019, + "mean_token_accuracy": 0.9930682554841042, + "num_tokens": 280887926.0, + "step": 2356 + }, + { + "entropy": 0.6396503746509552, + "epoch": 5.369546621043627, + "grad_norm": 0.57421875, + "learning_rate": 2.4039677419590064e-06, + "loss": 0.0203, + "mean_token_accuracy": 0.9935258701443672, + "num_tokens": 281007492.0, + "step": 2357 + }, + { + "entropy": 0.6397499144077301, + "epoch": 5.371827773025378, + "grad_norm": 0.58203125, + "learning_rate": 2.4020857106324853e-06, + "loss": 0.0262, + "mean_token_accuracy": 0.992737703025341, + "num_tokens": 281126831.0, + "step": 2358 + }, + { + "entropy": 0.6393130645155907, + "epoch": 5.374108925007128, + "grad_norm": 0.70703125, + "learning_rate": 2.4002037348802245e-06, + "loss": 0.0205, + "mean_token_accuracy": 0.993714764714241, + "num_tokens": 281246234.0, + "step": 2359 + }, + { + "entropy": 0.6394072026014328, + "epoch": 5.376390076988879, + "grad_norm": 0.58203125, + "learning_rate": 2.3983218157703964e-06, + "loss": 0.0175, + "mean_token_accuracy": 0.9939935281872749, + "num_tokens": 281365462.0, + "step": 2360 + }, + { + "entropy": 0.6415555328130722, + "epoch": 5.37867122897063, + "grad_norm": 0.447265625, + "learning_rate": 2.3964399543711427e-06, + "loss": 0.0102, + "mean_token_accuracy": 0.9968991279602051, + "num_tokens": 281485204.0, + "step": 2361 + }, + { + "entropy": 0.6359155401587486, + "epoch": 5.380952380952381, + "grad_norm": 0.5703125, + "learning_rate": 2.394558151750572e-06, + "loss": 0.0162, + "mean_token_accuracy": 0.9946882054209709, + "num_tokens": 281604945.0, + "step": 2362 + }, + { + "entropy": 0.6450681909918785, + "epoch": 5.383233532934132, + "grad_norm": 0.458984375, + "learning_rate": 2.3926764089767594e-06, + "loss": 0.0165, + "mean_token_accuracy": 0.9951719269156456, + "num_tokens": 281724112.0, + "step": 2363 + }, + { + "entropy": 0.6395528763532639, + "epoch": 5.385514684915883, + "grad_norm": 0.51171875, + "learning_rate": 2.3907947271177444e-06, + "loss": 0.0139, + "mean_token_accuracy": 0.9954202473163605, + "num_tokens": 281843221.0, + "step": 2364 + }, + { + "entropy": 0.638059139251709, + "epoch": 5.3877958368976335, + "grad_norm": 0.53515625, + "learning_rate": 2.388913107241534e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9944328293204308, + "num_tokens": 281961830.0, + "step": 2365 + }, + { + "entropy": 0.6382259652018547, + "epoch": 5.390076988879384, + "grad_norm": 0.451171875, + "learning_rate": 2.3870315504160995e-06, + "loss": 0.016, + "mean_token_accuracy": 0.9953602328896523, + "num_tokens": 282081422.0, + "step": 2366 + }, + { + "entropy": 0.6365378126502037, + "epoch": 5.392358140861135, + "grad_norm": 0.515625, + "learning_rate": 2.3851500577093757e-06, + "loss": 0.0196, + "mean_token_accuracy": 0.9939025640487671, + "num_tokens": 282200900.0, + "step": 2367 + }, + { + "entropy": 0.6386839672923088, + "epoch": 5.394639292842886, + "grad_norm": 0.66796875, + "learning_rate": 2.3832686301892628e-06, + "loss": 0.021, + "mean_token_accuracy": 0.9945534318685532, + "num_tokens": 282320026.0, + "step": 2368 + }, + { + "entropy": 0.6426062434911728, + "epoch": 5.396920444824636, + "grad_norm": 0.52734375, + "learning_rate": 2.381387268923621e-06, + "loss": 0.0241, + "mean_token_accuracy": 0.9936311170458794, + "num_tokens": 282439443.0, + "step": 2369 + }, + { + "entropy": 0.6380885541439056, + "epoch": 5.399201596806387, + "grad_norm": 0.498046875, + "learning_rate": 2.3795059749802756e-06, + "loss": 0.0141, + "mean_token_accuracy": 0.9955107867717743, + "num_tokens": 282558817.0, + "step": 2370 + }, + { + "entropy": 0.6378804296255112, + "epoch": 5.401482748788138, + "grad_norm": 0.462890625, + "learning_rate": 2.377624749427012e-06, + "loss": 0.0142, + "mean_token_accuracy": 0.9956310838460922, + "num_tokens": 282678469.0, + "step": 2371 + }, + { + "entropy": 0.6413206085562706, + "epoch": 5.4037639007698886, + "grad_norm": 0.7265625, + "learning_rate": 2.3757435933315787e-06, + "loss": 0.021, + "mean_token_accuracy": 0.9937672540545464, + "num_tokens": 282797546.0, + "step": 2372 + }, + { + "entropy": 0.6437965333461761, + "epoch": 5.406045052751639, + "grad_norm": 0.5703125, + "learning_rate": 2.3738625077616837e-06, + "loss": 0.0146, + "mean_token_accuracy": 0.9941146224737167, + "num_tokens": 282917191.0, + "step": 2373 + }, + { + "entropy": 0.6312757357954979, + "epoch": 5.40832620473339, + "grad_norm": 0.66015625, + "learning_rate": 2.371981493784993e-06, + "loss": 0.0256, + "mean_token_accuracy": 0.9942869991064072, + "num_tokens": 283035663.0, + "step": 2374 + }, + { + "entropy": 0.6388252228498459, + "epoch": 5.410607356715142, + "grad_norm": 0.6171875, + "learning_rate": 2.370100552469135e-06, + "loss": 0.0205, + "mean_token_accuracy": 0.9937246143817902, + "num_tokens": 283154574.0, + "step": 2375 + }, + { + "entropy": 0.638635165989399, + "epoch": 5.412888508696892, + "grad_norm": 0.62109375, + "learning_rate": 2.3682196848816955e-06, + "loss": 0.0259, + "mean_token_accuracy": 0.9928053990006447, + "num_tokens": 283274126.0, + "step": 2376 + }, + { + "entropy": 0.6417307034134865, + "epoch": 5.415169660678643, + "grad_norm": 0.515625, + "learning_rate": 2.3663388920902198e-06, + "loss": 0.0117, + "mean_token_accuracy": 0.995877243578434, + "num_tokens": 283393421.0, + "step": 2377 + }, + { + "entropy": 0.6375080719590187, + "epoch": 5.417450812660394, + "grad_norm": 0.55859375, + "learning_rate": 2.3644581751622076e-06, + "loss": 0.0189, + "mean_token_accuracy": 0.9920376539230347, + "num_tokens": 283512766.0, + "step": 2378 + }, + { + "entropy": 0.6400329098105431, + "epoch": 5.4197319646421445, + "grad_norm": 0.55859375, + "learning_rate": 2.3625775351651193e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.9933201000094414, + "num_tokens": 283631982.0, + "step": 2379 + }, + { + "entropy": 0.6368478983640671, + "epoch": 5.422013116623895, + "grad_norm": 0.59375, + "learning_rate": 2.3606969731663683e-06, + "loss": 0.015, + "mean_token_accuracy": 0.9951108396053314, + "num_tokens": 283751408.0, + "step": 2380 + }, + { + "entropy": 0.6343287155032158, + "epoch": 5.424294268605646, + "grad_norm": 0.625, + "learning_rate": 2.358816490233326e-06, + "loss": 0.0207, + "mean_token_accuracy": 0.9938060939311981, + "num_tokens": 283870358.0, + "step": 2381 + }, + { + "entropy": 0.6400390639901161, + "epoch": 5.426575420587397, + "grad_norm": 0.45703125, + "learning_rate": 2.356936087433318e-06, + "loss": 0.0127, + "mean_token_accuracy": 0.996204748749733, + "num_tokens": 283989816.0, + "step": 2382 + }, + { + "entropy": 0.6383167430758476, + "epoch": 5.428856572569147, + "grad_norm": 0.453125, + "learning_rate": 2.3550557658336245e-06, + "loss": 0.0143, + "mean_token_accuracy": 0.9953126013278961, + "num_tokens": 284108774.0, + "step": 2383 + }, + { + "entropy": 0.6376960724592209, + "epoch": 5.431137724550898, + "grad_norm": 0.66796875, + "learning_rate": 2.3531755265014818e-06, + "loss": 0.0256, + "mean_token_accuracy": 0.992362730205059, + "num_tokens": 284229515.0, + "step": 2384 + }, + { + "entropy": 0.641070120036602, + "epoch": 5.433418876532649, + "grad_norm": 0.498046875, + "learning_rate": 2.3512953705040737e-06, + "loss": 0.0149, + "mean_token_accuracy": 0.995657168328762, + "num_tokens": 284349779.0, + "step": 2385 + }, + { + "entropy": 0.6381007805466652, + "epoch": 5.4357000285143995, + "grad_norm": 0.498046875, + "learning_rate": 2.3494152989085433e-06, + "loss": 0.0189, + "mean_token_accuracy": 0.9939686357975006, + "num_tokens": 284470530.0, + "step": 2386 + }, + { + "entropy": 0.6454793438315392, + "epoch": 5.43798118049615, + "grad_norm": 0.640625, + "learning_rate": 2.3475353127819827e-06, + "loss": 0.0181, + "mean_token_accuracy": 0.9929848909378052, + "num_tokens": 284590270.0, + "step": 2387 + }, + { + "entropy": 0.639455609023571, + "epoch": 5.440262332477902, + "grad_norm": 0.546875, + "learning_rate": 2.345655413191434e-06, + "loss": 0.0207, + "mean_token_accuracy": 0.9933989569544792, + "num_tokens": 284709896.0, + "step": 2388 + }, + { + "entropy": 0.6433944255113602, + "epoch": 5.4425434844596525, + "grad_norm": 0.53515625, + "learning_rate": 2.3437756012038933e-06, + "loss": 0.0168, + "mean_token_accuracy": 0.9948571622371674, + "num_tokens": 284830231.0, + "step": 2389 + }, + { + "entropy": 0.6362849324941635, + "epoch": 5.444824636441403, + "grad_norm": 0.41015625, + "learning_rate": 2.341895877886306e-06, + "loss": 0.0124, + "mean_token_accuracy": 0.995499923825264, + "num_tokens": 284949395.0, + "step": 2390 + }, + { + "entropy": 0.6401253640651703, + "epoch": 5.447105788423154, + "grad_norm": 0.53515625, + "learning_rate": 2.3400162443055655e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9952874556183815, + "num_tokens": 285068982.0, + "step": 2391 + }, + { + "entropy": 0.6382438540458679, + "epoch": 5.449386940404905, + "grad_norm": 0.4296875, + "learning_rate": 2.338136701528516e-06, + "loss": 0.0159, + "mean_token_accuracy": 0.9946202635765076, + "num_tokens": 285188865.0, + "step": 2392 + }, + { + "entropy": 0.6432719901204109, + "epoch": 5.451668092386655, + "grad_norm": 0.62890625, + "learning_rate": 2.33625725062195e-06, + "loss": 0.0193, + "mean_token_accuracy": 0.9945890977978706, + "num_tokens": 285307890.0, + "step": 2393 + }, + { + "entropy": 0.6384292393922806, + "epoch": 5.453949244368406, + "grad_norm": 0.486328125, + "learning_rate": 2.3343778926526074e-06, + "loss": 0.0153, + "mean_token_accuracy": 0.9954699724912643, + "num_tokens": 285426710.0, + "step": 2394 + }, + { + "entropy": 0.6396980732679367, + "epoch": 5.456230396350157, + "grad_norm": 0.51953125, + "learning_rate": 2.332498628687176e-06, + "loss": 0.0131, + "mean_token_accuracy": 0.9947387352585793, + "num_tokens": 285545731.0, + "step": 2395 + }, + { + "entropy": 0.6402654871344566, + "epoch": 5.4585115483319075, + "grad_norm": 0.58203125, + "learning_rate": 2.330619459792289e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.9944124594330788, + "num_tokens": 285664636.0, + "step": 2396 + }, + { + "entropy": 0.6365652531385422, + "epoch": 5.460792700313658, + "grad_norm": 0.40625, + "learning_rate": 2.328740387034526e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.996221736073494, + "num_tokens": 285784067.0, + "step": 2397 + }, + { + "entropy": 0.6342399641871452, + "epoch": 5.463073852295409, + "grad_norm": 0.703125, + "learning_rate": 2.326861411480414e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9946505799889565, + "num_tokens": 285903195.0, + "step": 2398 + }, + { + "entropy": 0.6399852931499481, + "epoch": 5.46535500427716, + "grad_norm": 0.50390625, + "learning_rate": 2.324982534196421e-06, + "loss": 0.0194, + "mean_token_accuracy": 0.9942696616053581, + "num_tokens": 286023065.0, + "step": 2399 + }, + { + "entropy": 0.6367531195282936, + "epoch": 5.46763615625891, + "grad_norm": 0.400390625, + "learning_rate": 2.3231037562489636e-06, + "loss": 0.0192, + "mean_token_accuracy": 0.995113879442215, + "num_tokens": 286142858.0, + "step": 2400 + }, + { + "entropy": 0.6348176002502441, + "epoch": 5.469917308240661, + "grad_norm": 0.7734375, + "learning_rate": 2.321225078704399e-06, + "loss": 0.0235, + "mean_token_accuracy": 0.9920751079916954, + "num_tokens": 286262048.0, + "step": 2401 + }, + { + "entropy": 0.6392532885074615, + "epoch": 5.472198460222412, + "grad_norm": 0.470703125, + "learning_rate": 2.319346502629028e-06, + "loss": 0.0162, + "mean_token_accuracy": 0.995530903339386, + "num_tokens": 286381332.0, + "step": 2402 + }, + { + "entropy": 0.6431594118475914, + "epoch": 5.4744796122041635, + "grad_norm": 0.451171875, + "learning_rate": 2.3174680290890945e-06, + "loss": 0.0097, + "mean_token_accuracy": 0.9961316287517548, + "num_tokens": 286500935.0, + "step": 2403 + }, + { + "entropy": 0.6378897503018379, + "epoch": 5.476760764185914, + "grad_norm": 0.66796875, + "learning_rate": 2.315589659150784e-06, + "loss": 0.0231, + "mean_token_accuracy": 0.9925869256258011, + "num_tokens": 286620234.0, + "step": 2404 + }, + { + "entropy": 0.633121520280838, + "epoch": 5.479041916167665, + "grad_norm": 0.55859375, + "learning_rate": 2.3137113938802224e-06, + "loss": 0.0118, + "mean_token_accuracy": 0.9960329532623291, + "num_tokens": 286739871.0, + "step": 2405 + }, + { + "entropy": 0.637439601123333, + "epoch": 5.481323068149416, + "grad_norm": 0.64453125, + "learning_rate": 2.311833234343478e-06, + "loss": 0.0266, + "mean_token_accuracy": 0.991218812763691, + "num_tokens": 286859080.0, + "step": 2406 + }, + { + "entropy": 0.639147512614727, + "epoch": 5.483604220131166, + "grad_norm": 0.65625, + "learning_rate": 2.3099551816065563e-06, + "loss": 0.0275, + "mean_token_accuracy": 0.992988221347332, + "num_tokens": 286977726.0, + "step": 2407 + }, + { + "entropy": 0.6390094384551048, + "epoch": 5.485885372112917, + "grad_norm": 0.3671875, + "learning_rate": 2.3080772367354046e-06, + "loss": 0.0126, + "mean_token_accuracy": 0.9961584359407425, + "num_tokens": 287096911.0, + "step": 2408 + }, + { + "entropy": 0.6413558647036552, + "epoch": 5.488166524094668, + "grad_norm": 0.388671875, + "learning_rate": 2.3061994007959086e-06, + "loss": 0.0141, + "mean_token_accuracy": 0.9969837293028831, + "num_tokens": 287216037.0, + "step": 2409 + }, + { + "entropy": 0.6392953842878342, + "epoch": 5.4904476760764185, + "grad_norm": 0.6171875, + "learning_rate": 2.304321674853891e-06, + "loss": 0.0129, + "mean_token_accuracy": 0.9968736842274666, + "num_tokens": 287335630.0, + "step": 2410 + }, + { + "entropy": 0.6407031044363976, + "epoch": 5.492728828058169, + "grad_norm": 0.44140625, + "learning_rate": 2.3024440599751132e-06, + "loss": 0.0177, + "mean_token_accuracy": 0.9960461035370827, + "num_tokens": 287455379.0, + "step": 2411 + }, + { + "entropy": 0.6423872783780098, + "epoch": 5.49500998003992, + "grad_norm": 0.6640625, + "learning_rate": 2.3005665572252732e-06, + "loss": 0.0294, + "mean_token_accuracy": 0.989506907761097, + "num_tokens": 287574222.0, + "step": 2412 + }, + { + "entropy": 0.637581117451191, + "epoch": 5.497291132021671, + "grad_norm": 0.443359375, + "learning_rate": 2.2986891676700042e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9926197156310081, + "num_tokens": 287693582.0, + "step": 2413 + }, + { + "entropy": 0.6374746114015579, + "epoch": 5.499572284003421, + "grad_norm": 0.46484375, + "learning_rate": 2.296811892374878e-06, + "loss": 0.0168, + "mean_token_accuracy": 0.9938049092888832, + "num_tokens": 287812907.0, + "step": 2414 + }, + { + "entropy": 0.640739195048809, + "epoch": 5.501853435985172, + "grad_norm": 0.5078125, + "learning_rate": 2.294934732405398e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9951888546347618, + "num_tokens": 287932669.0, + "step": 2415 + }, + { + "entropy": 0.6452915892004967, + "epoch": 5.504134587966924, + "grad_norm": 0.486328125, + "learning_rate": 2.293057688827007e-06, + "loss": 0.0125, + "mean_token_accuracy": 0.9969162568449974, + "num_tokens": 288052597.0, + "step": 2416 + }, + { + "entropy": 0.6436102464795113, + "epoch": 5.506415739948674, + "grad_norm": 0.59765625, + "learning_rate": 2.2911807627050745e-06, + "loss": 0.024, + "mean_token_accuracy": 0.9924965798854828, + "num_tokens": 288172461.0, + "step": 2417 + }, + { + "entropy": 0.6423786357045174, + "epoch": 5.508696891930425, + "grad_norm": 0.50390625, + "learning_rate": 2.2893039551049104e-06, + "loss": 0.0135, + "mean_token_accuracy": 0.9957814887166023, + "num_tokens": 288292026.0, + "step": 2418 + }, + { + "entropy": 0.6349745839834213, + "epoch": 5.510978043912176, + "grad_norm": 0.416015625, + "learning_rate": 2.2874272670917534e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9946180731058121, + "num_tokens": 288411712.0, + "step": 2419 + }, + { + "entropy": 0.6436842605471611, + "epoch": 5.5132591958939265, + "grad_norm": 0.546875, + "learning_rate": 2.2855506997307766e-06, + "loss": 0.0107, + "mean_token_accuracy": 0.997319869697094, + "num_tokens": 288531207.0, + "step": 2420 + }, + { + "epoch": 5.5132591958939265, + "eval_entropy": 0.641282047835593, + "eval_loss": 0.020601695403456688, + "eval_mean_token_accuracy": 0.9936199675494727, + "eval_num_tokens": 288531207.0, + "eval_runtime": 177.543, + "eval_samples_per_second": 47.228, + "eval_steps_per_second": 1.481, + "step": 2420 + }, + { + "entropy": 0.6386368870735168, + "epoch": 5.515540347875677, + "grad_norm": 0.52734375, + "learning_rate": 2.283674254087082e-06, + "loss": 0.0139, + "mean_token_accuracy": 0.995141975581646, + "num_tokens": 288650475.0, + "step": 2421 + }, + { + "entropy": 0.6427244171500206, + "epoch": 5.517821499857428, + "grad_norm": 0.55078125, + "learning_rate": 2.281797931225705e-06, + "loss": 0.0184, + "mean_token_accuracy": 0.9949744194746017, + "num_tokens": 288769180.0, + "step": 2422 + }, + { + "entropy": 0.6391404122114182, + "epoch": 5.520102651839179, + "grad_norm": 0.486328125, + "learning_rate": 2.279921732211609e-06, + "loss": 0.0205, + "mean_token_accuracy": 0.9939669445157051, + "num_tokens": 288887958.0, + "step": 2423 + }, + { + "entropy": 0.6401629075407982, + "epoch": 5.522383803820929, + "grad_norm": 0.5234375, + "learning_rate": 2.278045658109689e-06, + "loss": 0.0163, + "mean_token_accuracy": 0.9946850016713142, + "num_tokens": 289007112.0, + "step": 2424 + }, + { + "entropy": 0.634204126894474, + "epoch": 5.52466495580268, + "grad_norm": 0.462890625, + "learning_rate": 2.2761697099847686e-06, + "loss": 0.0166, + "mean_token_accuracy": 0.99394890666008, + "num_tokens": 289125847.0, + "step": 2425 + }, + { + "entropy": 0.6459424197673798, + "epoch": 5.526946107784431, + "grad_norm": 0.62109375, + "learning_rate": 2.274293888901599e-06, + "loss": 0.0236, + "mean_token_accuracy": 0.9937918931245804, + "num_tokens": 289245801.0, + "step": 2426 + }, + { + "entropy": 0.6451177448034286, + "epoch": 5.529227259766182, + "grad_norm": 0.50390625, + "learning_rate": 2.2724181959248627e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.9951755255460739, + "num_tokens": 289365255.0, + "step": 2427 + }, + { + "entropy": 0.6422101184725761, + "epoch": 5.531508411747932, + "grad_norm": 0.58203125, + "learning_rate": 2.270542632119163e-06, + "loss": 0.0202, + "mean_token_accuracy": 0.9922914206981659, + "num_tokens": 289484562.0, + "step": 2428 + }, + { + "entropy": 0.641721598803997, + "epoch": 5.533789563729684, + "grad_norm": 0.53515625, + "learning_rate": 2.2686671985490355e-06, + "loss": 0.0202, + "mean_token_accuracy": 0.9924471750855446, + "num_tokens": 289604227.0, + "step": 2429 + }, + { + "entropy": 0.636128731071949, + "epoch": 5.536070715711435, + "grad_norm": 0.47265625, + "learning_rate": 2.26679189627894e-06, + "loss": 0.0196, + "mean_token_accuracy": 0.9952670484781265, + "num_tokens": 289722955.0, + "step": 2430 + }, + { + "entropy": 0.64027089625597, + "epoch": 5.538351867693185, + "grad_norm": 0.515625, + "learning_rate": 2.264916726373263e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.9951866492629051, + "num_tokens": 289841339.0, + "step": 2431 + }, + { + "entropy": 0.6412405148148537, + "epoch": 5.540633019674936, + "grad_norm": 0.5625, + "learning_rate": 2.263041689896313e-06, + "loss": 0.0227, + "mean_token_accuracy": 0.9935343489050865, + "num_tokens": 289960428.0, + "step": 2432 + }, + { + "entropy": 0.6428171172738075, + "epoch": 5.542914171656687, + "grad_norm": 0.447265625, + "learning_rate": 2.261166787912325e-06, + "loss": 0.0144, + "mean_token_accuracy": 0.9947112500667572, + "num_tokens": 290079673.0, + "step": 2433 + }, + { + "entropy": 0.6439976170659065, + "epoch": 5.5451953236384375, + "grad_norm": 0.51953125, + "learning_rate": 2.2592920214854573e-06, + "loss": 0.0134, + "mean_token_accuracy": 0.9961233586072922, + "num_tokens": 290198693.0, + "step": 2434 + }, + { + "entropy": 0.6401423215866089, + "epoch": 5.547476475620188, + "grad_norm": 0.48828125, + "learning_rate": 2.2574173916797912e-06, + "loss": 0.0175, + "mean_token_accuracy": 0.9943058863282204, + "num_tokens": 290318136.0, + "step": 2435 + }, + { + "entropy": 0.6393100023269653, + "epoch": 5.549757627601939, + "grad_norm": 0.50390625, + "learning_rate": 2.2555428995593303e-06, + "loss": 0.0157, + "mean_token_accuracy": 0.9944053068757057, + "num_tokens": 290437269.0, + "step": 2436 + }, + { + "entropy": 0.643096923828125, + "epoch": 5.55203877958369, + "grad_norm": 0.451171875, + "learning_rate": 2.253668546188e-06, + "loss": 0.0147, + "mean_token_accuracy": 0.9958179667592049, + "num_tokens": 290556943.0, + "step": 2437 + }, + { + "entropy": 0.6379654929041862, + "epoch": 5.55431993156544, + "grad_norm": 0.63671875, + "learning_rate": 2.2517943326296487e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.9933631792664528, + "num_tokens": 290677207.0, + "step": 2438 + }, + { + "entropy": 0.6453045010566711, + "epoch": 5.556601083547191, + "grad_norm": 0.458984375, + "learning_rate": 2.249920259948041e-06, + "loss": 0.0146, + "mean_token_accuracy": 0.9956672489643097, + "num_tokens": 290797766.0, + "step": 2439 + }, + { + "entropy": 0.6346229538321495, + "epoch": 5.558882235528942, + "grad_norm": 0.447265625, + "learning_rate": 2.2480463292068655e-06, + "loss": 0.015, + "mean_token_accuracy": 0.9949743375182152, + "num_tokens": 290916784.0, + "step": 2440 + }, + { + "entropy": 0.6405005976557732, + "epoch": 5.5611633875106925, + "grad_norm": 0.62109375, + "learning_rate": 2.24617254146973e-06, + "loss": 0.0163, + "mean_token_accuracy": 0.9943385571241379, + "num_tokens": 291036076.0, + "step": 2441 + }, + { + "entropy": 0.6407162174582481, + "epoch": 5.563444539492444, + "grad_norm": 0.64453125, + "learning_rate": 2.2442988978001594e-06, + "loss": 0.0267, + "mean_token_accuracy": 0.9920257404446602, + "num_tokens": 291156394.0, + "step": 2442 + }, + { + "entropy": 0.6401777490973473, + "epoch": 5.565725691474195, + "grad_norm": 0.5703125, + "learning_rate": 2.2424253992615983e-06, + "loss": 0.0168, + "mean_token_accuracy": 0.9937121421098709, + "num_tokens": 291275397.0, + "step": 2443 + }, + { + "entropy": 0.6367546021938324, + "epoch": 5.5680068434559455, + "grad_norm": 0.66015625, + "learning_rate": 2.2405520469174084e-06, + "loss": 0.0256, + "mean_token_accuracy": 0.9921469762921333, + "num_tokens": 291394611.0, + "step": 2444 + }, + { + "entropy": 0.6408069655299187, + "epoch": 5.570287995437696, + "grad_norm": 0.51953125, + "learning_rate": 2.238678841830867e-06, + "loss": 0.0159, + "mean_token_accuracy": 0.9954084604978561, + "num_tokens": 291513658.0, + "step": 2445 + }, + { + "entropy": 0.6369916051626205, + "epoch": 5.572569147419447, + "grad_norm": 0.62109375, + "learning_rate": 2.23680578506517e-06, + "loss": 0.021, + "mean_token_accuracy": 0.9933514147996902, + "num_tokens": 291632986.0, + "step": 2446 + }, + { + "entropy": 0.6393511444330215, + "epoch": 5.574850299401198, + "grad_norm": 0.453125, + "learning_rate": 2.234932877683428e-06, + "loss": 0.0157, + "mean_token_accuracy": 0.9951742440462112, + "num_tokens": 291752240.0, + "step": 2447 + }, + { + "entropy": 0.6363268345594406, + "epoch": 5.577131451382948, + "grad_norm": 0.353515625, + "learning_rate": 2.233060120748667e-06, + "loss": 0.0078, + "mean_token_accuracy": 0.9969458132982254, + "num_tokens": 291871738.0, + "step": 2448 + }, + { + "entropy": 0.6342857927083969, + "epoch": 5.579412603364699, + "grad_norm": 0.4609375, + "learning_rate": 2.2311875153238296e-06, + "loss": 0.0115, + "mean_token_accuracy": 0.9966310113668442, + "num_tokens": 291990818.0, + "step": 2449 + }, + { + "entropy": 0.6389492973685265, + "epoch": 5.58169375534645, + "grad_norm": 0.462890625, + "learning_rate": 2.229315062471768e-06, + "loss": 0.0157, + "mean_token_accuracy": 0.9942750632762909, + "num_tokens": 292110135.0, + "step": 2450 + }, + { + "entropy": 0.6418540552258492, + "epoch": 5.583974907328201, + "grad_norm": 0.57421875, + "learning_rate": 2.2274427632552507e-06, + "loss": 0.0142, + "mean_token_accuracy": 0.9949524104595184, + "num_tokens": 292229023.0, + "step": 2451 + }, + { + "entropy": 0.6427427977323532, + "epoch": 5.586256059309951, + "grad_norm": 0.61328125, + "learning_rate": 2.2255706187369596e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.9932012110948563, + "num_tokens": 292348600.0, + "step": 2452 + }, + { + "entropy": 0.6361582279205322, + "epoch": 5.588537211291702, + "grad_norm": 0.6328125, + "learning_rate": 2.223698629979487e-06, + "loss": 0.0136, + "mean_token_accuracy": 0.9940447583794594, + "num_tokens": 292468222.0, + "step": 2453 + }, + { + "entropy": 0.6393432840704918, + "epoch": 5.590818363273453, + "grad_norm": 0.494140625, + "learning_rate": 2.221826798045338e-06, + "loss": 0.0161, + "mean_token_accuracy": 0.9944486692547798, + "num_tokens": 292587086.0, + "step": 2454 + }, + { + "entropy": 0.6376495659351349, + "epoch": 5.593099515255204, + "grad_norm": 0.357421875, + "learning_rate": 2.2199551239969284e-06, + "loss": 0.0119, + "mean_token_accuracy": 0.9964526072144508, + "num_tokens": 292706434.0, + "step": 2455 + }, + { + "entropy": 0.640723466873169, + "epoch": 5.595380667236955, + "grad_norm": 0.58984375, + "learning_rate": 2.2180836088965833e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.9937949478626251, + "num_tokens": 292825864.0, + "step": 2456 + }, + { + "entropy": 0.6292269751429558, + "epoch": 5.597661819218706, + "grad_norm": 0.61328125, + "learning_rate": 2.216212253806539e-06, + "loss": 0.0198, + "mean_token_accuracy": 0.9935330674052238, + "num_tokens": 292945348.0, + "step": 2457 + }, + { + "entropy": 0.6343361139297485, + "epoch": 5.5999429712004565, + "grad_norm": 0.44140625, + "learning_rate": 2.214341059788941e-06, + "loss": 0.0166, + "mean_token_accuracy": 0.9946956187486649, + "num_tokens": 293064772.0, + "step": 2458 + }, + { + "entropy": 0.637665718793869, + "epoch": 5.602224123182207, + "grad_norm": 0.4453125, + "learning_rate": 2.2124700279058435e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.9937221556901932, + "num_tokens": 293184138.0, + "step": 2459 + }, + { + "entropy": 0.6428703814744949, + "epoch": 5.604505275163958, + "grad_norm": 0.62890625, + "learning_rate": 2.2105991592192063e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.994095504283905, + "num_tokens": 293303118.0, + "step": 2460 + }, + { + "entropy": 0.6357977092266083, + "epoch": 5.606786427145709, + "grad_norm": 0.578125, + "learning_rate": 2.208728454790899e-06, + "loss": 0.015, + "mean_token_accuracy": 0.9951033666729927, + "num_tokens": 293422765.0, + "step": 2461 + }, + { + "entropy": 0.640275664627552, + "epoch": 5.609067579127459, + "grad_norm": 0.48828125, + "learning_rate": 2.2068579156826974e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9944201409816742, + "num_tokens": 293543020.0, + "step": 2462 + }, + { + "entropy": 0.6381045579910278, + "epoch": 5.61134873110921, + "grad_norm": 0.458984375, + "learning_rate": 2.2049875429562845e-06, + "loss": 0.0159, + "mean_token_accuracy": 0.9942235946655273, + "num_tokens": 293662299.0, + "step": 2463 + }, + { + "entropy": 0.6355725526809692, + "epoch": 5.613629883090961, + "grad_norm": 0.3984375, + "learning_rate": 2.203117337673246e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.9957984462380409, + "num_tokens": 293781349.0, + "step": 2464 + }, + { + "entropy": 0.6379541754722595, + "epoch": 5.6159110350727115, + "grad_norm": 0.5078125, + "learning_rate": 2.2012473008950756e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.994997650384903, + "num_tokens": 293901205.0, + "step": 2465 + }, + { + "entropy": 0.6396207734942436, + "epoch": 5.618192187054462, + "grad_norm": 0.52734375, + "learning_rate": 2.1993774336831696e-06, + "loss": 0.0123, + "mean_token_accuracy": 0.9951431304216385, + "num_tokens": 294021203.0, + "step": 2466 + }, + { + "entropy": 0.6425206288695335, + "epoch": 5.620473339036213, + "grad_norm": 0.41015625, + "learning_rate": 2.197507737098828e-06, + "loss": 0.0139, + "mean_token_accuracy": 0.9952560588717461, + "num_tokens": 294140833.0, + "step": 2467 + }, + { + "entropy": 0.6317030936479568, + "epoch": 5.6227544910179645, + "grad_norm": 0.412109375, + "learning_rate": 2.195638212203255e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.994780108332634, + "num_tokens": 294259786.0, + "step": 2468 + }, + { + "entropy": 0.6393278241157532, + "epoch": 5.625035642999714, + "grad_norm": 0.53125, + "learning_rate": 2.193768860057557e-06, + "loss": 0.0178, + "mean_token_accuracy": 0.9937828183174133, + "num_tokens": 294379438.0, + "step": 2469 + }, + { + "entropy": 0.6413309797644615, + "epoch": 5.627316794981466, + "grad_norm": 0.490234375, + "learning_rate": 2.191899681722743e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.9939108639955521, + "num_tokens": 294498332.0, + "step": 2470 + }, + { + "entropy": 0.638202577829361, + "epoch": 5.629597946963217, + "grad_norm": 0.478515625, + "learning_rate": 2.19003067825972e-06, + "loss": 0.0188, + "mean_token_accuracy": 0.9941800683736801, + "num_tokens": 294617651.0, + "step": 2471 + }, + { + "entropy": 0.6375350430607796, + "epoch": 5.631879098944967, + "grad_norm": 0.61328125, + "learning_rate": 2.1881618507293004e-06, + "loss": 0.0162, + "mean_token_accuracy": 0.9930551499128342, + "num_tokens": 294737387.0, + "step": 2472 + }, + { + "entropy": 0.6359066963195801, + "epoch": 5.634160250926718, + "grad_norm": 0.5078125, + "learning_rate": 2.186293200192194e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.9939957112073898, + "num_tokens": 294856641.0, + "step": 2473 + }, + { + "entropy": 0.6366061270236969, + "epoch": 5.636441402908469, + "grad_norm": 0.6953125, + "learning_rate": 2.1844247277090113e-06, + "loss": 0.0218, + "mean_token_accuracy": 0.9927259087562561, + "num_tokens": 294976296.0, + "step": 2474 + }, + { + "entropy": 0.635845497250557, + "epoch": 5.63872255489022, + "grad_norm": 0.609375, + "learning_rate": 2.1825564343402606e-06, + "loss": 0.0188, + "mean_token_accuracy": 0.9946633651852608, + "num_tokens": 295095154.0, + "step": 2475 + }, + { + "entropy": 0.6316873356699944, + "epoch": 5.64100370687197, + "grad_norm": 0.52734375, + "learning_rate": 2.180688321146349e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9948359653353691, + "num_tokens": 295213523.0, + "step": 2476 + }, + { + "entropy": 0.6398913636803627, + "epoch": 5.643284858853721, + "grad_norm": 0.419921875, + "learning_rate": 2.1788203891875818e-06, + "loss": 0.0121, + "mean_token_accuracy": 0.9975197985768318, + "num_tokens": 295332776.0, + "step": 2477 + }, + { + "entropy": 0.6396508812904358, + "epoch": 5.645566010835472, + "grad_norm": 0.5078125, + "learning_rate": 2.176952639524161e-06, + "loss": 0.0211, + "mean_token_accuracy": 0.9937849789857864, + "num_tokens": 295452409.0, + "step": 2478 + }, + { + "entropy": 0.6384711489081383, + "epoch": 5.647847162817222, + "grad_norm": 0.470703125, + "learning_rate": 2.175085073216185e-06, + "loss": 0.0161, + "mean_token_accuracy": 0.9949768111109734, + "num_tokens": 295571738.0, + "step": 2479 + }, + { + "entropy": 0.6385917067527771, + "epoch": 5.650128314798973, + "grad_norm": 0.5, + "learning_rate": 2.173217691323649e-06, + "loss": 0.0144, + "mean_token_accuracy": 0.9943718537688255, + "num_tokens": 295691039.0, + "step": 2480 + }, + { + "entropy": 0.6382876187562943, + "epoch": 5.652409466780725, + "grad_norm": 0.48046875, + "learning_rate": 2.1713504949064433e-06, + "loss": 0.0121, + "mean_token_accuracy": 0.9961007907986641, + "num_tokens": 295811058.0, + "step": 2481 + }, + { + "entropy": 0.6393980532884598, + "epoch": 5.654690618762475, + "grad_norm": 0.578125, + "learning_rate": 2.169483485024351e-06, + "loss": 0.0192, + "mean_token_accuracy": 0.9945502281188965, + "num_tokens": 295930145.0, + "step": 2482 + }, + { + "entropy": 0.6410712972283363, + "epoch": 5.656971770744226, + "grad_norm": 0.51171875, + "learning_rate": 2.167616662737052e-06, + "loss": 0.0125, + "mean_token_accuracy": 0.9958928897976875, + "num_tokens": 296050091.0, + "step": 2483 + }, + { + "entropy": 0.6401623785495758, + "epoch": 5.659252922725977, + "grad_norm": 0.515625, + "learning_rate": 2.1657500291041185e-06, + "loss": 0.0137, + "mean_token_accuracy": 0.9955332204699516, + "num_tokens": 296170072.0, + "step": 2484 + }, + { + "entropy": 0.6426578536629677, + "epoch": 5.661534074707728, + "grad_norm": 0.48046875, + "learning_rate": 2.1638835851850155e-06, + "loss": 0.0144, + "mean_token_accuracy": 0.9953238517045975, + "num_tokens": 296289107.0, + "step": 2485 + }, + { + "entropy": 0.6360530480742455, + "epoch": 5.663815226689478, + "grad_norm": 0.466796875, + "learning_rate": 2.1620173320391007e-06, + "loss": 0.0148, + "mean_token_accuracy": 0.9953430742025375, + "num_tokens": 296408679.0, + "step": 2486 + }, + { + "entropy": 0.6343429312109947, + "epoch": 5.666096378671229, + "grad_norm": 0.58203125, + "learning_rate": 2.160151270725623e-06, + "loss": 0.0199, + "mean_token_accuracy": 0.9929355159401894, + "num_tokens": 296528165.0, + "step": 2487 + }, + { + "entropy": 0.6424784287810326, + "epoch": 5.66837753065298, + "grad_norm": 0.6796875, + "learning_rate": 2.158285402303723e-06, + "loss": 0.0252, + "mean_token_accuracy": 0.9922554641962051, + "num_tokens": 296647830.0, + "step": 2488 + }, + { + "entropy": 0.6363007351756096, + "epoch": 5.6706586826347305, + "grad_norm": 0.478515625, + "learning_rate": 2.1564197278324317e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.9964245185256004, + "num_tokens": 296767413.0, + "step": 2489 + }, + { + "entropy": 0.6396607980132103, + "epoch": 5.672939834616481, + "grad_norm": 0.6015625, + "learning_rate": 2.1545542483706694e-06, + "loss": 0.0178, + "mean_token_accuracy": 0.9946216717362404, + "num_tokens": 296886882.0, + "step": 2490 + }, + { + "entropy": 0.633281484246254, + "epoch": 5.675220986598232, + "grad_norm": 0.447265625, + "learning_rate": 2.1526889649772477e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.9953809082508087, + "num_tokens": 297006236.0, + "step": 2491 + }, + { + "entropy": 0.6356832608580589, + "epoch": 5.677502138579983, + "grad_norm": 0.439453125, + "learning_rate": 2.1508238787108633e-06, + "loss": 0.0118, + "mean_token_accuracy": 0.9964328184723854, + "num_tokens": 297125795.0, + "step": 2492 + }, + { + "entropy": 0.6347229108214378, + "epoch": 5.679783290561733, + "grad_norm": 0.4453125, + "learning_rate": 2.1489589906301046e-06, + "loss": 0.0136, + "mean_token_accuracy": 0.9956938475370407, + "num_tokens": 297244628.0, + "step": 2493 + }, + { + "entropy": 0.6386193931102753, + "epoch": 5.682064442543484, + "grad_norm": 0.875, + "learning_rate": 2.1470943017934455e-06, + "loss": 0.0259, + "mean_token_accuracy": 0.9917585030198097, + "num_tokens": 297364366.0, + "step": 2494 + }, + { + "entropy": 0.6307841092348099, + "epoch": 5.684345594525235, + "grad_norm": 0.443359375, + "learning_rate": 2.145229813259248e-06, + "loss": 0.0114, + "mean_token_accuracy": 0.9962062686681747, + "num_tokens": 297484089.0, + "step": 2495 + }, + { + "entropy": 0.6386852562427521, + "epoch": 5.686626746506986, + "grad_norm": 0.470703125, + "learning_rate": 2.143365526085759e-06, + "loss": 0.013, + "mean_token_accuracy": 0.9960926026105881, + "num_tokens": 297603062.0, + "step": 2496 + }, + { + "entropy": 0.6372415125370026, + "epoch": 5.688907898488737, + "grad_norm": 0.6796875, + "learning_rate": 2.1415014413311126e-06, + "loss": 0.0267, + "mean_token_accuracy": 0.9930501505732536, + "num_tokens": 297722298.0, + "step": 2497 + }, + { + "entropy": 0.635584332048893, + "epoch": 5.691189050470488, + "grad_norm": 0.4921875, + "learning_rate": 2.139637560053327e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9945462048053741, + "num_tokens": 297842202.0, + "step": 2498 + }, + { + "entropy": 0.6398444101214409, + "epoch": 5.693470202452239, + "grad_norm": 0.53125, + "learning_rate": 2.137773883310305e-06, + "loss": 0.0155, + "mean_token_accuracy": 0.994646854698658, + "num_tokens": 297961714.0, + "step": 2499 + }, + { + "entropy": 0.6351535618305206, + "epoch": 5.695751354433989, + "grad_norm": 0.59765625, + "learning_rate": 2.1359104121598337e-06, + "loss": 0.0148, + "mean_token_accuracy": 0.9951756671071053, + "num_tokens": 298080680.0, + "step": 2500 + }, + { + "entropy": 0.6387563198804855, + "epoch": 5.69803250641574, + "grad_norm": 0.482421875, + "learning_rate": 2.1340471476595836e-06, + "loss": 0.016, + "mean_token_accuracy": 0.9938080981373787, + "num_tokens": 298200656.0, + "step": 2501 + }, + { + "entropy": 0.6370149701833725, + "epoch": 5.700313658397491, + "grad_norm": 0.75, + "learning_rate": 2.1321840908671082e-06, + "loss": 0.0225, + "mean_token_accuracy": 0.9915368854999542, + "num_tokens": 298320068.0, + "step": 2502 + }, + { + "entropy": 0.6314645260572433, + "epoch": 5.702594810379241, + "grad_norm": 0.4921875, + "learning_rate": 2.1303212428398407e-06, + "loss": 0.0149, + "mean_token_accuracy": 0.9944505617022514, + "num_tokens": 298439054.0, + "step": 2503 + }, + { + "entropy": 0.6423662304878235, + "epoch": 5.704875962360992, + "grad_norm": 0.41796875, + "learning_rate": 2.1284586046350996e-06, + "loss": 0.012, + "mean_token_accuracy": 0.9958660006523132, + "num_tokens": 298558668.0, + "step": 2504 + }, + { + "entropy": 0.6427352279424667, + "epoch": 5.707157114342743, + "grad_norm": 0.443359375, + "learning_rate": 2.126596177310081e-06, + "loss": 0.0153, + "mean_token_accuracy": 0.9954738616943359, + "num_tokens": 298678339.0, + "step": 2505 + }, + { + "entropy": 0.6399745941162109, + "epoch": 5.709438266324494, + "grad_norm": 0.494140625, + "learning_rate": 2.124733961921864e-06, + "loss": 0.0172, + "mean_token_accuracy": 0.9946881756186485, + "num_tokens": 298797935.0, + "step": 2506 + }, + { + "entropy": 0.636977881193161, + "epoch": 5.711719418306244, + "grad_norm": 0.6171875, + "learning_rate": 2.1228719595274056e-06, + "loss": 0.0238, + "mean_token_accuracy": 0.9928865805268288, + "num_tokens": 298917045.0, + "step": 2507 + }, + { + "entropy": 0.6365994364023209, + "epoch": 5.714000570287995, + "grad_norm": 0.59375, + "learning_rate": 2.1210101711835413e-06, + "loss": 0.0217, + "mean_token_accuracy": 0.9926122352480888, + "num_tokens": 299036366.0, + "step": 2508 + }, + { + "entropy": 0.6345532909035683, + "epoch": 5.716281722269747, + "grad_norm": 0.462890625, + "learning_rate": 2.1191485979469877e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9944212511181831, + "num_tokens": 299155406.0, + "step": 2509 + }, + { + "entropy": 0.6370548158884048, + "epoch": 5.718562874251497, + "grad_norm": 0.392578125, + "learning_rate": 2.1172872408743374e-06, + "loss": 0.0131, + "mean_token_accuracy": 0.997104749083519, + "num_tokens": 299274674.0, + "step": 2510 + }, + { + "entropy": 0.6399078369140625, + "epoch": 5.720844026233248, + "grad_norm": 0.6875, + "learning_rate": 2.11542610102206e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.9934512972831726, + "num_tokens": 299393770.0, + "step": 2511 + }, + { + "entropy": 0.6408544182777405, + "epoch": 5.723125178214999, + "grad_norm": 0.53125, + "learning_rate": 2.1135651794465032e-06, + "loss": 0.0169, + "mean_token_accuracy": 0.9952864646911621, + "num_tokens": 299513462.0, + "step": 2512 + }, + { + "entropy": 0.6324893832206726, + "epoch": 5.7254063301967495, + "grad_norm": 0.53125, + "learning_rate": 2.1117044772038915e-06, + "loss": 0.0144, + "mean_token_accuracy": 0.9946317225694656, + "num_tokens": 299632501.0, + "step": 2513 + }, + { + "entropy": 0.6351687833666801, + "epoch": 5.7276874821785, + "grad_norm": 0.5078125, + "learning_rate": 2.1098439953503207e-06, + "loss": 0.019, + "mean_token_accuracy": 0.9927003532648087, + "num_tokens": 299751792.0, + "step": 2514 + }, + { + "entropy": 0.6356898546218872, + "epoch": 5.729968634160251, + "grad_norm": 0.44921875, + "learning_rate": 2.1079837349417664e-06, + "loss": 0.011, + "mean_token_accuracy": 0.9969367757439613, + "num_tokens": 299870666.0, + "step": 2515 + }, + { + "entropy": 0.6351112127304077, + "epoch": 5.732249786142002, + "grad_norm": 0.48828125, + "learning_rate": 2.1061236970340756e-06, + "loss": 0.0142, + "mean_token_accuracy": 0.9946328178048134, + "num_tokens": 299989988.0, + "step": 2516 + }, + { + "entropy": 0.6382355466485023, + "epoch": 5.734530938123752, + "grad_norm": 0.64453125, + "learning_rate": 2.104263882682971e-06, + "loss": 0.0199, + "mean_token_accuracy": 0.992844820022583, + "num_tokens": 300109233.0, + "step": 2517 + }, + { + "entropy": 0.6374291777610779, + "epoch": 5.736812090105503, + "grad_norm": 0.43359375, + "learning_rate": 2.1024042929440465e-06, + "loss": 0.0114, + "mean_token_accuracy": 0.9959100633859634, + "num_tokens": 300228424.0, + "step": 2518 + }, + { + "entropy": 0.6376974061131477, + "epoch": 5.739093242087254, + "grad_norm": 0.5078125, + "learning_rate": 2.1005449288727696e-06, + "loss": 0.0132, + "mean_token_accuracy": 0.9965368285775185, + "num_tokens": 300347962.0, + "step": 2519 + }, + { + "entropy": 0.6405285596847534, + "epoch": 5.7413743940690045, + "grad_norm": 0.51171875, + "learning_rate": 2.0986857915244787e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.9939272776246071, + "num_tokens": 300467738.0, + "step": 2520 + }, + { + "entropy": 0.6408319473266602, + "epoch": 5.743655546050755, + "grad_norm": 0.515625, + "learning_rate": 2.096826881954385e-06, + "loss": 0.0221, + "mean_token_accuracy": 0.9942061007022858, + "num_tokens": 300587336.0, + "step": 2521 + }, + { + "entropy": 0.638691283762455, + "epoch": 5.745936698032507, + "grad_norm": 0.412109375, + "learning_rate": 2.0949682012175693e-06, + "loss": 0.0121, + "mean_token_accuracy": 0.9964900985360146, + "num_tokens": 300707234.0, + "step": 2522 + }, + { + "entropy": 0.6390473991632462, + "epoch": 5.748217850014258, + "grad_norm": 0.53515625, + "learning_rate": 2.093109750368983e-06, + "loss": 0.0201, + "mean_token_accuracy": 0.9951080307364464, + "num_tokens": 300827070.0, + "step": 2523 + }, + { + "entropy": 0.6375282034277916, + "epoch": 5.750499001996008, + "grad_norm": 0.50390625, + "learning_rate": 2.0912515304634485e-06, + "loss": 0.0123, + "mean_token_accuracy": 0.9968195781111717, + "num_tokens": 300946491.0, + "step": 2524 + }, + { + "entropy": 0.63719292730093, + "epoch": 5.752780153977759, + "grad_norm": 0.5390625, + "learning_rate": 2.089393542555653e-06, + "loss": 0.0197, + "mean_token_accuracy": 0.9939640611410141, + "num_tokens": 301065549.0, + "step": 2525 + }, + { + "entropy": 0.639599397778511, + "epoch": 5.75506130595951, + "grad_norm": 0.470703125, + "learning_rate": 2.0875357877001556e-06, + "loss": 0.0162, + "mean_token_accuracy": 0.9935353249311447, + "num_tokens": 301184439.0, + "step": 2526 + }, + { + "entropy": 0.6349383369088173, + "epoch": 5.75734245794126, + "grad_norm": 0.443359375, + "learning_rate": 2.085678266951382e-06, + "loss": 0.0105, + "mean_token_accuracy": 0.996767558157444, + "num_tokens": 301303435.0, + "step": 2527 + }, + { + "entropy": 0.6369887515902519, + "epoch": 5.759623609923011, + "grad_norm": 0.609375, + "learning_rate": 2.083820981363626e-06, + "loss": 0.0173, + "mean_token_accuracy": 0.9945555552840233, + "num_tokens": 301422954.0, + "step": 2528 + }, + { + "entropy": 0.641552098095417, + "epoch": 5.761904761904762, + "grad_norm": 0.55859375, + "learning_rate": 2.0819639319910466e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.9946057051420212, + "num_tokens": 301542210.0, + "step": 2529 + }, + { + "entropy": 0.6348596289753914, + "epoch": 5.764185913886513, + "grad_norm": 0.431640625, + "learning_rate": 2.0801071198876684e-06, + "loss": 0.013, + "mean_token_accuracy": 0.9953520148992538, + "num_tokens": 301661012.0, + "step": 2530 + }, + { + "entropy": 0.6361228302121162, + "epoch": 5.766467065868263, + "grad_norm": 0.69140625, + "learning_rate": 2.0782505461073822e-06, + "loss": 0.0205, + "mean_token_accuracy": 0.99388238042593, + "num_tokens": 301779736.0, + "step": 2531 + }, + { + "entropy": 0.6334600895643234, + "epoch": 5.768748217850014, + "grad_norm": 0.37109375, + "learning_rate": 2.076394211703944e-06, + "loss": 0.0127, + "mean_token_accuracy": 0.9960079416632652, + "num_tokens": 301898982.0, + "step": 2532 + }, + { + "entropy": 0.635722927749157, + "epoch": 5.771029369831765, + "grad_norm": 0.462890625, + "learning_rate": 2.0745381177309732e-06, + "loss": 0.0162, + "mean_token_accuracy": 0.9952659830451012, + "num_tokens": 302018558.0, + "step": 2533 + }, + { + "entropy": 0.6373133659362793, + "epoch": 5.7733105218135155, + "grad_norm": 0.42578125, + "learning_rate": 2.072682265241954e-06, + "loss": 0.0141, + "mean_token_accuracy": 0.995358794927597, + "num_tokens": 302137932.0, + "step": 2534 + }, + { + "entropy": 0.6389187648892403, + "epoch": 5.775591673795267, + "grad_norm": 0.49609375, + "learning_rate": 2.0708266552902303e-06, + "loss": 0.0209, + "mean_token_accuracy": 0.992392010986805, + "num_tokens": 302257123.0, + "step": 2535 + }, + { + "entropy": 0.6373532861471176, + "epoch": 5.777872825777018, + "grad_norm": 0.396484375, + "learning_rate": 2.0689712889290114e-06, + "loss": 0.0149, + "mean_token_accuracy": 0.995698906481266, + "num_tokens": 302376665.0, + "step": 2536 + }, + { + "entropy": 0.639203280210495, + "epoch": 5.7801539777587685, + "grad_norm": 0.52734375, + "learning_rate": 2.0671161672113677e-06, + "loss": 0.0211, + "mean_token_accuracy": 0.9939268156886101, + "num_tokens": 302496249.0, + "step": 2537 + }, + { + "entropy": 0.6395558416843414, + "epoch": 5.782435129740519, + "grad_norm": 0.5546875, + "learning_rate": 2.06526129119023e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.9940950945019722, + "num_tokens": 302615556.0, + "step": 2538 + }, + { + "entropy": 0.6392509415745735, + "epoch": 5.78471628172227, + "grad_norm": 0.53515625, + "learning_rate": 2.063406661918391e-06, + "loss": 0.0173, + "mean_token_accuracy": 0.9931370988488197, + "num_tokens": 302735082.0, + "step": 2539 + }, + { + "entropy": 0.6354018226265907, + "epoch": 5.786997433704021, + "grad_norm": 0.451171875, + "learning_rate": 2.0615522804485027e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.993894137442112, + "num_tokens": 302854179.0, + "step": 2540 + }, + { + "entropy": 0.6366094499826431, + "epoch": 5.789278585685771, + "grad_norm": 0.53515625, + "learning_rate": 2.059698147833075e-06, + "loss": 0.0253, + "mean_token_accuracy": 0.9937736392021179, + "num_tokens": 302973905.0, + "step": 2541 + }, + { + "entropy": 0.6387930735945702, + "epoch": 5.791559737667522, + "grad_norm": 0.443359375, + "learning_rate": 2.0578442651244774e-06, + "loss": 0.0203, + "mean_token_accuracy": 0.9936597719788551, + "num_tokens": 303093283.0, + "step": 2542 + }, + { + "entropy": 0.6428203955292702, + "epoch": 5.793840889649273, + "grad_norm": 0.490234375, + "learning_rate": 2.0559906333749392e-06, + "loss": 0.0132, + "mean_token_accuracy": 0.9954173117876053, + "num_tokens": 303213028.0, + "step": 2543 + }, + { + "entropy": 0.6426948308944702, + "epoch": 5.7961220416310235, + "grad_norm": 0.462890625, + "learning_rate": 2.054137253636545e-06, + "loss": 0.0129, + "mean_token_accuracy": 0.9961816966533661, + "num_tokens": 303333519.0, + "step": 2544 + }, + { + "entropy": 0.6418589726090431, + "epoch": 5.798403193612774, + "grad_norm": 0.50390625, + "learning_rate": 2.0522841269612397e-06, + "loss": 0.0196, + "mean_token_accuracy": 0.9932484105229378, + "num_tokens": 303453529.0, + "step": 2545 + }, + { + "entropy": 0.6362891495227814, + "epoch": 5.800684345594525, + "grad_norm": 0.466796875, + "learning_rate": 2.0504312544008193e-06, + "loss": 0.0099, + "mean_token_accuracy": 0.9967095479369164, + "num_tokens": 303572321.0, + "step": 2546 + }, + { + "entropy": 0.6364968344569206, + "epoch": 5.802965497576276, + "grad_norm": 0.70703125, + "learning_rate": 2.048578637006939e-06, + "loss": 0.0173, + "mean_token_accuracy": 0.9934842213988304, + "num_tokens": 303691971.0, + "step": 2547 + }, + { + "entropy": 0.6381266936659813, + "epoch": 5.805246649558027, + "grad_norm": 0.58203125, + "learning_rate": 2.04672627583111e-06, + "loss": 0.0188, + "mean_token_accuracy": 0.9942469075322151, + "num_tokens": 303811246.0, + "step": 2548 + }, + { + "entropy": 0.635671429336071, + "epoch": 5.807527801539777, + "grad_norm": 0.5546875, + "learning_rate": 2.0448741719246962e-06, + "loss": 0.0184, + "mean_token_accuracy": 0.9951353594660759, + "num_tokens": 303930337.0, + "step": 2549 + }, + { + "entropy": 0.6406738758087158, + "epoch": 5.809808953521529, + "grad_norm": 0.64453125, + "learning_rate": 2.043022326338916e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9934975206851959, + "num_tokens": 304049750.0, + "step": 2550 + }, + { + "entropy": 0.6366659998893738, + "epoch": 5.812090105503279, + "grad_norm": 0.5, + "learning_rate": 2.0411707401248406e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9948532581329346, + "num_tokens": 304169661.0, + "step": 2551 + }, + { + "entropy": 0.6420188024640083, + "epoch": 5.81437125748503, + "grad_norm": 0.68359375, + "learning_rate": 2.0393194143333956e-06, + "loss": 0.0209, + "mean_token_accuracy": 0.9929956197738647, + "num_tokens": 304289185.0, + "step": 2552 + }, + { + "entropy": 0.640873946249485, + "epoch": 5.816652409466781, + "grad_norm": 0.55078125, + "learning_rate": 2.0374683500153564e-06, + "loss": 0.019, + "mean_token_accuracy": 0.9924633800983429, + "num_tokens": 304408852.0, + "step": 2553 + }, + { + "entropy": 0.6369428262114525, + "epoch": 5.818933561448532, + "grad_norm": 0.4453125, + "learning_rate": 2.0356175482213523e-06, + "loss": 0.0142, + "mean_token_accuracy": 0.9951367601752281, + "num_tokens": 304528177.0, + "step": 2554 + }, + { + "entropy": 0.6324970349669456, + "epoch": 5.821214713430282, + "grad_norm": 0.458984375, + "learning_rate": 2.033767010001863e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9945944026112556, + "num_tokens": 304646491.0, + "step": 2555 + }, + { + "entropy": 0.6394414082169533, + "epoch": 5.823495865412033, + "grad_norm": 0.54296875, + "learning_rate": 2.0319167364072184e-06, + "loss": 0.0136, + "mean_token_accuracy": 0.995421901345253, + "num_tokens": 304766114.0, + "step": 2556 + }, + { + "entropy": 0.6373368427157402, + "epoch": 5.825777017393784, + "grad_norm": 0.466796875, + "learning_rate": 2.0300667284875965e-06, + "loss": 0.0131, + "mean_token_accuracy": 0.9960253685712814, + "num_tokens": 304886180.0, + "step": 2557 + }, + { + "entropy": 0.6401776447892189, + "epoch": 5.8280581693755344, + "grad_norm": 0.55078125, + "learning_rate": 2.0282169872930275e-06, + "loss": 0.0171, + "mean_token_accuracy": 0.9933372437953949, + "num_tokens": 305006441.0, + "step": 2558 + }, + { + "entropy": 0.6374201700091362, + "epoch": 5.830339321357285, + "grad_norm": 0.55859375, + "learning_rate": 2.026367513873388e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.9946922212839127, + "num_tokens": 305126254.0, + "step": 2559 + }, + { + "entropy": 0.6371617615222931, + "epoch": 5.832620473339036, + "grad_norm": 0.5078125, + "learning_rate": 2.0245183092784046e-06, + "loss": 0.0113, + "mean_token_accuracy": 0.9953070133924484, + "num_tokens": 305245039.0, + "step": 2560 + }, + { + "entropy": 0.6292161643505096, + "epoch": 5.8349016253207875, + "grad_norm": 0.72265625, + "learning_rate": 2.0226693745576494e-06, + "loss": 0.0193, + "mean_token_accuracy": 0.9926886931061745, + "num_tokens": 305365078.0, + "step": 2561 + }, + { + "entropy": 0.6367559731006622, + "epoch": 5.837182777302537, + "grad_norm": 0.423828125, + "learning_rate": 2.020820710760541e-06, + "loss": 0.0146, + "mean_token_accuracy": 0.9958921149373055, + "num_tokens": 305485341.0, + "step": 2562 + }, + { + "entropy": 0.6385046094655991, + "epoch": 5.839463929284289, + "grad_norm": 0.53125, + "learning_rate": 2.018972318936347e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9937961623072624, + "num_tokens": 305604354.0, + "step": 2563 + }, + { + "entropy": 0.6347955390810966, + "epoch": 5.84174508126604, + "grad_norm": 0.53515625, + "learning_rate": 2.017124200134178e-06, + "loss": 0.0179, + "mean_token_accuracy": 0.9934083893895149, + "num_tokens": 305723844.0, + "step": 2564 + }, + { + "entropy": 0.6340402811765671, + "epoch": 5.84402623324779, + "grad_norm": 0.53125, + "learning_rate": 2.01527635540299e-06, + "loss": 0.0158, + "mean_token_accuracy": 0.9937670677900314, + "num_tokens": 305843374.0, + "step": 2565 + }, + { + "entropy": 0.6345420032739639, + "epoch": 5.846307385229541, + "grad_norm": 0.5390625, + "learning_rate": 2.0134287857915864e-06, + "loss": 0.0177, + "mean_token_accuracy": 0.9944725409150124, + "num_tokens": 305963520.0, + "step": 2566 + }, + { + "entropy": 0.6369866132736206, + "epoch": 5.848588537211292, + "grad_norm": 0.671875, + "learning_rate": 2.0115814923486093e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.9936189129948616, + "num_tokens": 306082869.0, + "step": 2567 + }, + { + "entropy": 0.640837587416172, + "epoch": 5.8508696891930425, + "grad_norm": 0.55859375, + "learning_rate": 2.009734476122547e-06, + "loss": 0.0133, + "mean_token_accuracy": 0.9954570829868317, + "num_tokens": 306202363.0, + "step": 2568 + }, + { + "entropy": 0.6374751776456833, + "epoch": 5.853150841174793, + "grad_norm": 0.546875, + "learning_rate": 2.007887738161732e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9950167685747147, + "num_tokens": 306322187.0, + "step": 2569 + }, + { + "entropy": 0.6338777616620064, + "epoch": 5.855431993156544, + "grad_norm": 0.625, + "learning_rate": 2.006041279514336e-06, + "loss": 0.0216, + "mean_token_accuracy": 0.9924641251564026, + "num_tokens": 306441089.0, + "step": 2570 + }, + { + "entropy": 0.6392702981829643, + "epoch": 5.857713145138295, + "grad_norm": 0.453125, + "learning_rate": 2.004195101228374e-06, + "loss": 0.0132, + "mean_token_accuracy": 0.99554143846035, + "num_tokens": 306560967.0, + "step": 2571 + }, + { + "entropy": 0.6398747339844704, + "epoch": 5.859994297120045, + "grad_norm": 0.5078125, + "learning_rate": 2.002349204351701e-06, + "loss": 0.0146, + "mean_token_accuracy": 0.9951138347387314, + "num_tokens": 306679780.0, + "step": 2572 + }, + { + "entropy": 0.6363858580589294, + "epoch": 5.862275449101796, + "grad_norm": 0.57421875, + "learning_rate": 2.0005035899320115e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9959112107753754, + "num_tokens": 306799183.0, + "step": 2573 + }, + { + "entropy": 0.6382249891757965, + "epoch": 5.864556601083547, + "grad_norm": 0.41015625, + "learning_rate": 1.998658259016841e-06, + "loss": 0.0137, + "mean_token_accuracy": 0.9960094764828682, + "num_tokens": 306918335.0, + "step": 2574 + }, + { + "entropy": 0.6400691047310829, + "epoch": 5.8668377530652975, + "grad_norm": 0.83203125, + "learning_rate": 1.996813212653564e-06, + "loss": 0.0279, + "mean_token_accuracy": 0.9931002929806709, + "num_tokens": 307037684.0, + "step": 2575 + }, + { + "entropy": 0.6419222429394722, + "epoch": 5.869118905047049, + "grad_norm": 0.62890625, + "learning_rate": 1.9949684518893926e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.993940606713295, + "num_tokens": 307156814.0, + "step": 2576 + }, + { + "entropy": 0.6387804299592972, + "epoch": 5.8714000570288, + "grad_norm": 0.37109375, + "learning_rate": 1.9931239777713794e-06, + "loss": 0.0107, + "mean_token_accuracy": 0.9969074055552483, + "num_tokens": 307276400.0, + "step": 2577 + }, + { + "entropy": 0.6326093077659607, + "epoch": 5.873681209010551, + "grad_norm": 0.46484375, + "learning_rate": 1.9912797913464098e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.9952221214771271, + "num_tokens": 307395826.0, + "step": 2578 + }, + { + "entropy": 0.6366284787654877, + "epoch": 5.875962360992301, + "grad_norm": 0.59375, + "learning_rate": 1.989435893661209e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9935618564486504, + "num_tokens": 307515142.0, + "step": 2579 + }, + { + "entropy": 0.6377076804637909, + "epoch": 5.878243512974052, + "grad_norm": 0.416015625, + "learning_rate": 1.9875922857623387e-06, + "loss": 0.0144, + "mean_token_accuracy": 0.9957499280571938, + "num_tokens": 307634156.0, + "step": 2580 + }, + { + "entropy": 0.6358212158083916, + "epoch": 5.880524664955803, + "grad_norm": 0.43359375, + "learning_rate": 1.985748968696194e-06, + "loss": 0.0134, + "mean_token_accuracy": 0.9955461397767067, + "num_tokens": 307752869.0, + "step": 2581 + }, + { + "entropy": 0.6379106789827347, + "epoch": 5.8828058169375534, + "grad_norm": 0.462890625, + "learning_rate": 1.9839059435090073e-06, + "loss": 0.0132, + "mean_token_accuracy": 0.9949470534920692, + "num_tokens": 307871661.0, + "step": 2582 + }, + { + "entropy": 0.6311449483036995, + "epoch": 5.885086968919304, + "grad_norm": 0.470703125, + "learning_rate": 1.9820632112468437e-06, + "loss": 0.0141, + "mean_token_accuracy": 0.994870625436306, + "num_tokens": 307991473.0, + "step": 2583 + }, + { + "entropy": 0.6332216486334801, + "epoch": 5.887368120901055, + "grad_norm": 0.4453125, + "learning_rate": 1.9802207729556023e-06, + "loss": 0.0122, + "mean_token_accuracy": 0.9964180812239647, + "num_tokens": 308110751.0, + "step": 2584 + }, + { + "entropy": 0.6361869797110558, + "epoch": 5.889649272882806, + "grad_norm": 0.52734375, + "learning_rate": 1.9783786296810148e-06, + "loss": 0.0169, + "mean_token_accuracy": 0.9971845149993896, + "num_tokens": 308230590.0, + "step": 2585 + }, + { + "entropy": 0.638336293399334, + "epoch": 5.891930424864556, + "grad_norm": 0.43359375, + "learning_rate": 1.9765367824686467e-06, + "loss": 0.0113, + "mean_token_accuracy": 0.9960629642009735, + "num_tokens": 308349310.0, + "step": 2586 + }, + { + "entropy": 0.6362483575940132, + "epoch": 5.894211576846307, + "grad_norm": 0.44921875, + "learning_rate": 1.974695232363895e-06, + "loss": 0.0127, + "mean_token_accuracy": 0.9951998814940453, + "num_tokens": 308468637.0, + "step": 2587 + }, + { + "entropy": 0.6379825994372368, + "epoch": 5.896492728828058, + "grad_norm": 0.62890625, + "learning_rate": 1.9728539804119893e-06, + "loss": 0.0211, + "mean_token_accuracy": 0.9930427670478821, + "num_tokens": 308588043.0, + "step": 2588 + }, + { + "entropy": 0.634053498506546, + "epoch": 5.898773880809809, + "grad_norm": 0.375, + "learning_rate": 1.9710130276579864e-06, + "loss": 0.0071, + "mean_token_accuracy": 0.997446745634079, + "num_tokens": 308706693.0, + "step": 2589 + }, + { + "entropy": 0.6336224302649498, + "epoch": 5.90105503279156, + "grad_norm": 0.5625, + "learning_rate": 1.969172375146776e-06, + "loss": 0.0125, + "mean_token_accuracy": 0.9962071105837822, + "num_tokens": 308826188.0, + "step": 2590 + }, + { + "entropy": 0.6355944126844406, + "epoch": 5.903336184773311, + "grad_norm": 0.53515625, + "learning_rate": 1.9673320239230783e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.994892306625843, + "num_tokens": 308945640.0, + "step": 2591 + }, + { + "entropy": 0.6322052329778671, + "epoch": 5.9056173367550615, + "grad_norm": 0.47265625, + "learning_rate": 1.9654919750314396e-06, + "loss": 0.0114, + "mean_token_accuracy": 0.9951648190617561, + "num_tokens": 309064759.0, + "step": 2592 + }, + { + "entropy": 0.6378982439637184, + "epoch": 5.907898488736812, + "grad_norm": 0.48046875, + "learning_rate": 1.9636522295162375e-06, + "loss": 0.018, + "mean_token_accuracy": 0.9948143884539604, + "num_tokens": 309184515.0, + "step": 2593 + }, + { + "entropy": 0.6415349170565605, + "epoch": 5.910179640718563, + "grad_norm": 0.63671875, + "learning_rate": 1.9618127884216753e-06, + "loss": 0.021, + "mean_token_accuracy": 0.9933547973632812, + "num_tokens": 309303778.0, + "step": 2594 + }, + { + "entropy": 0.6433418467640877, + "epoch": 5.912460792700314, + "grad_norm": 0.49609375, + "learning_rate": 1.959973652791784e-06, + "loss": 0.0166, + "mean_token_accuracy": 0.9942664280533791, + "num_tokens": 309423947.0, + "step": 2595 + }, + { + "entropy": 0.6354240030050278, + "epoch": 5.914741944682064, + "grad_norm": 0.703125, + "learning_rate": 1.9581348236704217e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.9928667023777962, + "num_tokens": 309543345.0, + "step": 2596 + }, + { + "entropy": 0.635118305683136, + "epoch": 5.917023096663815, + "grad_norm": 0.72265625, + "learning_rate": 1.9562963021012723e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.9926005452871323, + "num_tokens": 309662903.0, + "step": 2597 + }, + { + "entropy": 0.6380796954035759, + "epoch": 5.919304248645566, + "grad_norm": 0.462890625, + "learning_rate": 1.954458089127845e-06, + "loss": 0.011, + "mean_token_accuracy": 0.9960594773292542, + "num_tokens": 309782455.0, + "step": 2598 + }, + { + "entropy": 0.6372536718845367, + "epoch": 5.9215854006273165, + "grad_norm": 0.671875, + "learning_rate": 1.952620185793475e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9944698140025139, + "num_tokens": 309901014.0, + "step": 2599 + }, + { + "entropy": 0.6314385235309601, + "epoch": 5.923866552609067, + "grad_norm": 0.515625, + "learning_rate": 1.9507825931413193e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.9954736530780792, + "num_tokens": 310020358.0, + "step": 2600 + }, + { + "entropy": 0.633869506418705, + "epoch": 5.926147704590818, + "grad_norm": 0.54296875, + "learning_rate": 1.9489453122143605e-06, + "loss": 0.015, + "mean_token_accuracy": 0.99476058781147, + "num_tokens": 310139272.0, + "step": 2601 + }, + { + "entropy": 0.630821019411087, + "epoch": 5.92842885657257, + "grad_norm": 0.455078125, + "learning_rate": 1.947108344055404e-06, + "loss": 0.0114, + "mean_token_accuracy": 0.9948229119181633, + "num_tokens": 310258609.0, + "step": 2602 + }, + { + "entropy": 0.6381283029913902, + "epoch": 5.93071000855432, + "grad_norm": 0.66796875, + "learning_rate": 1.9452716897070785e-06, + "loss": 0.0193, + "mean_token_accuracy": 0.9931163042783737, + "num_tokens": 310377857.0, + "step": 2603 + }, + { + "entropy": 0.6298312395811081, + "epoch": 5.932991160536071, + "grad_norm": 0.51171875, + "learning_rate": 1.943435350211832e-06, + "loss": 0.0134, + "mean_token_accuracy": 0.9943474605679512, + "num_tokens": 310497101.0, + "step": 2604 + }, + { + "entropy": 0.6363061890006065, + "epoch": 5.935272312517822, + "grad_norm": 0.484375, + "learning_rate": 1.941599326611935e-06, + "loss": 0.0184, + "mean_token_accuracy": 0.9930847138166428, + "num_tokens": 310615869.0, + "step": 2605 + }, + { + "entropy": 0.6372633054852486, + "epoch": 5.937553464499572, + "grad_norm": 0.5859375, + "learning_rate": 1.939763619949481e-06, + "loss": 0.0227, + "mean_token_accuracy": 0.9929855838418007, + "num_tokens": 310735243.0, + "step": 2606 + }, + { + "entropy": 0.6363314166665077, + "epoch": 5.939834616481323, + "grad_norm": 0.609375, + "learning_rate": 1.9379282312663797e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9936799556016922, + "num_tokens": 310854336.0, + "step": 2607 + }, + { + "entropy": 0.6360226199030876, + "epoch": 5.942115768463074, + "grad_norm": 0.58203125, + "learning_rate": 1.936093161604363e-06, + "loss": 0.0113, + "mean_token_accuracy": 0.9958169758319855, + "num_tokens": 310973667.0, + "step": 2608 + }, + { + "entropy": 0.6383411139249802, + "epoch": 5.944396920444825, + "grad_norm": 0.65625, + "learning_rate": 1.9342584120049824e-06, + "loss": 0.0217, + "mean_token_accuracy": 0.994138278067112, + "num_tokens": 311092954.0, + "step": 2609 + }, + { + "entropy": 0.6392907351255417, + "epoch": 5.946678072426575, + "grad_norm": 0.53125, + "learning_rate": 1.9324239835096044e-06, + "loss": 0.0158, + "mean_token_accuracy": 0.9937454462051392, + "num_tokens": 311212711.0, + "step": 2610 + }, + { + "entropy": 0.637532539665699, + "epoch": 5.948959224408326, + "grad_norm": 0.79296875, + "learning_rate": 1.930589877159415e-06, + "loss": 0.0294, + "mean_token_accuracy": 0.9914610758423805, + "num_tokens": 311332413.0, + "step": 2611 + }, + { + "entropy": 0.6315947398543358, + "epoch": 5.951240376390077, + "grad_norm": 0.70703125, + "learning_rate": 1.928756093995419e-06, + "loss": 0.025, + "mean_token_accuracy": 0.9927035123109818, + "num_tokens": 311451831.0, + "step": 2612 + }, + { + "entropy": 0.6373242363333702, + "epoch": 5.9535215283718275, + "grad_norm": 0.5234375, + "learning_rate": 1.9269226350584357e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.9939387291669846, + "num_tokens": 311570949.0, + "step": 2613 + }, + { + "entropy": 0.6367953345179558, + "epoch": 5.955802680353578, + "grad_norm": 0.3984375, + "learning_rate": 1.9250895013891015e-06, + "loss": 0.0128, + "mean_token_accuracy": 0.9964085072278976, + "num_tokens": 311690312.0, + "step": 2614 + }, + { + "entropy": 0.6361260786652565, + "epoch": 5.95808383233533, + "grad_norm": 0.609375, + "learning_rate": 1.9232566940278675e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.9936571940779686, + "num_tokens": 311809652.0, + "step": 2615 + }, + { + "entropy": 0.6375875025987625, + "epoch": 5.9603649843170805, + "grad_norm": 0.546875, + "learning_rate": 1.9214242140149987e-06, + "loss": 0.0192, + "mean_token_accuracy": 0.9935875609517097, + "num_tokens": 311928959.0, + "step": 2616 + }, + { + "entropy": 0.6347905471920967, + "epoch": 5.962646136298831, + "grad_norm": 0.671875, + "learning_rate": 1.9195920623905766e-06, + "loss": 0.0228, + "mean_token_accuracy": 0.992525614798069, + "num_tokens": 312048052.0, + "step": 2617 + }, + { + "entropy": 0.6396196186542511, + "epoch": 5.964927288280582, + "grad_norm": 0.4765625, + "learning_rate": 1.9177602401944943e-06, + "loss": 0.0132, + "mean_token_accuracy": 0.9963365867733955, + "num_tokens": 312166432.0, + "step": 2618 + }, + { + "entropy": 0.6414041519165039, + "epoch": 5.967208440262333, + "grad_norm": 0.6796875, + "learning_rate": 1.915928748466459e-06, + "loss": 0.0221, + "mean_token_accuracy": 0.993456557393074, + "num_tokens": 312286275.0, + "step": 2619 + }, + { + "entropy": 0.6409224718809128, + "epoch": 5.969489592244083, + "grad_norm": 0.56640625, + "learning_rate": 1.9140975882459912e-06, + "loss": 0.0177, + "mean_token_accuracy": 0.9949458092451096, + "num_tokens": 312405522.0, + "step": 2620 + }, + { + "entropy": 0.638986386358738, + "epoch": 5.971770744225834, + "grad_norm": 0.6953125, + "learning_rate": 1.9122667605724202e-06, + "loss": 0.0221, + "mean_token_accuracy": 0.9925295040011406, + "num_tokens": 312524453.0, + "step": 2621 + }, + { + "entropy": 0.6337964311242104, + "epoch": 5.974051896207585, + "grad_norm": 0.4765625, + "learning_rate": 1.910436266484889e-06, + "loss": 0.0159, + "mean_token_accuracy": 0.9940472841262817, + "num_tokens": 312643809.0, + "step": 2622 + }, + { + "entropy": 0.635272815823555, + "epoch": 5.9763330481893355, + "grad_norm": 0.416015625, + "learning_rate": 1.908606107022351e-06, + "loss": 0.0116, + "mean_token_accuracy": 0.9954419285058975, + "num_tokens": 312763773.0, + "step": 2623 + }, + { + "entropy": 0.6388294398784637, + "epoch": 5.978614200171086, + "grad_norm": 0.5859375, + "learning_rate": 1.9067762832235698e-06, + "loss": 0.0246, + "mean_token_accuracy": 0.9932216182351112, + "num_tokens": 312883374.0, + "step": 2624 + }, + { + "entropy": 0.6439246311783791, + "epoch": 5.980895352152837, + "grad_norm": 0.50390625, + "learning_rate": 1.9049467961271184e-06, + "loss": 0.0139, + "mean_token_accuracy": 0.9963691383600235, + "num_tokens": 313002670.0, + "step": 2625 + }, + { + "entropy": 0.6309181898832321, + "epoch": 5.983176504134588, + "grad_norm": 0.51953125, + "learning_rate": 1.9031176467713763e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.9952955171465874, + "num_tokens": 313122550.0, + "step": 2626 + }, + { + "entropy": 0.6368819177150726, + "epoch": 5.985457656116338, + "grad_norm": 0.482421875, + "learning_rate": 1.9012888361945354e-06, + "loss": 0.0163, + "mean_token_accuracy": 0.9953158050775528, + "num_tokens": 313242107.0, + "step": 2627 + }, + { + "entropy": 0.6378467455506325, + "epoch": 5.98773880809809, + "grad_norm": 0.51171875, + "learning_rate": 1.8994603654345917e-06, + "loss": 0.0157, + "mean_token_accuracy": 0.9959428906440735, + "num_tokens": 313361112.0, + "step": 2628 + }, + { + "entropy": 0.6362001672387123, + "epoch": 5.99001996007984, + "grad_norm": 0.48046875, + "learning_rate": 1.897632235529351e-06, + "loss": 0.0092, + "mean_token_accuracy": 0.9968788847327232, + "num_tokens": 313480316.0, + "step": 2629 + }, + { + "entropy": 0.6385065764188766, + "epoch": 5.992301112061591, + "grad_norm": 0.5625, + "learning_rate": 1.8958044475164242e-06, + "loss": 0.0215, + "mean_token_accuracy": 0.9939796626567841, + "num_tokens": 313599984.0, + "step": 2630 + }, + { + "entropy": 0.6347980722784996, + "epoch": 5.994582264043342, + "grad_norm": 0.53125, + "learning_rate": 1.8939770024332294e-06, + "loss": 0.0194, + "mean_token_accuracy": 0.9936322569847107, + "num_tokens": 313719162.0, + "step": 2631 + }, + { + "entropy": 0.6355270817875862, + "epoch": 5.996863416025093, + "grad_norm": 0.6328125, + "learning_rate": 1.8921499013169876e-06, + "loss": 0.0239, + "mean_token_accuracy": 0.9924603179097176, + "num_tokens": 313838370.0, + "step": 2632 + }, + { + "entropy": 0.6406752392649651, + "epoch": 5.999144568006844, + "grad_norm": 0.431640625, + "learning_rate": 1.8903231452047265e-06, + "loss": 0.0133, + "mean_token_accuracy": 0.995189443230629, + "num_tokens": 313957523.0, + "step": 2633 + }, + { + "entropy": 0.6305730740229288, + "epoch": 6.0, + "grad_norm": 0.6796875, + "learning_rate": 1.8884967351332778e-06, + "loss": 0.0124, + "mean_token_accuracy": 0.996270497639974, + "num_tokens": 314000868.0, + "step": 2634 + }, + { + "entropy": 0.6361557617783546, + "epoch": 6.002281151981751, + "grad_norm": 0.4453125, + "learning_rate": 1.886670672139277e-06, + "loss": 0.0125, + "mean_token_accuracy": 0.9961081445217133, + "num_tokens": 314120589.0, + "step": 2635 + }, + { + "entropy": 0.6381090432405472, + "epoch": 6.004562303963501, + "grad_norm": 0.51953125, + "learning_rate": 1.884844957259163e-06, + "loss": 0.0125, + "mean_token_accuracy": 0.9954915940761566, + "num_tokens": 314239847.0, + "step": 2636 + }, + { + "entropy": 0.6389171555638313, + "epoch": 6.006843455945252, + "grad_norm": 0.55859375, + "learning_rate": 1.8830195915291741e-06, + "loss": 0.021, + "mean_token_accuracy": 0.9938817620277405, + "num_tokens": 314359036.0, + "step": 2637 + }, + { + "entropy": 0.6384937614202499, + "epoch": 6.009124607927003, + "grad_norm": 0.486328125, + "learning_rate": 1.8811945759853543e-06, + "loss": 0.0149, + "mean_token_accuracy": 0.9962089359760284, + "num_tokens": 314478119.0, + "step": 2638 + }, + { + "entropy": 0.6379852220416069, + "epoch": 6.011405759908754, + "grad_norm": 0.5234375, + "learning_rate": 1.879369911663546e-06, + "loss": 0.0179, + "mean_token_accuracy": 0.9938038289546967, + "num_tokens": 314597601.0, + "step": 2639 + }, + { + "entropy": 0.6347741559147835, + "epoch": 6.013686911890504, + "grad_norm": 0.5859375, + "learning_rate": 1.8775455995993941e-06, + "loss": 0.0209, + "mean_token_accuracy": 0.9926309362053871, + "num_tokens": 314717069.0, + "step": 2640 + }, + { + "epoch": 6.013686911890504, + "eval_entropy": 0.6378532754604355, + "eval_loss": 0.020548511296510696, + "eval_mean_token_accuracy": 0.993599727806483, + "eval_num_tokens": 314717069.0, + "eval_runtime": 177.4521, + "eval_samples_per_second": 47.252, + "eval_steps_per_second": 1.482, + "step": 2640 + }, + { + "entropy": 0.6339016556739807, + "epoch": 6.015968063872256, + "grad_norm": 0.392578125, + "learning_rate": 1.875721640828344e-06, + "loss": 0.0122, + "mean_token_accuracy": 0.996279165148735, + "num_tokens": 314836171.0, + "step": 2641 + }, + { + "entropy": 0.6327250450849533, + "epoch": 6.018249215854007, + "grad_norm": 0.58984375, + "learning_rate": 1.8738980363856376e-06, + "loss": 0.0219, + "mean_token_accuracy": 0.9931067377328873, + "num_tokens": 314955104.0, + "step": 2642 + }, + { + "entropy": 0.6377785429358482, + "epoch": 6.020530367835757, + "grad_norm": 0.447265625, + "learning_rate": 1.8720747873063184e-06, + "loss": 0.0163, + "mean_token_accuracy": 0.9963974058628082, + "num_tokens": 315074483.0, + "step": 2643 + }, + { + "entropy": 0.637920431792736, + "epoch": 6.022811519817508, + "grad_norm": 0.412109375, + "learning_rate": 1.870251894625227e-06, + "loss": 0.013, + "mean_token_accuracy": 0.9957775846123695, + "num_tokens": 315193603.0, + "step": 2644 + }, + { + "entropy": 0.6392377093434334, + "epoch": 6.025092671799259, + "grad_norm": 0.5546875, + "learning_rate": 1.8684293593770026e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9941689446568489, + "num_tokens": 315312360.0, + "step": 2645 + }, + { + "entropy": 0.630735382437706, + "epoch": 6.0273738237810095, + "grad_norm": 0.443359375, + "learning_rate": 1.866607182596081e-06, + "loss": 0.0153, + "mean_token_accuracy": 0.9960184320807457, + "num_tokens": 315431596.0, + "step": 2646 + }, + { + "entropy": 0.637915201485157, + "epoch": 6.02965497576276, + "grad_norm": 0.55078125, + "learning_rate": 1.8647853653166953e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9955686628818512, + "num_tokens": 315550946.0, + "step": 2647 + }, + { + "entropy": 0.6370959356427193, + "epoch": 6.031936127744511, + "grad_norm": 0.52734375, + "learning_rate": 1.862963908572872e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9946969673037529, + "num_tokens": 315669807.0, + "step": 2648 + }, + { + "entropy": 0.6379946246743202, + "epoch": 6.034217279726262, + "grad_norm": 0.6015625, + "learning_rate": 1.8611428133984365e-06, + "loss": 0.0204, + "mean_token_accuracy": 0.9926190301775932, + "num_tokens": 315788885.0, + "step": 2649 + }, + { + "entropy": 0.6382901817560196, + "epoch": 6.036498431708012, + "grad_norm": 0.5078125, + "learning_rate": 1.8593220808270057e-06, + "loss": 0.0125, + "mean_token_accuracy": 0.994327962398529, + "num_tokens": 315907721.0, + "step": 2650 + }, + { + "entropy": 0.6357349455356598, + "epoch": 6.038779583689763, + "grad_norm": 0.578125, + "learning_rate": 1.857501711891993e-06, + "loss": 0.0188, + "mean_token_accuracy": 0.9932722821831703, + "num_tokens": 316027590.0, + "step": 2651 + }, + { + "entropy": 0.6345123425126076, + "epoch": 6.041060735671514, + "grad_norm": 0.490234375, + "learning_rate": 1.8556817076266059e-06, + "loss": 0.0139, + "mean_token_accuracy": 0.9946858957409859, + "num_tokens": 316146404.0, + "step": 2652 + }, + { + "entropy": 0.6386755481362343, + "epoch": 6.0433418876532645, + "grad_norm": 0.431640625, + "learning_rate": 1.8538620690638414e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.9945316165685654, + "num_tokens": 316265787.0, + "step": 2653 + }, + { + "entropy": 0.6394455283880234, + "epoch": 6.045623039635016, + "grad_norm": 0.60546875, + "learning_rate": 1.8520427972364924e-06, + "loss": 0.0232, + "mean_token_accuracy": 0.9935853853821754, + "num_tokens": 316385681.0, + "step": 2654 + }, + { + "entropy": 0.636052094399929, + "epoch": 6.047904191616767, + "grad_norm": 0.435546875, + "learning_rate": 1.8502238931771422e-06, + "loss": 0.0105, + "mean_token_accuracy": 0.9962378889322281, + "num_tokens": 316504875.0, + "step": 2655 + }, + { + "entropy": 0.6369018480181694, + "epoch": 6.050185343598518, + "grad_norm": 0.458984375, + "learning_rate": 1.848405357918166e-06, + "loss": 0.0132, + "mean_token_accuracy": 0.9948622807860374, + "num_tokens": 316623818.0, + "step": 2656 + }, + { + "entropy": 0.6404262334108353, + "epoch": 6.052466495580268, + "grad_norm": 0.53125, + "learning_rate": 1.8465871924917295e-06, + "loss": 0.0197, + "mean_token_accuracy": 0.9955299496650696, + "num_tokens": 316742453.0, + "step": 2657 + }, + { + "entropy": 0.6348710209131241, + "epoch": 6.054747647562019, + "grad_norm": 0.458984375, + "learning_rate": 1.8447693979297882e-06, + "loss": 0.0196, + "mean_token_accuracy": 0.9936573803424835, + "num_tokens": 316861944.0, + "step": 2658 + }, + { + "entropy": 0.6394931524991989, + "epoch": 6.05702879954377, + "grad_norm": 0.494140625, + "learning_rate": 1.8429519752640862e-06, + "loss": 0.019, + "mean_token_accuracy": 0.9952221661806107, + "num_tokens": 316981592.0, + "step": 2659 + }, + { + "entropy": 0.6371202692389488, + "epoch": 6.05930995152552, + "grad_norm": 0.55078125, + "learning_rate": 1.8411349255261587e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.9939974844455719, + "num_tokens": 317101525.0, + "step": 2660 + }, + { + "entropy": 0.6331505477428436, + "epoch": 6.061591103507271, + "grad_norm": 0.4921875, + "learning_rate": 1.8393182497473271e-06, + "loss": 0.0124, + "mean_token_accuracy": 0.9950246065855026, + "num_tokens": 317221464.0, + "step": 2661 + }, + { + "entropy": 0.634735994040966, + "epoch": 6.063872255489022, + "grad_norm": 0.8125, + "learning_rate": 1.837501948958702e-06, + "loss": 0.0281, + "mean_token_accuracy": 0.9922359436750412, + "num_tokens": 317340938.0, + "step": 2662 + }, + { + "entropy": 0.6468506529927254, + "epoch": 6.066153407470773, + "grad_norm": 0.48828125, + "learning_rate": 1.8356860241911817e-06, + "loss": 0.0136, + "mean_token_accuracy": 0.9945531859993935, + "num_tokens": 317461148.0, + "step": 2663 + }, + { + "entropy": 0.6390712186694145, + "epoch": 6.068434559452523, + "grad_norm": 0.53125, + "learning_rate": 1.833870476475448e-06, + "loss": 0.0197, + "mean_token_accuracy": 0.9941346049308777, + "num_tokens": 317580466.0, + "step": 2664 + }, + { + "entropy": 0.6368519589304924, + "epoch": 6.070715711434274, + "grad_norm": 0.5234375, + "learning_rate": 1.8320553068419716e-06, + "loss": 0.0168, + "mean_token_accuracy": 0.9935498163104057, + "num_tokens": 317699616.0, + "step": 2665 + }, + { + "entropy": 0.6343102231621742, + "epoch": 6.072996863416025, + "grad_norm": 0.53125, + "learning_rate": 1.830240516321008e-06, + "loss": 0.0124, + "mean_token_accuracy": 0.9963266551494598, + "num_tokens": 317818762.0, + "step": 2666 + }, + { + "entropy": 0.6385525986552238, + "epoch": 6.0752780153977755, + "grad_norm": 0.46484375, + "learning_rate": 1.8284261059425972e-06, + "loss": 0.0125, + "mean_token_accuracy": 0.9950185641646385, + "num_tokens": 317938160.0, + "step": 2667 + }, + { + "entropy": 0.6315932348370552, + "epoch": 6.077559167379527, + "grad_norm": 0.57421875, + "learning_rate": 1.8266120767365642e-06, + "loss": 0.013, + "mean_token_accuracy": 0.99539964646101, + "num_tokens": 318056809.0, + "step": 2668 + }, + { + "entropy": 0.6344187259674072, + "epoch": 6.079840319361278, + "grad_norm": 0.470703125, + "learning_rate": 1.8247984297325156e-06, + "loss": 0.0166, + "mean_token_accuracy": 0.9954681098461151, + "num_tokens": 318176008.0, + "step": 2669 + }, + { + "entropy": 0.6334509253501892, + "epoch": 6.0821214713430285, + "grad_norm": 0.796875, + "learning_rate": 1.8229851659598425e-06, + "loss": 0.0262, + "mean_token_accuracy": 0.9922873303294182, + "num_tokens": 318295337.0, + "step": 2670 + }, + { + "entropy": 0.6305963099002838, + "epoch": 6.084402623324779, + "grad_norm": 0.443359375, + "learning_rate": 1.8211722864477197e-06, + "loss": 0.0141, + "mean_token_accuracy": 0.9960333406925201, + "num_tokens": 318414308.0, + "step": 2671 + }, + { + "entropy": 0.6385723426938057, + "epoch": 6.08668377530653, + "grad_norm": 0.55859375, + "learning_rate": 1.819359792225101e-06, + "loss": 0.0179, + "mean_token_accuracy": 0.9931806400418282, + "num_tokens": 318534143.0, + "step": 2672 + }, + { + "entropy": 0.6410021558403969, + "epoch": 6.088964927288281, + "grad_norm": 0.421875, + "learning_rate": 1.8175476843207245e-06, + "loss": 0.0143, + "mean_token_accuracy": 0.9949770048260689, + "num_tokens": 318653566.0, + "step": 2673 + }, + { + "entropy": 0.6328125819563866, + "epoch": 6.091246079270031, + "grad_norm": 0.443359375, + "learning_rate": 1.8157359637631078e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9948703870177269, + "num_tokens": 318772298.0, + "step": 2674 + }, + { + "entropy": 0.6329041570425034, + "epoch": 6.093527231251782, + "grad_norm": 0.4921875, + "learning_rate": 1.813924631580547e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.9947662875056267, + "num_tokens": 318891419.0, + "step": 2675 + }, + { + "entropy": 0.634616993367672, + "epoch": 6.095808383233533, + "grad_norm": 0.51171875, + "learning_rate": 1.8121136888011198e-06, + "loss": 0.0163, + "mean_token_accuracy": 0.9941568896174431, + "num_tokens": 319010051.0, + "step": 2676 + }, + { + "entropy": 0.6367817372083664, + "epoch": 6.0980895352152835, + "grad_norm": 0.400390625, + "learning_rate": 1.810303136452683e-06, + "loss": 0.0124, + "mean_token_accuracy": 0.9960006698966026, + "num_tokens": 319129598.0, + "step": 2677 + }, + { + "entropy": 0.6382457688450813, + "epoch": 6.100370687197034, + "grad_norm": 0.58984375, + "learning_rate": 1.8084929755628707e-06, + "loss": 0.0209, + "mean_token_accuracy": 0.9940702095627785, + "num_tokens": 319248858.0, + "step": 2678 + }, + { + "entropy": 0.6369543448090553, + "epoch": 6.102651839178785, + "grad_norm": 0.63671875, + "learning_rate": 1.8066832071590967e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9927216693758965, + "num_tokens": 319368432.0, + "step": 2679 + }, + { + "entropy": 0.6358160451054573, + "epoch": 6.104932991160536, + "grad_norm": 0.373046875, + "learning_rate": 1.8048738322685478e-06, + "loss": 0.0123, + "mean_token_accuracy": 0.9973326921463013, + "num_tokens": 319487578.0, + "step": 2680 + }, + { + "entropy": 0.6375992745161057, + "epoch": 6.107214143142287, + "grad_norm": 0.66015625, + "learning_rate": 1.8030648519181926e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.9947179481387138, + "num_tokens": 319606894.0, + "step": 2681 + }, + { + "entropy": 0.6357466652989388, + "epoch": 6.109495295124038, + "grad_norm": 0.478515625, + "learning_rate": 1.8012562671347721e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9948904067277908, + "num_tokens": 319725902.0, + "step": 2682 + }, + { + "entropy": 0.6393820643424988, + "epoch": 6.111776447105789, + "grad_norm": 0.482421875, + "learning_rate": 1.7994480789448043e-06, + "loss": 0.012, + "mean_token_accuracy": 0.9960585534572601, + "num_tokens": 319845729.0, + "step": 2683 + }, + { + "entropy": 0.6359532922506332, + "epoch": 6.114057599087539, + "grad_norm": 0.455078125, + "learning_rate": 1.7976402883745836e-06, + "loss": 0.0147, + "mean_token_accuracy": 0.9944631233811378, + "num_tokens": 319965591.0, + "step": 2684 + }, + { + "entropy": 0.6365648284554482, + "epoch": 6.11633875106929, + "grad_norm": 0.435546875, + "learning_rate": 1.7958328964501749e-06, + "loss": 0.0118, + "mean_token_accuracy": 0.9966108128428459, + "num_tokens": 320084495.0, + "step": 2685 + }, + { + "entropy": 0.6354339346289635, + "epoch": 6.118619903051041, + "grad_norm": 0.390625, + "learning_rate": 1.7940259041974189e-06, + "loss": 0.0149, + "mean_token_accuracy": 0.9956609830260277, + "num_tokens": 320203892.0, + "step": 2686 + }, + { + "entropy": 0.6393050402402878, + "epoch": 6.120901055032792, + "grad_norm": 0.435546875, + "learning_rate": 1.7922193126419306e-06, + "loss": 0.0108, + "mean_token_accuracy": 0.9959715232253075, + "num_tokens": 320323188.0, + "step": 2687 + }, + { + "entropy": 0.6378999650478363, + "epoch": 6.123182207014542, + "grad_norm": 0.640625, + "learning_rate": 1.7904131228090965e-06, + "loss": 0.0242, + "mean_token_accuracy": 0.9924760535359383, + "num_tokens": 320442574.0, + "step": 2688 + }, + { + "entropy": 0.6397502273321152, + "epoch": 6.125463358996293, + "grad_norm": 0.57421875, + "learning_rate": 1.7886073357240746e-06, + "loss": 0.0192, + "mean_token_accuracy": 0.9930745363235474, + "num_tokens": 320561929.0, + "step": 2689 + }, + { + "entropy": 0.6357740834355354, + "epoch": 6.127744510978044, + "grad_norm": 0.546875, + "learning_rate": 1.7868019524117957e-06, + "loss": 0.021, + "mean_token_accuracy": 0.9922360554337502, + "num_tokens": 320680844.0, + "step": 2690 + }, + { + "entropy": 0.6358489319682121, + "epoch": 6.1300256629597945, + "grad_norm": 0.5546875, + "learning_rate": 1.7849969738969592e-06, + "loss": 0.0216, + "mean_token_accuracy": 0.9935157895088196, + "num_tokens": 320799668.0, + "step": 2691 + }, + { + "entropy": 0.6376421898603439, + "epoch": 6.132306814941545, + "grad_norm": 0.578125, + "learning_rate": 1.783192401204037e-06, + "loss": 0.0149, + "mean_token_accuracy": 0.9953818991780281, + "num_tokens": 320919174.0, + "step": 2692 + }, + { + "entropy": 0.6401348859071732, + "epoch": 6.134587966923296, + "grad_norm": 0.470703125, + "learning_rate": 1.7813882353572692e-06, + "loss": 0.0147, + "mean_token_accuracy": 0.9968213587999344, + "num_tokens": 321038430.0, + "step": 2693 + }, + { + "entropy": 0.6402238681912422, + "epoch": 6.136869118905047, + "grad_norm": 0.6328125, + "learning_rate": 1.7795844773806653e-06, + "loss": 0.0206, + "mean_token_accuracy": 0.9926995262503624, + "num_tokens": 321157948.0, + "step": 2694 + }, + { + "entropy": 0.6364776864647865, + "epoch": 6.139150270886798, + "grad_norm": 0.58203125, + "learning_rate": 1.7777811282980047e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9941646605730057, + "num_tokens": 321277539.0, + "step": 2695 + }, + { + "entropy": 0.6388070359826088, + "epoch": 6.141431422868549, + "grad_norm": 0.482421875, + "learning_rate": 1.7759781891328321e-06, + "loss": 0.0149, + "mean_token_accuracy": 0.994741752743721, + "num_tokens": 321397092.0, + "step": 2696 + }, + { + "entropy": 0.6329629644751549, + "epoch": 6.1437125748503, + "grad_norm": 0.625, + "learning_rate": 1.7741756609084616e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.9959028214216232, + "num_tokens": 321517105.0, + "step": 2697 + }, + { + "entropy": 0.6396226212382317, + "epoch": 6.14599372683205, + "grad_norm": 0.494140625, + "learning_rate": 1.772373544647973e-06, + "loss": 0.0193, + "mean_token_accuracy": 0.9924875497817993, + "num_tokens": 321637030.0, + "step": 2698 + }, + { + "entropy": 0.6366459429264069, + "epoch": 6.148274878813801, + "grad_norm": 0.57421875, + "learning_rate": 1.770571841374213e-06, + "loss": 0.0207, + "mean_token_accuracy": 0.9944822415709496, + "num_tokens": 321756474.0, + "step": 2699 + }, + { + "entropy": 0.6408012062311172, + "epoch": 6.150556030795552, + "grad_norm": 0.5234375, + "learning_rate": 1.7687705521097954e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.993996761739254, + "num_tokens": 321876181.0, + "step": 2700 + }, + { + "entropy": 0.6390322968363762, + "epoch": 6.1528371827773025, + "grad_norm": 0.71484375, + "learning_rate": 1.766969677877094e-06, + "loss": 0.0147, + "mean_token_accuracy": 0.9968041330575943, + "num_tokens": 321995895.0, + "step": 2701 + }, + { + "entropy": 0.6375428065657616, + "epoch": 6.155118334759053, + "grad_norm": 0.458984375, + "learning_rate": 1.7651692196982517e-06, + "loss": 0.0122, + "mean_token_accuracy": 0.994930662214756, + "num_tokens": 322115043.0, + "step": 2702 + }, + { + "entropy": 0.6344108358025551, + "epoch": 6.157399486740804, + "grad_norm": 0.65625, + "learning_rate": 1.7633691785951746e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.995151937007904, + "num_tokens": 322234160.0, + "step": 2703 + }, + { + "entropy": 0.6392261236906052, + "epoch": 6.159680638722555, + "grad_norm": 0.64453125, + "learning_rate": 1.7615695555895296e-06, + "loss": 0.0226, + "mean_token_accuracy": 0.9920416846871376, + "num_tokens": 322353709.0, + "step": 2704 + }, + { + "entropy": 0.6393696889281273, + "epoch": 6.161961790704305, + "grad_norm": 0.404296875, + "learning_rate": 1.7597703517027491e-06, + "loss": 0.0143, + "mean_token_accuracy": 0.9950105920433998, + "num_tokens": 322472698.0, + "step": 2705 + }, + { + "entropy": 0.6333466991782188, + "epoch": 6.164242942686056, + "grad_norm": 0.498046875, + "learning_rate": 1.7579715679560273e-06, + "loss": 0.014, + "mean_token_accuracy": 0.9945386424660683, + "num_tokens": 322591903.0, + "step": 2706 + }, + { + "entropy": 0.6339712217450142, + "epoch": 6.166524094667807, + "grad_norm": 0.55078125, + "learning_rate": 1.7561732053703174e-06, + "loss": 0.013, + "mean_token_accuracy": 0.9952100738883018, + "num_tokens": 322711700.0, + "step": 2707 + }, + { + "entropy": 0.6401451453566551, + "epoch": 6.168805246649558, + "grad_norm": 0.515625, + "learning_rate": 1.7543752649663354e-06, + "loss": 0.0162, + "mean_token_accuracy": 0.9949452504515648, + "num_tokens": 322831788.0, + "step": 2708 + }, + { + "entropy": 0.6345501765608788, + "epoch": 6.171086398631309, + "grad_norm": 0.39453125, + "learning_rate": 1.7525777477645586e-06, + "loss": 0.0096, + "mean_token_accuracy": 0.9971357583999634, + "num_tokens": 322952042.0, + "step": 2709 + }, + { + "entropy": 0.6357293054461479, + "epoch": 6.17336755061306, + "grad_norm": 0.5859375, + "learning_rate": 1.7507806547852224e-06, + "loss": 0.0247, + "mean_token_accuracy": 0.9932793900370598, + "num_tokens": 323071631.0, + "step": 2710 + }, + { + "entropy": 0.6323801651597023, + "epoch": 6.175648702594811, + "grad_norm": 0.474609375, + "learning_rate": 1.7489839870483236e-06, + "loss": 0.0137, + "mean_token_accuracy": 0.996217668056488, + "num_tokens": 323191335.0, + "step": 2711 + }, + { + "entropy": 0.6405774801969528, + "epoch": 6.177929854576561, + "grad_norm": 0.37890625, + "learning_rate": 1.7471877455736136e-06, + "loss": 0.0132, + "mean_token_accuracy": 0.9960121884942055, + "num_tokens": 323310951.0, + "step": 2712 + }, + { + "entropy": 0.6378917768597603, + "epoch": 6.180211006558312, + "grad_norm": 0.470703125, + "learning_rate": 1.7453919313806057e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.9962176010012627, + "num_tokens": 323430369.0, + "step": 2713 + }, + { + "entropy": 0.6345153748989105, + "epoch": 6.182492158540063, + "grad_norm": 0.484375, + "learning_rate": 1.7435965454885699e-06, + "loss": 0.0159, + "mean_token_accuracy": 0.9954604879021645, + "num_tokens": 323549554.0, + "step": 2714 + }, + { + "entropy": 0.6387321129441261, + "epoch": 6.1847733105218134, + "grad_norm": 0.4765625, + "learning_rate": 1.7418015889165312e-06, + "loss": 0.0105, + "mean_token_accuracy": 0.9961895942687988, + "num_tokens": 323669925.0, + "step": 2715 + }, + { + "entropy": 0.642352744936943, + "epoch": 6.187054462503564, + "grad_norm": 0.486328125, + "learning_rate": 1.7400070626832732e-06, + "loss": 0.0161, + "mean_token_accuracy": 0.9953032359480858, + "num_tokens": 323789324.0, + "step": 2716 + }, + { + "entropy": 0.6413306370377541, + "epoch": 6.189335614485315, + "grad_norm": 0.58203125, + "learning_rate": 1.7382129678073351e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9942976161837578, + "num_tokens": 323908864.0, + "step": 2717 + }, + { + "entropy": 0.6368302330374718, + "epoch": 6.191616766467066, + "grad_norm": 0.55859375, + "learning_rate": 1.7364193053070082e-06, + "loss": 0.0144, + "mean_token_accuracy": 0.9950887337327003, + "num_tokens": 324027578.0, + "step": 2718 + }, + { + "entropy": 0.6369655281305313, + "epoch": 6.193897918448816, + "grad_norm": 0.4609375, + "learning_rate": 1.7346260762003428e-06, + "loss": 0.0136, + "mean_token_accuracy": 0.99490125477314, + "num_tokens": 324146729.0, + "step": 2719 + }, + { + "entropy": 0.6391453370451927, + "epoch": 6.196179070430567, + "grad_norm": 0.53515625, + "learning_rate": 1.7328332815051403e-06, + "loss": 0.0197, + "mean_token_accuracy": 0.9923267140984535, + "num_tokens": 324266497.0, + "step": 2720 + }, + { + "entropy": 0.6361438259482384, + "epoch": 6.198460222412319, + "grad_norm": 0.458984375, + "learning_rate": 1.7310409222389563e-06, + "loss": 0.0149, + "mean_token_accuracy": 0.9937867075204849, + "num_tokens": 324385962.0, + "step": 2721 + }, + { + "entropy": 0.6383626610040665, + "epoch": 6.200741374394069, + "grad_norm": 0.52734375, + "learning_rate": 1.7292489994191005e-06, + "loss": 0.0173, + "mean_token_accuracy": 0.994943268597126, + "num_tokens": 324505535.0, + "step": 2722 + }, + { + "entropy": 0.6408745646476746, + "epoch": 6.20302252637582, + "grad_norm": 0.6328125, + "learning_rate": 1.7274575140626318e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9938058257102966, + "num_tokens": 324625262.0, + "step": 2723 + }, + { + "entropy": 0.6343057826161385, + "epoch": 6.205303678357571, + "grad_norm": 0.5, + "learning_rate": 1.7256664671863634e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9950433447957039, + "num_tokens": 324745126.0, + "step": 2724 + }, + { + "entropy": 0.6365067288279533, + "epoch": 6.2075848303393215, + "grad_norm": 0.546875, + "learning_rate": 1.72387585980686e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9961476475000381, + "num_tokens": 324864199.0, + "step": 2725 + }, + { + "entropy": 0.635585330426693, + "epoch": 6.209865982321072, + "grad_norm": 0.41796875, + "learning_rate": 1.7220856929404342e-06, + "loss": 0.0103, + "mean_token_accuracy": 0.9963839650154114, + "num_tokens": 324983541.0, + "step": 2726 + }, + { + "entropy": 0.6419303640723228, + "epoch": 6.212147134302823, + "grad_norm": 0.61328125, + "learning_rate": 1.720295967603152e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.9922996312379837, + "num_tokens": 325102966.0, + "step": 2727 + }, + { + "entropy": 0.6400644034147263, + "epoch": 6.214428286284574, + "grad_norm": 0.498046875, + "learning_rate": 1.7185066848108244e-06, + "loss": 0.012, + "mean_token_accuracy": 0.9964170753955841, + "num_tokens": 325222367.0, + "step": 2728 + }, + { + "entropy": 0.6386188492178917, + "epoch": 6.216709438266324, + "grad_norm": 0.578125, + "learning_rate": 1.7167178455790157e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.9948132634162903, + "num_tokens": 325342083.0, + "step": 2729 + }, + { + "entropy": 0.6393946558237076, + "epoch": 6.218990590248075, + "grad_norm": 0.431640625, + "learning_rate": 1.7149294509230357e-06, + "loss": 0.009, + "mean_token_accuracy": 0.9975345581769943, + "num_tokens": 325461237.0, + "step": 2730 + }, + { + "entropy": 0.6369836330413818, + "epoch": 6.221271742229826, + "grad_norm": 0.421875, + "learning_rate": 1.713141501857943e-06, + "loss": 0.0133, + "mean_token_accuracy": 0.9958923161029816, + "num_tokens": 325580883.0, + "step": 2731 + }, + { + "entropy": 0.6372670903801918, + "epoch": 6.2235528942115765, + "grad_norm": 0.6796875, + "learning_rate": 1.7113539993985431e-06, + "loss": 0.0205, + "mean_token_accuracy": 0.9928383976221085, + "num_tokens": 325699675.0, + "step": 2732 + }, + { + "entropy": 0.6333255469799042, + "epoch": 6.225834046193327, + "grad_norm": 0.439453125, + "learning_rate": 1.7095669445593887e-06, + "loss": 0.0148, + "mean_token_accuracy": 0.9955977946519852, + "num_tokens": 325819218.0, + "step": 2733 + }, + { + "entropy": 0.6328725591301918, + "epoch": 6.228115198175079, + "grad_norm": 0.455078125, + "learning_rate": 1.707780338354776e-06, + "loss": 0.0175, + "mean_token_accuracy": 0.9942411333322525, + "num_tokens": 325938770.0, + "step": 2734 + }, + { + "entropy": 0.6405331864953041, + "epoch": 6.23039635015683, + "grad_norm": 0.5546875, + "learning_rate": 1.7059941817987485e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9943485260009766, + "num_tokens": 326058307.0, + "step": 2735 + }, + { + "entropy": 0.6413170099258423, + "epoch": 6.23267750213858, + "grad_norm": 0.498046875, + "learning_rate": 1.7042084759050948e-06, + "loss": 0.0168, + "mean_token_accuracy": 0.9947510287165642, + "num_tokens": 326177761.0, + "step": 2736 + }, + { + "entropy": 0.6401316374540329, + "epoch": 6.234958654120331, + "grad_norm": 0.54296875, + "learning_rate": 1.7024232216873465e-06, + "loss": 0.0184, + "mean_token_accuracy": 0.9931034669280052, + "num_tokens": 326297480.0, + "step": 2737 + }, + { + "entropy": 0.633489690721035, + "epoch": 6.237239806102082, + "grad_norm": 0.482421875, + "learning_rate": 1.7006384201587809e-06, + "loss": 0.0219, + "mean_token_accuracy": 0.9939722865819931, + "num_tokens": 326416966.0, + "step": 2738 + }, + { + "entropy": 0.6412429288029671, + "epoch": 6.2395209580838324, + "grad_norm": 0.625, + "learning_rate": 1.6988540723324145e-06, + "loss": 0.0206, + "mean_token_accuracy": 0.9947263449430466, + "num_tokens": 326536605.0, + "step": 2739 + }, + { + "entropy": 0.63918237388134, + "epoch": 6.241802110065583, + "grad_norm": 0.6171875, + "learning_rate": 1.6970701792210101e-06, + "loss": 0.0225, + "mean_token_accuracy": 0.9949327409267426, + "num_tokens": 326656542.0, + "step": 2740 + }, + { + "entropy": 0.6367617323994637, + "epoch": 6.244083262047334, + "grad_norm": 0.66015625, + "learning_rate": 1.6952867418370707e-06, + "loss": 0.018, + "mean_token_accuracy": 0.9937827214598656, + "num_tokens": 326776248.0, + "step": 2741 + }, + { + "entropy": 0.6312263906002045, + "epoch": 6.246364414029085, + "grad_norm": 0.51953125, + "learning_rate": 1.6935037611928412e-06, + "loss": 0.0103, + "mean_token_accuracy": 0.9960389584302902, + "num_tokens": 326895153.0, + "step": 2742 + }, + { + "entropy": 0.6344048529863358, + "epoch": 6.248645566010835, + "grad_norm": 0.408203125, + "learning_rate": 1.691721238300308e-06, + "loss": 0.0089, + "mean_token_accuracy": 0.9962317943572998, + "num_tokens": 327014155.0, + "step": 2743 + }, + { + "entropy": 0.6429111659526825, + "epoch": 6.250926717992586, + "grad_norm": 0.609375, + "learning_rate": 1.689939174171194e-06, + "loss": 0.0231, + "mean_token_accuracy": 0.9927534610033035, + "num_tokens": 327133399.0, + "step": 2744 + }, + { + "entropy": 0.635049119591713, + "epoch": 6.253207869974337, + "grad_norm": 0.65625, + "learning_rate": 1.6881575698169662e-06, + "loss": 0.0179, + "mean_token_accuracy": 0.994601659476757, + "num_tokens": 327253524.0, + "step": 2745 + }, + { + "entropy": 0.6397652477025986, + "epoch": 6.2554890219560875, + "grad_norm": 0.6015625, + "learning_rate": 1.6863764262488292e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9945103153586388, + "num_tokens": 327373194.0, + "step": 2746 + }, + { + "entropy": 0.6406799331307411, + "epoch": 6.257770173937839, + "grad_norm": 0.462890625, + "learning_rate": 1.6845957444777244e-06, + "loss": 0.0086, + "mean_token_accuracy": 0.9972309023141861, + "num_tokens": 327493000.0, + "step": 2747 + }, + { + "entropy": 0.631546750664711, + "epoch": 6.26005132591959, + "grad_norm": 0.6171875, + "learning_rate": 1.6828155255143331e-06, + "loss": 0.0159, + "mean_token_accuracy": 0.9945584237575531, + "num_tokens": 327612186.0, + "step": 2748 + }, + { + "entropy": 0.6359046250581741, + "epoch": 6.2623324779013405, + "grad_norm": 0.78125, + "learning_rate": 1.6810357703690739e-06, + "loss": 0.0255, + "mean_token_accuracy": 0.9922871440649033, + "num_tokens": 327731648.0, + "step": 2749 + }, + { + "entropy": 0.6412780657410622, + "epoch": 6.264613629883091, + "grad_norm": 0.51953125, + "learning_rate": 1.6792564800521e-06, + "loss": 0.0165, + "mean_token_accuracy": 0.9952049255371094, + "num_tokens": 327850760.0, + "step": 2750 + }, + { + "entropy": 0.6326613798737526, + "epoch": 6.266894781864842, + "grad_norm": 0.51953125, + "learning_rate": 1.677477655573303e-06, + "loss": 0.015, + "mean_token_accuracy": 0.9944530501961708, + "num_tokens": 327969876.0, + "step": 2751 + }, + { + "entropy": 0.6401064693927765, + "epoch": 6.269175933846593, + "grad_norm": 0.52734375, + "learning_rate": 1.675699297942309e-06, + "loss": 0.0146, + "mean_token_accuracy": 0.9953096881508827, + "num_tokens": 328088867.0, + "step": 2752 + }, + { + "entropy": 0.6371389627456665, + "epoch": 6.271457085828343, + "grad_norm": 0.6328125, + "learning_rate": 1.6739214081684799e-06, + "loss": 0.0232, + "mean_token_accuracy": 0.9940144866704941, + "num_tokens": 328208480.0, + "step": 2753 + }, + { + "entropy": 0.6418630555272102, + "epoch": 6.273738237810094, + "grad_norm": 0.6171875, + "learning_rate": 1.6721439872609125e-06, + "loss": 0.0214, + "mean_token_accuracy": 0.993518516421318, + "num_tokens": 328328551.0, + "step": 2754 + }, + { + "entropy": 0.6379042267799377, + "epoch": 6.276019389791845, + "grad_norm": 0.47265625, + "learning_rate": 1.6703670362284346e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.9959050416946411, + "num_tokens": 328447776.0, + "step": 2755 + }, + { + "entropy": 0.6401774361729622, + "epoch": 6.2783005417735955, + "grad_norm": 0.6015625, + "learning_rate": 1.6685905560796101e-06, + "loss": 0.019, + "mean_token_accuracy": 0.9952052161097527, + "num_tokens": 328567012.0, + "step": 2756 + }, + { + "entropy": 0.6361541897058487, + "epoch": 6.280581693755346, + "grad_norm": 0.49609375, + "learning_rate": 1.6668145478227354e-06, + "loss": 0.014, + "mean_token_accuracy": 0.9944012239575386, + "num_tokens": 328685873.0, + "step": 2757 + }, + { + "entropy": 0.6417081952095032, + "epoch": 6.282862845737097, + "grad_norm": 0.443359375, + "learning_rate": 1.6650390124658378e-06, + "loss": 0.0121, + "mean_token_accuracy": 0.9954908117651939, + "num_tokens": 328805109.0, + "step": 2758 + }, + { + "entropy": 0.6405195444822311, + "epoch": 6.285143997718848, + "grad_norm": 0.458984375, + "learning_rate": 1.663263951016678e-06, + "loss": 0.0123, + "mean_token_accuracy": 0.995763435959816, + "num_tokens": 328924168.0, + "step": 2759 + }, + { + "entropy": 0.6348528787493706, + "epoch": 6.287425149700598, + "grad_norm": 0.6328125, + "learning_rate": 1.661489364482745e-06, + "loss": 0.0196, + "mean_token_accuracy": 0.9951487705111504, + "num_tokens": 329043448.0, + "step": 2760 + }, + { + "entropy": 0.6410601288080215, + "epoch": 6.289706301682349, + "grad_norm": 0.515625, + "learning_rate": 1.6597152538712608e-06, + "loss": 0.0133, + "mean_token_accuracy": 0.9959833323955536, + "num_tokens": 329163237.0, + "step": 2761 + }, + { + "entropy": 0.6389107331633568, + "epoch": 6.291987453664101, + "grad_norm": 0.52734375, + "learning_rate": 1.6579416201891757e-06, + "loss": 0.0146, + "mean_token_accuracy": 0.9948684349656105, + "num_tokens": 329283210.0, + "step": 2762 + }, + { + "entropy": 0.6372180134057999, + "epoch": 6.2942686056458514, + "grad_norm": 0.71484375, + "learning_rate": 1.6561684644431709e-06, + "loss": 0.0206, + "mean_token_accuracy": 0.9933885037899017, + "num_tokens": 329402114.0, + "step": 2763 + }, + { + "entropy": 0.6374088227748871, + "epoch": 6.296549757627602, + "grad_norm": 0.50390625, + "learning_rate": 1.6543957876396544e-06, + "loss": 0.0147, + "mean_token_accuracy": 0.9953914657235146, + "num_tokens": 329521406.0, + "step": 2764 + }, + { + "entropy": 0.6396094784140587, + "epoch": 6.298830909609353, + "grad_norm": 0.50390625, + "learning_rate": 1.6526235907847649e-06, + "loss": 0.0137, + "mean_token_accuracy": 0.9947836473584175, + "num_tokens": 329641054.0, + "step": 2765 + }, + { + "entropy": 0.6401033252477646, + "epoch": 6.301112061591104, + "grad_norm": 0.65234375, + "learning_rate": 1.6508518748843651e-06, + "loss": 0.0197, + "mean_token_accuracy": 0.9951796904206276, + "num_tokens": 329760696.0, + "step": 2766 + }, + { + "entropy": 0.6405196115374565, + "epoch": 6.303393213572854, + "grad_norm": 0.5, + "learning_rate": 1.649080640944048e-06, + "loss": 0.0145, + "mean_token_accuracy": 0.9959066659212112, + "num_tokens": 329881256.0, + "step": 2767 + }, + { + "entropy": 0.6404917240142822, + "epoch": 6.305674365554605, + "grad_norm": 0.51953125, + "learning_rate": 1.6473098899691313e-06, + "loss": 0.0107, + "mean_token_accuracy": 0.996356762945652, + "num_tokens": 330000076.0, + "step": 2768 + }, + { + "entropy": 0.6385736018419266, + "epoch": 6.307955517536356, + "grad_norm": 0.484375, + "learning_rate": 1.6455396229646595e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9952977225184441, + "num_tokens": 330119696.0, + "step": 2769 + }, + { + "entropy": 0.6378677636384964, + "epoch": 6.3102366695181065, + "grad_norm": 0.51953125, + "learning_rate": 1.6437698409354025e-06, + "loss": 0.0194, + "mean_token_accuracy": 0.9932197034358978, + "num_tokens": 330239258.0, + "step": 2770 + }, + { + "entropy": 0.6334404498338699, + "epoch": 6.312517821499857, + "grad_norm": 0.515625, + "learning_rate": 1.6420005448858522e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9936918914318085, + "num_tokens": 330358195.0, + "step": 2771 + }, + { + "entropy": 0.6351257711648941, + "epoch": 6.314798973481608, + "grad_norm": 0.5546875, + "learning_rate": 1.6402317358202286e-06, + "loss": 0.0181, + "mean_token_accuracy": 0.9943983778357506, + "num_tokens": 330477080.0, + "step": 2772 + }, + { + "entropy": 0.6374962851405144, + "epoch": 6.317080125463359, + "grad_norm": 0.58203125, + "learning_rate": 1.6384634147424732e-06, + "loss": 0.0202, + "mean_token_accuracy": 0.9957917556166649, + "num_tokens": 330596889.0, + "step": 2773 + }, + { + "entropy": 0.6319269388914108, + "epoch": 6.319361277445109, + "grad_norm": 0.68359375, + "learning_rate": 1.636695582656251e-06, + "loss": 0.0214, + "mean_token_accuracy": 0.9931373074650764, + "num_tokens": 330715767.0, + "step": 2774 + }, + { + "entropy": 0.6373465806245804, + "epoch": 6.321642429426861, + "grad_norm": 0.3828125, + "learning_rate": 1.6349282405649506e-06, + "loss": 0.0126, + "mean_token_accuracy": 0.9960980266332626, + "num_tokens": 330835551.0, + "step": 2775 + }, + { + "entropy": 0.6398861780762672, + "epoch": 6.323923581408612, + "grad_norm": 0.42578125, + "learning_rate": 1.6331613894716787e-06, + "loss": 0.0115, + "mean_token_accuracy": 0.995815321803093, + "num_tokens": 330954739.0, + "step": 2776 + }, + { + "entropy": 0.635695144534111, + "epoch": 6.326204733390362, + "grad_norm": 0.5390625, + "learning_rate": 1.6313950303792672e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.9946434423327446, + "num_tokens": 331074286.0, + "step": 2777 + }, + { + "entropy": 0.6362407803535461, + "epoch": 6.328485885372113, + "grad_norm": 0.55078125, + "learning_rate": 1.6296291642902673e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.9953189119696617, + "num_tokens": 331193515.0, + "step": 2778 + }, + { + "entropy": 0.6369274258613586, + "epoch": 6.330767037353864, + "grad_norm": 0.59765625, + "learning_rate": 1.6278637922069512e-06, + "loss": 0.0224, + "mean_token_accuracy": 0.9936073645949364, + "num_tokens": 331312874.0, + "step": 2779 + }, + { + "entropy": 0.6395990401506424, + "epoch": 6.3330481893356145, + "grad_norm": 0.4921875, + "learning_rate": 1.6260989151313091e-06, + "loss": 0.0117, + "mean_token_accuracy": 0.996147632598877, + "num_tokens": 331431642.0, + "step": 2780 + }, + { + "entropy": 0.6347162127494812, + "epoch": 6.335329341317365, + "grad_norm": 0.47265625, + "learning_rate": 1.6243345340650523e-06, + "loss": 0.0168, + "mean_token_accuracy": 0.9932296127080917, + "num_tokens": 331550883.0, + "step": 2781 + }, + { + "entropy": 0.6379182562232018, + "epoch": 6.337610493299116, + "grad_norm": 0.5078125, + "learning_rate": 1.6225706500096079e-06, + "loss": 0.0218, + "mean_token_accuracy": 0.9942666590213776, + "num_tokens": 331670095.0, + "step": 2782 + }, + { + "entropy": 0.637493908405304, + "epoch": 6.339891645280867, + "grad_norm": 0.45703125, + "learning_rate": 1.6208072639661226e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.9948540925979614, + "num_tokens": 331789646.0, + "step": 2783 + }, + { + "entropy": 0.6350652724504471, + "epoch": 6.342172797262617, + "grad_norm": 0.625, + "learning_rate": 1.6190443769354608e-06, + "loss": 0.0197, + "mean_token_accuracy": 0.9942559599876404, + "num_tokens": 331908882.0, + "step": 2784 + }, + { + "entropy": 0.6399040222167969, + "epoch": 6.344453949244368, + "grad_norm": 0.455078125, + "learning_rate": 1.6172819899182036e-06, + "loss": 0.0129, + "mean_token_accuracy": 0.9956949129700661, + "num_tokens": 332028121.0, + "step": 2785 + }, + { + "entropy": 0.6331444382667542, + "epoch": 6.346735101226119, + "grad_norm": 0.46484375, + "learning_rate": 1.6155201039146478e-06, + "loss": 0.0128, + "mean_token_accuracy": 0.9966537207365036, + "num_tokens": 332147594.0, + "step": 2786 + }, + { + "entropy": 0.6362572461366653, + "epoch": 6.3490162532078696, + "grad_norm": 0.52734375, + "learning_rate": 1.613758719924805e-06, + "loss": 0.0216, + "mean_token_accuracy": 0.9933521673083305, + "num_tokens": 332266661.0, + "step": 2787 + }, + { + "entropy": 0.6387243941426277, + "epoch": 6.351297405189621, + "grad_norm": 0.53515625, + "learning_rate": 1.611997838948403e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9947771355509758, + "num_tokens": 332387124.0, + "step": 2788 + }, + { + "entropy": 0.6356546655297279, + "epoch": 6.353578557171372, + "grad_norm": 0.470703125, + "learning_rate": 1.6102374619848845e-06, + "loss": 0.0146, + "mean_token_accuracy": 0.9959160313010216, + "num_tokens": 332506508.0, + "step": 2789 + }, + { + "entropy": 0.6332424283027649, + "epoch": 6.355859709153123, + "grad_norm": 0.5546875, + "learning_rate": 1.6084775900334046e-06, + "loss": 0.01, + "mean_token_accuracy": 0.996816910803318, + "num_tokens": 332625613.0, + "step": 2790 + }, + { + "entropy": 0.6329077705740929, + "epoch": 6.358140861134873, + "grad_norm": 0.52734375, + "learning_rate": 1.6067182240928332e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9948827624320984, + "num_tokens": 332745012.0, + "step": 2791 + }, + { + "entropy": 0.6312092170119286, + "epoch": 6.360422013116624, + "grad_norm": 0.55078125, + "learning_rate": 1.6049593651617534e-06, + "loss": 0.0177, + "mean_token_accuracy": 0.9936445951461792, + "num_tokens": 332864163.0, + "step": 2792 + }, + { + "entropy": 0.6364886686205864, + "epoch": 6.362703165098375, + "grad_norm": 0.58203125, + "learning_rate": 1.6032010142384572e-06, + "loss": 0.0243, + "mean_token_accuracy": 0.9904529005289078, + "num_tokens": 332983591.0, + "step": 2793 + }, + { + "entropy": 0.6368608325719833, + "epoch": 6.3649843170801255, + "grad_norm": 0.451171875, + "learning_rate": 1.6014431723209522e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.9956449940800667, + "num_tokens": 333102859.0, + "step": 2794 + }, + { + "entropy": 0.6331729739904404, + "epoch": 6.367265469061876, + "grad_norm": 0.47265625, + "learning_rate": 1.599685840406955e-06, + "loss": 0.0188, + "mean_token_accuracy": 0.9936198964715004, + "num_tokens": 333222663.0, + "step": 2795 + }, + { + "entropy": 0.6384290158748627, + "epoch": 6.369546621043627, + "grad_norm": 0.6171875, + "learning_rate": 1.5979290194938938e-06, + "loss": 0.0224, + "mean_token_accuracy": 0.9932801797986031, + "num_tokens": 333341481.0, + "step": 2796 + }, + { + "entropy": 0.6401607021689415, + "epoch": 6.371827773025378, + "grad_norm": 0.59765625, + "learning_rate": 1.5961727105789072e-06, + "loss": 0.0141, + "mean_token_accuracy": 0.9957045316696167, + "num_tokens": 333460729.0, + "step": 2797 + }, + { + "entropy": 0.6398200914263725, + "epoch": 6.374108925007128, + "grad_norm": 0.458984375, + "learning_rate": 1.5944169146588395e-06, + "loss": 0.0135, + "mean_token_accuracy": 0.9958650544285774, + "num_tokens": 333580049.0, + "step": 2798 + }, + { + "entropy": 0.6367319002747536, + "epoch": 6.376390076988879, + "grad_norm": 0.59375, + "learning_rate": 1.5926616327302482e-06, + "loss": 0.0177, + "mean_token_accuracy": 0.9936634451150894, + "num_tokens": 333699231.0, + "step": 2799 + }, + { + "entropy": 0.6340999081730843, + "epoch": 6.37867122897063, + "grad_norm": 0.58984375, + "learning_rate": 1.5909068657893978e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.9950465708971024, + "num_tokens": 333818720.0, + "step": 2800 + }, + { + "entropy": 0.6369842663407326, + "epoch": 6.380952380952381, + "grad_norm": 0.5, + "learning_rate": 1.5891526148322594e-06, + "loss": 0.0129, + "mean_token_accuracy": 0.9955877438187599, + "num_tokens": 333938324.0, + "step": 2801 + }, + { + "entropy": 0.6359204947948456, + "epoch": 6.383233532934132, + "grad_norm": 0.474609375, + "learning_rate": 1.5873988808545127e-06, + "loss": 0.0124, + "mean_token_accuracy": 0.9952408894896507, + "num_tokens": 334057882.0, + "step": 2802 + }, + { + "entropy": 0.6345510333776474, + "epoch": 6.385514684915883, + "grad_norm": 0.490234375, + "learning_rate": 1.5856456648515425e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.9952908232808113, + "num_tokens": 334177605.0, + "step": 2803 + }, + { + "entropy": 0.63844183832407, + "epoch": 6.3877958368976335, + "grad_norm": 0.419921875, + "learning_rate": 1.5838929678184405e-06, + "loss": 0.0153, + "mean_token_accuracy": 0.9962063804268837, + "num_tokens": 334295932.0, + "step": 2804 + }, + { + "entropy": 0.6356644108891487, + "epoch": 6.390076988879384, + "grad_norm": 0.5234375, + "learning_rate": 1.5821407907500036e-06, + "loss": 0.0134, + "mean_token_accuracy": 0.9942361041903496, + "num_tokens": 334416241.0, + "step": 2805 + }, + { + "entropy": 0.6427603214979172, + "epoch": 6.392358140861135, + "grad_norm": 0.625, + "learning_rate": 1.5803891346407342e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9939723983407021, + "num_tokens": 334535824.0, + "step": 2806 + }, + { + "entropy": 0.6337458044290543, + "epoch": 6.394639292842886, + "grad_norm": 0.37890625, + "learning_rate": 1.5786380004848379e-06, + "loss": 0.0129, + "mean_token_accuracy": 0.9963848367333412, + "num_tokens": 334655232.0, + "step": 2807 + }, + { + "entropy": 0.6360874846577644, + "epoch": 6.396920444824636, + "grad_norm": 0.486328125, + "learning_rate": 1.576887389276226e-06, + "loss": 0.0145, + "mean_token_accuracy": 0.9956512525677681, + "num_tokens": 334774776.0, + "step": 2808 + }, + { + "entropy": 0.6375115290284157, + "epoch": 6.399201596806387, + "grad_norm": 0.69921875, + "learning_rate": 1.5751373020085093e-06, + "loss": 0.0232, + "mean_token_accuracy": 0.9938253238797188, + "num_tokens": 334894231.0, + "step": 2809 + }, + { + "entropy": 0.6303799226880074, + "epoch": 6.401482748788138, + "grad_norm": 0.5546875, + "learning_rate": 1.5733877396750051e-06, + "loss": 0.0177, + "mean_token_accuracy": 0.9929120689630508, + "num_tokens": 335013023.0, + "step": 2810 + }, + { + "entropy": 0.6354173421859741, + "epoch": 6.4037639007698886, + "grad_norm": 0.5625, + "learning_rate": 1.5716387032687314e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9938091710209846, + "num_tokens": 335131967.0, + "step": 2811 + }, + { + "entropy": 0.6338339000940323, + "epoch": 6.406045052751639, + "grad_norm": 0.53515625, + "learning_rate": 1.5698901937824066e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9958818256855011, + "num_tokens": 335251372.0, + "step": 2812 + }, + { + "entropy": 0.6370807141065598, + "epoch": 6.40832620473339, + "grad_norm": 0.5078125, + "learning_rate": 1.5681422122084522e-06, + "loss": 0.018, + "mean_token_accuracy": 0.9940342232584953, + "num_tokens": 335371980.0, + "step": 2813 + }, + { + "entropy": 0.6344561949372292, + "epoch": 6.410607356715142, + "grad_norm": 0.55078125, + "learning_rate": 1.5663947595389873e-06, + "loss": 0.0165, + "mean_token_accuracy": 0.9956442639231682, + "num_tokens": 335491811.0, + "step": 2814 + }, + { + "entropy": 0.6376986056566238, + "epoch": 6.412888508696892, + "grad_norm": 0.6015625, + "learning_rate": 1.5646478367658325e-06, + "loss": 0.0192, + "mean_token_accuracy": 0.9951124340295792, + "num_tokens": 335610684.0, + "step": 2815 + }, + { + "entropy": 0.6328012049198151, + "epoch": 6.415169660678643, + "grad_norm": 0.5703125, + "learning_rate": 1.562901444880508e-06, + "loss": 0.0159, + "mean_token_accuracy": 0.995036818087101, + "num_tokens": 335730074.0, + "step": 2816 + }, + { + "entropy": 0.6408514752984047, + "epoch": 6.417450812660394, + "grad_norm": 0.50390625, + "learning_rate": 1.5611555848742318e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9947736337780952, + "num_tokens": 335849586.0, + "step": 2817 + }, + { + "entropy": 0.6371811851859093, + "epoch": 6.4197319646421445, + "grad_norm": 0.376953125, + "learning_rate": 1.5594102577379216e-06, + "loss": 0.0105, + "mean_token_accuracy": 0.997013621032238, + "num_tokens": 335968655.0, + "step": 2818 + }, + { + "entropy": 0.6389383301138878, + "epoch": 6.422013116623895, + "grad_norm": 0.447265625, + "learning_rate": 1.5576654644621897e-06, + "loss": 0.0157, + "mean_token_accuracy": 0.9950008317828178, + "num_tokens": 336087810.0, + "step": 2819 + }, + { + "entropy": 0.636287085711956, + "epoch": 6.424294268605646, + "grad_norm": 0.49609375, + "learning_rate": 1.5559212060373474e-06, + "loss": 0.0128, + "mean_token_accuracy": 0.9959000945091248, + "num_tokens": 336206965.0, + "step": 2820 + }, + { + "entropy": 0.6327490657567978, + "epoch": 6.426575420587397, + "grad_norm": 0.4921875, + "learning_rate": 1.5541774834534024e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9954334944486618, + "num_tokens": 336326319.0, + "step": 2821 + }, + { + "entropy": 0.6360858157277107, + "epoch": 6.428856572569147, + "grad_norm": 0.55859375, + "learning_rate": 1.5524342977000587e-06, + "loss": 0.0182, + "mean_token_accuracy": 0.9935713931918144, + "num_tokens": 336446575.0, + "step": 2822 + }, + { + "entropy": 0.6382481753826141, + "epoch": 6.431137724550898, + "grad_norm": 0.55078125, + "learning_rate": 1.5506916497667134e-06, + "loss": 0.0243, + "mean_token_accuracy": 0.9936537593603134, + "num_tokens": 336566046.0, + "step": 2823 + }, + { + "entropy": 0.6304898485541344, + "epoch": 6.433418876532649, + "grad_norm": 0.5234375, + "learning_rate": 1.5489495406424618e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9949219226837158, + "num_tokens": 336685715.0, + "step": 2824 + }, + { + "entropy": 0.636378787457943, + "epoch": 6.4357000285143995, + "grad_norm": 0.5625, + "learning_rate": 1.5472079713160892e-06, + "loss": 0.0184, + "mean_token_accuracy": 0.9946956858038902, + "num_tokens": 336806595.0, + "step": 2825 + }, + { + "entropy": 0.6355533376336098, + "epoch": 6.43798118049615, + "grad_norm": 0.609375, + "learning_rate": 1.5454669427760774e-06, + "loss": 0.013, + "mean_token_accuracy": 0.9959815219044685, + "num_tokens": 336925260.0, + "step": 2826 + }, + { + "entropy": 0.6369712650775909, + "epoch": 6.440262332477902, + "grad_norm": 0.53515625, + "learning_rate": 1.5437264560106014e-06, + "loss": 0.0202, + "mean_token_accuracy": 0.9934073761105537, + "num_tokens": 337044665.0, + "step": 2827 + }, + { + "entropy": 0.6305984035134315, + "epoch": 6.4425434844596525, + "grad_norm": 0.40625, + "learning_rate": 1.5419865120075267e-06, + "loss": 0.0119, + "mean_token_accuracy": 0.9962793067097664, + "num_tokens": 337164032.0, + "step": 2828 + }, + { + "entropy": 0.6375952735543251, + "epoch": 6.444824636441403, + "grad_norm": 0.5390625, + "learning_rate": 1.5402471117544143e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.9945670962333679, + "num_tokens": 337282977.0, + "step": 2829 + }, + { + "entropy": 0.640318751335144, + "epoch": 6.447105788423154, + "grad_norm": 0.55859375, + "learning_rate": 1.5385082562385112e-06, + "loss": 0.0175, + "mean_token_accuracy": 0.9930700808763504, + "num_tokens": 337402883.0, + "step": 2830 + }, + { + "entropy": 0.6332748308777809, + "epoch": 6.449386940404905, + "grad_norm": 0.515625, + "learning_rate": 1.5367699464467596e-06, + "loss": 0.0144, + "mean_token_accuracy": 0.9950980022549629, + "num_tokens": 337522105.0, + "step": 2831 + }, + { + "entropy": 0.6393290087580681, + "epoch": 6.451668092386655, + "grad_norm": 0.578125, + "learning_rate": 1.5350321833657904e-06, + "loss": 0.0179, + "mean_token_accuracy": 0.994612991809845, + "num_tokens": 337641190.0, + "step": 2832 + }, + { + "entropy": 0.630260169506073, + "epoch": 6.453949244368406, + "grad_norm": 0.515625, + "learning_rate": 1.5332949679819251e-06, + "loss": 0.0201, + "mean_token_accuracy": 0.9936431795358658, + "num_tokens": 337760308.0, + "step": 2833 + }, + { + "entropy": 0.6463821455836296, + "epoch": 6.456230396350157, + "grad_norm": 0.462890625, + "learning_rate": 1.531558301281173e-06, + "loss": 0.0137, + "mean_token_accuracy": 0.9948678836226463, + "num_tokens": 337879988.0, + "step": 2834 + }, + { + "entropy": 0.634931892156601, + "epoch": 6.4585115483319075, + "grad_norm": 0.625, + "learning_rate": 1.5298221842492328e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.9937450587749481, + "num_tokens": 337998697.0, + "step": 2835 + }, + { + "entropy": 0.6372755467891693, + "epoch": 6.460792700313658, + "grad_norm": 0.79296875, + "learning_rate": 1.5280866178714898e-06, + "loss": 0.0217, + "mean_token_accuracy": 0.9936321824789047, + "num_tokens": 338119058.0, + "step": 2836 + }, + { + "entropy": 0.6375229060649872, + "epoch": 6.463073852295409, + "grad_norm": 0.47265625, + "learning_rate": 1.5263516031330195e-06, + "loss": 0.0159, + "mean_token_accuracy": 0.9945136457681656, + "num_tokens": 338238594.0, + "step": 2837 + }, + { + "entropy": 0.6394161209464073, + "epoch": 6.46535500427716, + "grad_norm": 0.73828125, + "learning_rate": 1.524617141018582e-06, + "loss": 0.0193, + "mean_token_accuracy": 0.9938574209809303, + "num_tokens": 338357495.0, + "step": 2838 + }, + { + "entropy": 0.6349657475948334, + "epoch": 6.46763615625891, + "grad_norm": 0.54296875, + "learning_rate": 1.5228832325126248e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.994013749063015, + "num_tokens": 338477077.0, + "step": 2839 + }, + { + "entropy": 0.6317748948931694, + "epoch": 6.469917308240661, + "grad_norm": 0.58984375, + "learning_rate": 1.5211498785992818e-06, + "loss": 0.0182, + "mean_token_accuracy": 0.9945243671536446, + "num_tokens": 338596036.0, + "step": 2840 + }, + { + "entropy": 0.6343971565365791, + "epoch": 6.472198460222412, + "grad_norm": 0.58984375, + "learning_rate": 1.5194170802623692e-06, + "loss": 0.0203, + "mean_token_accuracy": 0.9924870729446411, + "num_tokens": 338715178.0, + "step": 2841 + }, + { + "entropy": 0.6434510797262192, + "epoch": 6.4744796122041635, + "grad_norm": 0.5703125, + "learning_rate": 1.5176848384853913e-06, + "loss": 0.0171, + "mean_token_accuracy": 0.9945583641529083, + "num_tokens": 338834230.0, + "step": 2842 + }, + { + "entropy": 0.6352192386984825, + "epoch": 6.476760764185914, + "grad_norm": 0.478515625, + "learning_rate": 1.515953154251535e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9938512593507767, + "num_tokens": 338953203.0, + "step": 2843 + }, + { + "entropy": 0.6338879093527794, + "epoch": 6.479041916167665, + "grad_norm": 0.482421875, + "learning_rate": 1.5142220285436701e-06, + "loss": 0.0153, + "mean_token_accuracy": 0.9947274178266525, + "num_tokens": 339072182.0, + "step": 2844 + }, + { + "entropy": 0.6350093334913254, + "epoch": 6.481323068149416, + "grad_norm": 0.6640625, + "learning_rate": 1.512491462344351e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.992927111685276, + "num_tokens": 339191354.0, + "step": 2845 + }, + { + "entropy": 0.6414572447538376, + "epoch": 6.483604220131166, + "grad_norm": 0.56640625, + "learning_rate": 1.5107614566358136e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9936225637793541, + "num_tokens": 339311452.0, + "step": 2846 + }, + { + "entropy": 0.637087382376194, + "epoch": 6.485885372112917, + "grad_norm": 0.5625, + "learning_rate": 1.5090320123999746e-06, + "loss": 0.0145, + "mean_token_accuracy": 0.9952613413333893, + "num_tokens": 339429952.0, + "step": 2847 + }, + { + "entropy": 0.6356227546930313, + "epoch": 6.488166524094668, + "grad_norm": 0.38671875, + "learning_rate": 1.5073031306184343e-06, + "loss": 0.0111, + "mean_token_accuracy": 0.9963789060711861, + "num_tokens": 339548857.0, + "step": 2848 + }, + { + "entropy": 0.6347446441650391, + "epoch": 6.4904476760764185, + "grad_norm": 0.52734375, + "learning_rate": 1.5055748122724722e-06, + "loss": 0.0189, + "mean_token_accuracy": 0.9935302436351776, + "num_tokens": 339668285.0, + "step": 2849 + }, + { + "entropy": 0.6307914182543755, + "epoch": 6.492728828058169, + "grad_norm": 0.4921875, + "learning_rate": 1.5038470583430485e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.9946922063827515, + "num_tokens": 339787264.0, + "step": 2850 + }, + { + "entropy": 0.6432394906878471, + "epoch": 6.49500998003992, + "grad_norm": 0.466796875, + "learning_rate": 1.5021198698108038e-06, + "loss": 0.0112, + "mean_token_accuracy": 0.9963484704494476, + "num_tokens": 339906684.0, + "step": 2851 + }, + { + "entropy": 0.6345346868038177, + "epoch": 6.497291132021671, + "grad_norm": 0.451171875, + "learning_rate": 1.5003932476560554e-06, + "loss": 0.0132, + "mean_token_accuracy": 0.9962258189916611, + "num_tokens": 340025345.0, + "step": 2852 + }, + { + "entropy": 0.6328904628753662, + "epoch": 6.499572284003421, + "grad_norm": 0.6484375, + "learning_rate": 1.4986671928588016e-06, + "loss": 0.0213, + "mean_token_accuracy": 0.9928649663925171, + "num_tokens": 340144543.0, + "step": 2853 + }, + { + "entropy": 0.6354740709066391, + "epoch": 6.501853435985172, + "grad_norm": 0.4921875, + "learning_rate": 1.496941706398718e-06, + "loss": 0.0141, + "mean_token_accuracy": 0.9955081418156624, + "num_tokens": 340263945.0, + "step": 2854 + }, + { + "entropy": 0.6393204778432846, + "epoch": 6.504134587966924, + "grad_norm": 0.66796875, + "learning_rate": 1.495216789255156e-06, + "loss": 0.0148, + "mean_token_accuracy": 0.9944110587239265, + "num_tokens": 340382474.0, + "step": 2855 + }, + { + "entropy": 0.6371157839894295, + "epoch": 6.506415739948674, + "grad_norm": 0.5546875, + "learning_rate": 1.4934924424071479e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.9943914785981178, + "num_tokens": 340501364.0, + "step": 2856 + }, + { + "entropy": 0.630872368812561, + "epoch": 6.508696891930425, + "grad_norm": 0.486328125, + "learning_rate": 1.4917686668333975e-06, + "loss": 0.0175, + "mean_token_accuracy": 0.9936394244432449, + "num_tokens": 340620372.0, + "step": 2857 + }, + { + "entropy": 0.6359670609235764, + "epoch": 6.510978043912176, + "grad_norm": 0.455078125, + "learning_rate": 1.4900454635122866e-06, + "loss": 0.0159, + "mean_token_accuracy": 0.9944755658507347, + "num_tokens": 340739280.0, + "step": 2858 + }, + { + "entropy": 0.6315942406654358, + "epoch": 6.5132591958939265, + "grad_norm": 0.57421875, + "learning_rate": 1.4883228334218727e-06, + "loss": 0.0171, + "mean_token_accuracy": 0.994348406791687, + "num_tokens": 340858351.0, + "step": 2859 + }, + { + "entropy": 0.6381858214735985, + "epoch": 6.515540347875677, + "grad_norm": 0.51171875, + "learning_rate": 1.4866007775398874e-06, + "loss": 0.0168, + "mean_token_accuracy": 0.995138131082058, + "num_tokens": 340978175.0, + "step": 2860 + }, + { + "epoch": 6.515540347875677, + "eval_entropy": 0.6359376116397263, + "eval_loss": 0.020573455840349197, + "eval_mean_token_accuracy": 0.9935715511271256, + "eval_num_tokens": 340978175.0, + "eval_runtime": 177.4667, + "eval_samples_per_second": 47.248, + "eval_steps_per_second": 1.482, + "step": 2860 + }, + { + "entropy": 0.6330204755067825, + "epoch": 6.517821499857428, + "grad_norm": 0.6015625, + "learning_rate": 1.4848792968437376e-06, + "loss": 0.0142, + "mean_token_accuracy": 0.995908223092556, + "num_tokens": 341098306.0, + "step": 2861 + }, + { + "entropy": 0.6360952854156494, + "epoch": 6.520102651839179, + "grad_norm": 0.609375, + "learning_rate": 1.4831583923105e-06, + "loss": 0.0163, + "mean_token_accuracy": 0.9933330863714218, + "num_tokens": 341217953.0, + "step": 2862 + }, + { + "entropy": 0.6336257308721542, + "epoch": 6.522383803820929, + "grad_norm": 0.5703125, + "learning_rate": 1.481438064916928e-06, + "loss": 0.016, + "mean_token_accuracy": 0.994759276509285, + "num_tokens": 341337108.0, + "step": 2863 + }, + { + "entropy": 0.633852943778038, + "epoch": 6.52466495580268, + "grad_norm": 0.400390625, + "learning_rate": 1.4797183156394462e-06, + "loss": 0.0127, + "mean_token_accuracy": 0.9959749057888985, + "num_tokens": 341457534.0, + "step": 2864 + }, + { + "entropy": 0.6397223547101021, + "epoch": 6.526946107784431, + "grad_norm": 0.625, + "learning_rate": 1.477999145454152e-06, + "loss": 0.025, + "mean_token_accuracy": 0.9931529834866524, + "num_tokens": 341576840.0, + "step": 2865 + }, + { + "entropy": 0.633032888174057, + "epoch": 6.529227259766182, + "grad_norm": 0.5390625, + "learning_rate": 1.4762805553368115e-06, + "loss": 0.0199, + "mean_token_accuracy": 0.9928446635603905, + "num_tokens": 341695825.0, + "step": 2866 + }, + { + "entropy": 0.638547345995903, + "epoch": 6.531508411747932, + "grad_norm": 0.5546875, + "learning_rate": 1.4745625462628654e-06, + "loss": 0.0172, + "mean_token_accuracy": 0.9936388283967972, + "num_tokens": 341815726.0, + "step": 2867 + }, + { + "entropy": 0.6321535184979439, + "epoch": 6.533789563729684, + "grad_norm": 0.63671875, + "learning_rate": 1.47284511920742e-06, + "loss": 0.0237, + "mean_token_accuracy": 0.9922466352581978, + "num_tokens": 341934654.0, + "step": 2868 + }, + { + "entropy": 0.6334995329380035, + "epoch": 6.536070715711435, + "grad_norm": 0.4921875, + "learning_rate": 1.4711282751452549e-06, + "loss": 0.0203, + "mean_token_accuracy": 0.993796743452549, + "num_tokens": 342053793.0, + "step": 2869 + }, + { + "entropy": 0.6423120722174644, + "epoch": 6.538351867693185, + "grad_norm": 0.5078125, + "learning_rate": 1.4694120150508179e-06, + "loss": 0.0133, + "mean_token_accuracy": 0.9954124465584755, + "num_tokens": 342173477.0, + "step": 2870 + }, + { + "entropy": 0.6352273225784302, + "epoch": 6.540633019674936, + "grad_norm": 0.671875, + "learning_rate": 1.4676963398982248e-06, + "loss": 0.0257, + "mean_token_accuracy": 0.9933101683855057, + "num_tokens": 342293831.0, + "step": 2871 + }, + { + "entropy": 0.6312966570258141, + "epoch": 6.542914171656687, + "grad_norm": 0.494140625, + "learning_rate": 1.4659812506612608e-06, + "loss": 0.0139, + "mean_token_accuracy": 0.995774395763874, + "num_tokens": 342412922.0, + "step": 2872 + }, + { + "entropy": 0.6363515406847, + "epoch": 6.5451953236384375, + "grad_norm": 0.53125, + "learning_rate": 1.4642667483133753e-06, + "loss": 0.0177, + "mean_token_accuracy": 0.9946076646447182, + "num_tokens": 342531752.0, + "step": 2873 + }, + { + "entropy": 0.6359499841928482, + "epoch": 6.547476475620188, + "grad_norm": 0.53515625, + "learning_rate": 1.4625528338276879e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.9944972395896912, + "num_tokens": 342650928.0, + "step": 2874 + }, + { + "entropy": 0.6410178616642952, + "epoch": 6.549757627601939, + "grad_norm": 0.474609375, + "learning_rate": 1.4608395081769833e-06, + "loss": 0.0157, + "mean_token_accuracy": 0.9961149245500565, + "num_tokens": 342770910.0, + "step": 2875 + }, + { + "entropy": 0.6379326954483986, + "epoch": 6.55203877958369, + "grad_norm": 0.52734375, + "learning_rate": 1.4591267723337122e-06, + "loss": 0.0211, + "mean_token_accuracy": 0.9927325174212456, + "num_tokens": 342890657.0, + "step": 2876 + }, + { + "entropy": 0.6361019238829613, + "epoch": 6.55431993156544, + "grad_norm": 0.4921875, + "learning_rate": 1.4574146272699914e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.994524747133255, + "num_tokens": 343010068.0, + "step": 2877 + }, + { + "entropy": 0.637906551361084, + "epoch": 6.556601083547191, + "grad_norm": 0.5859375, + "learning_rate": 1.4557030739575988e-06, + "loss": 0.0194, + "mean_token_accuracy": 0.99283666908741, + "num_tokens": 343129504.0, + "step": 2878 + }, + { + "entropy": 0.6400729566812515, + "epoch": 6.558882235528942, + "grad_norm": 0.57421875, + "learning_rate": 1.4539921133679808e-06, + "loss": 0.0197, + "mean_token_accuracy": 0.9943129792809486, + "num_tokens": 343249319.0, + "step": 2879 + }, + { + "entropy": 0.6378068327903748, + "epoch": 6.5611633875106925, + "grad_norm": 0.498046875, + "learning_rate": 1.4522817464722453e-06, + "loss": 0.0144, + "mean_token_accuracy": 0.994667612016201, + "num_tokens": 343368539.0, + "step": 2880 + }, + { + "entropy": 0.6358176693320274, + "epoch": 6.563444539492444, + "grad_norm": 0.5234375, + "learning_rate": 1.4505719742411644e-06, + "loss": 0.0153, + "mean_token_accuracy": 0.9932379350066185, + "num_tokens": 343487822.0, + "step": 2881 + }, + { + "entropy": 0.6329219713807106, + "epoch": 6.565725691474195, + "grad_norm": 0.546875, + "learning_rate": 1.44886279764517e-06, + "loss": 0.0136, + "mean_token_accuracy": 0.9950562343001366, + "num_tokens": 343606559.0, + "step": 2882 + }, + { + "entropy": 0.6320469900965691, + "epoch": 6.5680068434559455, + "grad_norm": 0.515625, + "learning_rate": 1.4471542176543587e-06, + "loss": 0.0141, + "mean_token_accuracy": 0.9946660473942757, + "num_tokens": 343725269.0, + "step": 2883 + }, + { + "entropy": 0.6352123841643333, + "epoch": 6.570287995437696, + "grad_norm": 0.51953125, + "learning_rate": 1.4454462352384885e-06, + "loss": 0.0142, + "mean_token_accuracy": 0.9956708028912544, + "num_tokens": 343845036.0, + "step": 2884 + }, + { + "entropy": 0.632084310054779, + "epoch": 6.572569147419447, + "grad_norm": 0.466796875, + "learning_rate": 1.4437388513669754e-06, + "loss": 0.0145, + "mean_token_accuracy": 0.9960160553455353, + "num_tokens": 343964423.0, + "step": 2885 + }, + { + "entropy": 0.6341673210263252, + "epoch": 6.574850299401198, + "grad_norm": 0.70703125, + "learning_rate": 1.4420320670088977e-06, + "loss": 0.0227, + "mean_token_accuracy": 0.9926156625151634, + "num_tokens": 344085331.0, + "step": 2886 + }, + { + "entropy": 0.6353250443935394, + "epoch": 6.577131451382948, + "grad_norm": 0.455078125, + "learning_rate": 1.4403258831329947e-06, + "loss": 0.0142, + "mean_token_accuracy": 0.9941630885004997, + "num_tokens": 344204420.0, + "step": 2887 + }, + { + "entropy": 0.6365467235445976, + "epoch": 6.579412603364699, + "grad_norm": 0.474609375, + "learning_rate": 1.4386203007076632e-06, + "loss": 0.012, + "mean_token_accuracy": 0.9966049864888191, + "num_tokens": 344323971.0, + "step": 2888 + }, + { + "entropy": 0.6384789943695068, + "epoch": 6.58169375534645, + "grad_norm": 0.4609375, + "learning_rate": 1.4369153207009573e-06, + "loss": 0.0139, + "mean_token_accuracy": 0.9962736219167709, + "num_tokens": 344443368.0, + "step": 2889 + }, + { + "entropy": 0.6380016133189201, + "epoch": 6.583974907328201, + "grad_norm": 0.4609375, + "learning_rate": 1.4352109440805917e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.9950879663228989, + "num_tokens": 344562836.0, + "step": 2890 + }, + { + "entropy": 0.6398197561502457, + "epoch": 6.586256059309951, + "grad_norm": 0.5859375, + "learning_rate": 1.4335071718139379e-06, + "loss": 0.0168, + "mean_token_accuracy": 0.9950396120548248, + "num_tokens": 344682464.0, + "step": 2891 + }, + { + "entropy": 0.6373016089200974, + "epoch": 6.588537211291702, + "grad_norm": 0.53125, + "learning_rate": 1.4318040048680238e-06, + "loss": 0.0179, + "mean_token_accuracy": 0.9942264929413795, + "num_tokens": 344802362.0, + "step": 2892 + }, + { + "entropy": 0.6350658982992172, + "epoch": 6.590818363273453, + "grad_norm": 0.388671875, + "learning_rate": 1.430101444209535e-06, + "loss": 0.0124, + "mean_token_accuracy": 0.9951676800847054, + "num_tokens": 344922329.0, + "step": 2893 + }, + { + "entropy": 0.6324403956532478, + "epoch": 6.593099515255204, + "grad_norm": 0.546875, + "learning_rate": 1.4283994908048107e-06, + "loss": 0.0216, + "mean_token_accuracy": 0.9950217828154564, + "num_tokens": 345041617.0, + "step": 2894 + }, + { + "entropy": 0.6350469216704369, + "epoch": 6.595380667236955, + "grad_norm": 0.5078125, + "learning_rate": 1.426698145619847e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.995078332722187, + "num_tokens": 345161036.0, + "step": 2895 + }, + { + "entropy": 0.6368085667490959, + "epoch": 6.597661819218706, + "grad_norm": 0.462890625, + "learning_rate": 1.424997409620295e-06, + "loss": 0.0162, + "mean_token_accuracy": 0.994707815349102, + "num_tokens": 345281167.0, + "step": 2896 + }, + { + "entropy": 0.637279249727726, + "epoch": 6.5999429712004565, + "grad_norm": 0.4453125, + "learning_rate": 1.4232972837714598e-06, + "loss": 0.0137, + "mean_token_accuracy": 0.9954697266221046, + "num_tokens": 345400360.0, + "step": 2897 + }, + { + "entropy": 0.6329695209860802, + "epoch": 6.602224123182207, + "grad_norm": 0.51953125, + "learning_rate": 1.4215977690382998e-06, + "loss": 0.0172, + "mean_token_accuracy": 0.9942305907607079, + "num_tokens": 345520646.0, + "step": 2898 + }, + { + "entropy": 0.6334237158298492, + "epoch": 6.604505275163958, + "grad_norm": 0.5546875, + "learning_rate": 1.4198988663854276e-06, + "loss": 0.0216, + "mean_token_accuracy": 0.9928867965936661, + "num_tokens": 345639941.0, + "step": 2899 + }, + { + "entropy": 0.6400255709886551, + "epoch": 6.606786427145709, + "grad_norm": 0.45703125, + "learning_rate": 1.4182005767771057e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.9942346960306168, + "num_tokens": 345760208.0, + "step": 2900 + }, + { + "entropy": 0.636529840528965, + "epoch": 6.609067579127459, + "grad_norm": 0.455078125, + "learning_rate": 1.4165029011772513e-06, + "loss": 0.0153, + "mean_token_accuracy": 0.9947907626628876, + "num_tokens": 345879078.0, + "step": 2901 + }, + { + "entropy": 0.6398524343967438, + "epoch": 6.61134873110921, + "grad_norm": 0.65625, + "learning_rate": 1.4148058405494328e-06, + "loss": 0.0158, + "mean_token_accuracy": 0.9951967671513557, + "num_tokens": 345998423.0, + "step": 2902 + }, + { + "entropy": 0.6386984810233116, + "epoch": 6.613629883090961, + "grad_norm": 0.498046875, + "learning_rate": 1.4131093958568695e-06, + "loss": 0.0141, + "mean_token_accuracy": 0.9948837533593178, + "num_tokens": 346117893.0, + "step": 2903 + }, + { + "entropy": 0.6320433169603348, + "epoch": 6.6159110350727115, + "grad_norm": 0.78125, + "learning_rate": 1.4114135680624291e-06, + "loss": 0.024, + "mean_token_accuracy": 0.9921987950801849, + "num_tokens": 346236964.0, + "step": 2904 + }, + { + "entropy": 0.6334750950336456, + "epoch": 6.618192187054462, + "grad_norm": 0.466796875, + "learning_rate": 1.4097183581286322e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.9937204644083977, + "num_tokens": 346355732.0, + "step": 2905 + }, + { + "entropy": 0.6371889859437943, + "epoch": 6.620473339036213, + "grad_norm": 0.53515625, + "learning_rate": 1.4080237670176456e-06, + "loss": 0.0143, + "mean_token_accuracy": 0.9949009120464325, + "num_tokens": 346475394.0, + "step": 2906 + }, + { + "entropy": 0.6307118535041809, + "epoch": 6.6227544910179645, + "grad_norm": 0.52734375, + "learning_rate": 1.4063297956912875e-06, + "loss": 0.0162, + "mean_token_accuracy": 0.9956228956580162, + "num_tokens": 346595068.0, + "step": 2907 + }, + { + "entropy": 0.6381781324744225, + "epoch": 6.625035642999714, + "grad_norm": 0.5, + "learning_rate": 1.4046364451110234e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9940893203020096, + "num_tokens": 346714044.0, + "step": 2908 + }, + { + "entropy": 0.6356588155031204, + "epoch": 6.627316794981466, + "grad_norm": 0.484375, + "learning_rate": 1.4029437162379666e-06, + "loss": 0.0146, + "mean_token_accuracy": 0.9948157146573067, + "num_tokens": 346832685.0, + "step": 2909 + }, + { + "entropy": 0.6340970247983932, + "epoch": 6.629597946963217, + "grad_norm": 0.41796875, + "learning_rate": 1.4012516100328766e-06, + "loss": 0.0157, + "mean_token_accuracy": 0.9949711039662361, + "num_tokens": 346951691.0, + "step": 2910 + }, + { + "entropy": 0.6363832578063011, + "epoch": 6.631879098944967, + "grad_norm": 0.52734375, + "learning_rate": 1.3995601274561605e-06, + "loss": 0.0141, + "mean_token_accuracy": 0.9946951940655708, + "num_tokens": 347071733.0, + "step": 2911 + }, + { + "entropy": 0.6389521732926369, + "epoch": 6.634160250926718, + "grad_norm": 0.70703125, + "learning_rate": 1.3978692694678711e-06, + "loss": 0.0227, + "mean_token_accuracy": 0.9932007864117622, + "num_tokens": 347190949.0, + "step": 2912 + }, + { + "entropy": 0.6348288506269455, + "epoch": 6.636441402908469, + "grad_norm": 0.41015625, + "learning_rate": 1.3961790370277068e-06, + "loss": 0.0144, + "mean_token_accuracy": 0.9949570968747139, + "num_tokens": 347310046.0, + "step": 2913 + }, + { + "entropy": 0.6414284706115723, + "epoch": 6.63872255489022, + "grad_norm": 0.5859375, + "learning_rate": 1.3944894310950113e-06, + "loss": 0.0177, + "mean_token_accuracy": 0.9941620007157326, + "num_tokens": 347429837.0, + "step": 2914 + }, + { + "entropy": 0.6354469284415245, + "epoch": 6.64100370687197, + "grad_norm": 0.439453125, + "learning_rate": 1.3928004526287729e-06, + "loss": 0.0144, + "mean_token_accuracy": 0.9961841851472855, + "num_tokens": 347549095.0, + "step": 2915 + }, + { + "entropy": 0.63006242364645, + "epoch": 6.643284858853721, + "grad_norm": 0.474609375, + "learning_rate": 1.3911121025876212e-06, + "loss": 0.0137, + "mean_token_accuracy": 0.9959888979792595, + "num_tokens": 347668255.0, + "step": 2916 + }, + { + "entropy": 0.6352219134569168, + "epoch": 6.645566010835472, + "grad_norm": 0.490234375, + "learning_rate": 1.389424381929832e-06, + "loss": 0.0159, + "mean_token_accuracy": 0.9944886490702629, + "num_tokens": 347787643.0, + "step": 2917 + }, + { + "entropy": 0.6309770941734314, + "epoch": 6.647847162817222, + "grad_norm": 0.8671875, + "learning_rate": 1.3877372916133234e-06, + "loss": 0.0282, + "mean_token_accuracy": 0.9915478900074959, + "num_tokens": 347906926.0, + "step": 2918 + }, + { + "entropy": 0.6365154981613159, + "epoch": 6.650128314798973, + "grad_norm": 0.51953125, + "learning_rate": 1.3860508325956549e-06, + "loss": 0.0166, + "mean_token_accuracy": 0.9933062940835953, + "num_tokens": 348026054.0, + "step": 2919 + }, + { + "entropy": 0.6383507549762726, + "epoch": 6.652409466780725, + "grad_norm": 0.478515625, + "learning_rate": 1.3843650058340291e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9941272959113121, + "num_tokens": 348145586.0, + "step": 2920 + }, + { + "entropy": 0.6372689679265022, + "epoch": 6.654690618762475, + "grad_norm": 0.5625, + "learning_rate": 1.382679812285287e-06, + "loss": 0.0214, + "mean_token_accuracy": 0.993633434176445, + "num_tokens": 348265395.0, + "step": 2921 + }, + { + "entropy": 0.6286096274852753, + "epoch": 6.656971770744226, + "grad_norm": 0.443359375, + "learning_rate": 1.3809952529059127e-06, + "loss": 0.0136, + "mean_token_accuracy": 0.9962253570556641, + "num_tokens": 348384654.0, + "step": 2922 + }, + { + "entropy": 0.6393873244524002, + "epoch": 6.659252922725977, + "grad_norm": 0.484375, + "learning_rate": 1.3793113286520293e-06, + "loss": 0.0123, + "mean_token_accuracy": 0.9962805658578873, + "num_tokens": 348504454.0, + "step": 2923 + }, + { + "entropy": 0.6406844705343246, + "epoch": 6.661534074707728, + "grad_norm": 0.55859375, + "learning_rate": 1.3776280404794016e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.9940581768751144, + "num_tokens": 348623793.0, + "step": 2924 + }, + { + "entropy": 0.6341758742928505, + "epoch": 6.663815226689478, + "grad_norm": 0.45703125, + "learning_rate": 1.3759453893434285e-06, + "loss": 0.0143, + "mean_token_accuracy": 0.9949855506420135, + "num_tokens": 348743038.0, + "step": 2925 + }, + { + "entropy": 0.6343164294958115, + "epoch": 6.666096378671229, + "grad_norm": 0.5546875, + "learning_rate": 1.3742633761991519e-06, + "loss": 0.0141, + "mean_token_accuracy": 0.9970225095748901, + "num_tokens": 348862287.0, + "step": 2926 + }, + { + "entropy": 0.6388371884822845, + "epoch": 6.66837753065298, + "grad_norm": 0.56640625, + "learning_rate": 1.3725820020012506e-06, + "loss": 0.0216, + "mean_token_accuracy": 0.993985503911972, + "num_tokens": 348981919.0, + "step": 2927 + }, + { + "entropy": 0.6348455026745796, + "epoch": 6.6706586826347305, + "grad_norm": 0.458984375, + "learning_rate": 1.3709012677040385e-06, + "loss": 0.0124, + "mean_token_accuracy": 0.9949459955096245, + "num_tokens": 349101418.0, + "step": 2928 + }, + { + "entropy": 0.6375269293785095, + "epoch": 6.672939834616481, + "grad_norm": 0.435546875, + "learning_rate": 1.3692211742614686e-06, + "loss": 0.0149, + "mean_token_accuracy": 0.9951573833823204, + "num_tokens": 349220273.0, + "step": 2929 + }, + { + "entropy": 0.6336026787757874, + "epoch": 6.675220986598232, + "grad_norm": 0.5234375, + "learning_rate": 1.3675417226271298e-06, + "loss": 0.0179, + "mean_token_accuracy": 0.993904821574688, + "num_tokens": 349340384.0, + "step": 2930 + }, + { + "entropy": 0.6376277059316635, + "epoch": 6.677502138579983, + "grad_norm": 0.65625, + "learning_rate": 1.365862913754247e-06, + "loss": 0.0173, + "mean_token_accuracy": 0.9922922179102898, + "num_tokens": 349460464.0, + "step": 2931 + }, + { + "entropy": 0.6327280551195145, + "epoch": 6.679783290561733, + "grad_norm": 0.65234375, + "learning_rate": 1.3641847485956782e-06, + "loss": 0.0237, + "mean_token_accuracy": 0.9926522597670555, + "num_tokens": 349579558.0, + "step": 2932 + }, + { + "entropy": 0.6359711810946465, + "epoch": 6.682064442543484, + "grad_norm": 0.5703125, + "learning_rate": 1.362507228103918e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9934621527791023, + "num_tokens": 349699087.0, + "step": 2933 + }, + { + "entropy": 0.6385207027196884, + "epoch": 6.684345594525235, + "grad_norm": 0.5390625, + "learning_rate": 1.3608303532310956e-06, + "loss": 0.0122, + "mean_token_accuracy": 0.9964470341801643, + "num_tokens": 349818232.0, + "step": 2934 + }, + { + "entropy": 0.6337127909064293, + "epoch": 6.686626746506986, + "grad_norm": 0.5546875, + "learning_rate": 1.3591541249289718e-06, + "loss": 0.0157, + "mean_token_accuracy": 0.9947812706232071, + "num_tokens": 349938376.0, + "step": 2935 + }, + { + "entropy": 0.6431235000491142, + "epoch": 6.688907898488737, + "grad_norm": 0.51953125, + "learning_rate": 1.357478544148943e-06, + "loss": 0.0168, + "mean_token_accuracy": 0.994553916156292, + "num_tokens": 350058370.0, + "step": 2936 + }, + { + "entropy": 0.6378772333264351, + "epoch": 6.691189050470488, + "grad_norm": 0.5625, + "learning_rate": 1.3558036118420343e-06, + "loss": 0.021, + "mean_token_accuracy": 0.9954608827829361, + "num_tokens": 350177730.0, + "step": 2937 + }, + { + "entropy": 0.6364620849490166, + "epoch": 6.693470202452239, + "grad_norm": 0.44140625, + "learning_rate": 1.3541293289589058e-06, + "loss": 0.0146, + "mean_token_accuracy": 0.9955110102891922, + "num_tokens": 350297766.0, + "step": 2938 + }, + { + "entropy": 0.6371370702981949, + "epoch": 6.695751354433989, + "grad_norm": 0.6640625, + "learning_rate": 1.3524556964498482e-06, + "loss": 0.0189, + "mean_token_accuracy": 0.9931941106915474, + "num_tokens": 350417227.0, + "step": 2939 + }, + { + "entropy": 0.6355059668421745, + "epoch": 6.69803250641574, + "grad_norm": 0.421875, + "learning_rate": 1.3507827152647835e-06, + "loss": 0.0126, + "mean_token_accuracy": 0.9959360286593437, + "num_tokens": 350536703.0, + "step": 2940 + }, + { + "entropy": 0.6339104101061821, + "epoch": 6.700313658397491, + "grad_norm": 0.52734375, + "learning_rate": 1.3491103863532626e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9938155487179756, + "num_tokens": 350656167.0, + "step": 2941 + }, + { + "entropy": 0.6365273222327232, + "epoch": 6.702594810379241, + "grad_norm": 0.5078125, + "learning_rate": 1.3474387106644688e-06, + "loss": 0.0146, + "mean_token_accuracy": 0.9959696531295776, + "num_tokens": 350775545.0, + "step": 2942 + }, + { + "entropy": 0.6342323496937752, + "epoch": 6.704875962360992, + "grad_norm": 0.57421875, + "learning_rate": 1.345767689147211e-06, + "loss": 0.0197, + "mean_token_accuracy": 0.9928453043103218, + "num_tokens": 350895008.0, + "step": 2943 + }, + { + "entropy": 0.6392230540513992, + "epoch": 6.707157114342743, + "grad_norm": 0.671875, + "learning_rate": 1.3440973227499293e-06, + "loss": 0.0159, + "mean_token_accuracy": 0.9955042153596878, + "num_tokens": 351014109.0, + "step": 2944 + }, + { + "entropy": 0.6298525109887123, + "epoch": 6.709438266324494, + "grad_norm": 0.5078125, + "learning_rate": 1.3424276124206917e-06, + "loss": 0.0096, + "mean_token_accuracy": 0.9972305297851562, + "num_tokens": 351133364.0, + "step": 2945 + }, + { + "entropy": 0.6315116956830025, + "epoch": 6.711719418306244, + "grad_norm": 0.58203125, + "learning_rate": 1.3407585591071944e-06, + "loss": 0.0189, + "mean_token_accuracy": 0.9936975166201591, + "num_tokens": 351252115.0, + "step": 2946 + }, + { + "entropy": 0.62917809933424, + "epoch": 6.714000570287995, + "grad_norm": 0.80078125, + "learning_rate": 1.3390901637567579e-06, + "loss": 0.0225, + "mean_token_accuracy": 0.9942162185907364, + "num_tokens": 351371919.0, + "step": 2947 + }, + { + "entropy": 0.635927602648735, + "epoch": 6.716281722269747, + "grad_norm": 0.7109375, + "learning_rate": 1.3374224273163334e-06, + "loss": 0.0251, + "mean_token_accuracy": 0.9911101683974266, + "num_tokens": 351491152.0, + "step": 2948 + }, + { + "entropy": 0.6331725940108299, + "epoch": 6.718562874251497, + "grad_norm": 0.51171875, + "learning_rate": 1.3357553507324938e-06, + "loss": 0.0137, + "mean_token_accuracy": 0.99437215924263, + "num_tokens": 351610023.0, + "step": 2949 + }, + { + "entropy": 0.6367366760969162, + "epoch": 6.720844026233248, + "grad_norm": 0.65234375, + "learning_rate": 1.3340889349514403e-06, + "loss": 0.0198, + "mean_token_accuracy": 0.9926595762372017, + "num_tokens": 351729037.0, + "step": 2950 + }, + { + "entropy": 0.6373251229524612, + "epoch": 6.723125178214999, + "grad_norm": 0.51171875, + "learning_rate": 1.3324231809189985e-06, + "loss": 0.0129, + "mean_token_accuracy": 0.9947595596313477, + "num_tokens": 351848167.0, + "step": 2951 + }, + { + "entropy": 0.6335643976926804, + "epoch": 6.7254063301967495, + "grad_norm": 0.46484375, + "learning_rate": 1.3307580895806194e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9948648661375046, + "num_tokens": 351967620.0, + "step": 2952 + }, + { + "entropy": 0.6309294328093529, + "epoch": 6.7276874821785, + "grad_norm": 0.412109375, + "learning_rate": 1.3290936618813747e-06, + "loss": 0.0122, + "mean_token_accuracy": 0.9953417629003525, + "num_tokens": 352086735.0, + "step": 2953 + }, + { + "entropy": 0.631330132484436, + "epoch": 6.729968634160251, + "grad_norm": 0.462890625, + "learning_rate": 1.327429898765962e-06, + "loss": 0.0134, + "mean_token_accuracy": 0.9949871301651001, + "num_tokens": 352206091.0, + "step": 2954 + }, + { + "entropy": 0.6359875574707985, + "epoch": 6.732249786142002, + "grad_norm": 0.45703125, + "learning_rate": 1.3257668011787018e-06, + "loss": 0.0131, + "mean_token_accuracy": 0.9962180778384209, + "num_tokens": 352325600.0, + "step": 2955 + }, + { + "entropy": 0.6341905668377876, + "epoch": 6.734530938123752, + "grad_norm": 0.49609375, + "learning_rate": 1.3241043700635352e-06, + "loss": 0.0124, + "mean_token_accuracy": 0.9958096295595169, + "num_tokens": 352444509.0, + "step": 2956 + }, + { + "entropy": 0.6377925202250481, + "epoch": 6.736812090105503, + "grad_norm": 0.734375, + "learning_rate": 1.3224426063640272e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.994290292263031, + "num_tokens": 352563870.0, + "step": 2957 + }, + { + "entropy": 0.6311382427811623, + "epoch": 6.739093242087254, + "grad_norm": 0.61328125, + "learning_rate": 1.320781511023363e-06, + "loss": 0.019, + "mean_token_accuracy": 0.9947998002171516, + "num_tokens": 352683993.0, + "step": 2958 + }, + { + "entropy": 0.6396006792783737, + "epoch": 6.7413743940690045, + "grad_norm": 0.494140625, + "learning_rate": 1.3191210849843461e-06, + "loss": 0.0148, + "mean_token_accuracy": 0.9948416873812675, + "num_tokens": 352804493.0, + "step": 2959 + }, + { + "entropy": 0.6371501088142395, + "epoch": 6.743655546050755, + "grad_norm": 0.53515625, + "learning_rate": 1.3174613291894039e-06, + "loss": 0.0133, + "mean_token_accuracy": 0.995597705245018, + "num_tokens": 352923853.0, + "step": 2960 + }, + { + "entropy": 0.6336658746004105, + "epoch": 6.745936698032507, + "grad_norm": 0.64453125, + "learning_rate": 1.3158022445805816e-06, + "loss": 0.0235, + "mean_token_accuracy": 0.993378296494484, + "num_tokens": 353043316.0, + "step": 2961 + }, + { + "entropy": 0.6309764087200165, + "epoch": 6.748217850014258, + "grad_norm": 0.44921875, + "learning_rate": 1.3141438320995433e-06, + "loss": 0.0163, + "mean_token_accuracy": 0.9958077520132065, + "num_tokens": 353162428.0, + "step": 2962 + }, + { + "entropy": 0.6348898336291313, + "epoch": 6.750499001996008, + "grad_norm": 0.482421875, + "learning_rate": 1.3124860926875732e-06, + "loss": 0.0142, + "mean_token_accuracy": 0.9954937323927879, + "num_tokens": 353282057.0, + "step": 2963 + }, + { + "entropy": 0.632026769220829, + "epoch": 6.752780153977759, + "grad_norm": 0.7421875, + "learning_rate": 1.3108290272855697e-06, + "loss": 0.0217, + "mean_token_accuracy": 0.9937203973531723, + "num_tokens": 353401761.0, + "step": 2964 + }, + { + "entropy": 0.6403724104166031, + "epoch": 6.75506130595951, + "grad_norm": 0.431640625, + "learning_rate": 1.309172636834053e-06, + "loss": 0.0081, + "mean_token_accuracy": 0.9970739334821701, + "num_tokens": 353521453.0, + "step": 2965 + }, + { + "entropy": 0.6364849582314491, + "epoch": 6.75734245794126, + "grad_norm": 0.48046875, + "learning_rate": 1.3075169222731573e-06, + "loss": 0.0177, + "mean_token_accuracy": 0.9938288033008575, + "num_tokens": 353641196.0, + "step": 2966 + }, + { + "entropy": 0.6326749250292778, + "epoch": 6.759623609923011, + "grad_norm": 0.50390625, + "learning_rate": 1.305861884542636e-06, + "loss": 0.0188, + "mean_token_accuracy": 0.9943681508302689, + "num_tokens": 353760498.0, + "step": 2967 + }, + { + "entropy": 0.6277784407138824, + "epoch": 6.761904761904762, + "grad_norm": 0.59765625, + "learning_rate": 1.3042075245818542e-06, + "loss": 0.0221, + "mean_token_accuracy": 0.9939833134412766, + "num_tokens": 353879919.0, + "step": 2968 + }, + { + "entropy": 0.6338493078947067, + "epoch": 6.764185913886513, + "grad_norm": 0.4921875, + "learning_rate": 1.3025538433297957e-06, + "loss": 0.0178, + "mean_token_accuracy": 0.993187703192234, + "num_tokens": 354000133.0, + "step": 2969 + }, + { + "entropy": 0.63945122808218, + "epoch": 6.766467065868263, + "grad_norm": 0.51953125, + "learning_rate": 1.3009008417250597e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9951031357049942, + "num_tokens": 354119650.0, + "step": 2970 + }, + { + "entropy": 0.6380373984575272, + "epoch": 6.768748217850014, + "grad_norm": 0.59765625, + "learning_rate": 1.2992485207058548e-06, + "loss": 0.0163, + "mean_token_accuracy": 0.9944958686828613, + "num_tokens": 354238641.0, + "step": 2971 + }, + { + "entropy": 0.6368511468172073, + "epoch": 6.771029369831765, + "grad_norm": 0.59765625, + "learning_rate": 1.2975968812100081e-06, + "loss": 0.019, + "mean_token_accuracy": 0.9940152987837791, + "num_tokens": 354358109.0, + "step": 2972 + }, + { + "entropy": 0.6354017779231071, + "epoch": 6.7733105218135155, + "grad_norm": 0.453125, + "learning_rate": 1.295945924174959e-06, + "loss": 0.0125, + "mean_token_accuracy": 0.9956970810890198, + "num_tokens": 354476831.0, + "step": 2973 + }, + { + "entropy": 0.6338790506124496, + "epoch": 6.775591673795267, + "grad_norm": 0.48828125, + "learning_rate": 1.2942956505377585e-06, + "loss": 0.0119, + "mean_token_accuracy": 0.995662622153759, + "num_tokens": 354595830.0, + "step": 2974 + }, + { + "entropy": 0.6301216557621956, + "epoch": 6.777872825777018, + "grad_norm": 0.6015625, + "learning_rate": 1.2926460612350688e-06, + "loss": 0.0239, + "mean_token_accuracy": 0.9921172186732292, + "num_tokens": 354715166.0, + "step": 2975 + }, + { + "entropy": 0.6312245950102806, + "epoch": 6.7801539777587685, + "grad_norm": 0.52734375, + "learning_rate": 1.2909971572031663e-06, + "loss": 0.0172, + "mean_token_accuracy": 0.9950401410460472, + "num_tokens": 354835269.0, + "step": 2976 + }, + { + "entropy": 0.6316149905323982, + "epoch": 6.782435129740519, + "grad_norm": 0.376953125, + "learning_rate": 1.2893489393779362e-06, + "loss": 0.0077, + "mean_token_accuracy": 0.9975699260830879, + "num_tokens": 354954607.0, + "step": 2977 + }, + { + "entropy": 0.6339201778173447, + "epoch": 6.78471628172227, + "grad_norm": 0.486328125, + "learning_rate": 1.2877014086948762e-06, + "loss": 0.0182, + "mean_token_accuracy": 0.9944858998060226, + "num_tokens": 355073865.0, + "step": 2978 + }, + { + "entropy": 0.6378664523363113, + "epoch": 6.786997433704021, + "grad_norm": 0.435546875, + "learning_rate": 1.2860545660890928e-06, + "loss": 0.0108, + "mean_token_accuracy": 0.9959897324442863, + "num_tokens": 355192956.0, + "step": 2979 + }, + { + "entropy": 0.6382182240486145, + "epoch": 6.789278585685771, + "grad_norm": 0.447265625, + "learning_rate": 1.2844084124953006e-06, + "loss": 0.0162, + "mean_token_accuracy": 0.9949559047818184, + "num_tokens": 355312427.0, + "step": 2980 + }, + { + "entropy": 0.6342368051409721, + "epoch": 6.791559737667522, + "grad_norm": 0.63671875, + "learning_rate": 1.2827629488478254e-06, + "loss": 0.0228, + "mean_token_accuracy": 0.9938898012042046, + "num_tokens": 355432224.0, + "step": 2981 + }, + { + "entropy": 0.6361190378665924, + "epoch": 6.793840889649273, + "grad_norm": 0.515625, + "learning_rate": 1.2811181760806013e-06, + "loss": 0.021, + "mean_token_accuracy": 0.9938919693231583, + "num_tokens": 355551280.0, + "step": 2982 + }, + { + "entropy": 0.635370597243309, + "epoch": 6.7961220416310235, + "grad_norm": 0.55859375, + "learning_rate": 1.2794740951271686e-06, + "loss": 0.0178, + "mean_token_accuracy": 0.9940051734447479, + "num_tokens": 355670067.0, + "step": 2983 + }, + { + "entropy": 0.6333052217960358, + "epoch": 6.798403193612774, + "grad_norm": 0.484375, + "learning_rate": 1.2778307069206764e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9946959838271141, + "num_tokens": 355789271.0, + "step": 2984 + }, + { + "entropy": 0.63845444470644, + "epoch": 6.800684345594525, + "grad_norm": 0.6328125, + "learning_rate": 1.2761880123938814e-06, + "loss": 0.0251, + "mean_token_accuracy": 0.9926401749253273, + "num_tokens": 355909643.0, + "step": 2985 + }, + { + "entropy": 0.6416340544819832, + "epoch": 6.802965497576276, + "grad_norm": 0.53515625, + "learning_rate": 1.2745460124791425e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.99338548630476, + "num_tokens": 356029887.0, + "step": 2986 + }, + { + "entropy": 0.6359507218003273, + "epoch": 6.805246649558027, + "grad_norm": 0.466796875, + "learning_rate": 1.272904708108429e-06, + "loss": 0.0131, + "mean_token_accuracy": 0.9955985024571419, + "num_tokens": 356148410.0, + "step": 2987 + }, + { + "entropy": 0.6357943937182426, + "epoch": 6.807527801539777, + "grad_norm": 0.53515625, + "learning_rate": 1.2712641002133128e-06, + "loss": 0.0234, + "mean_token_accuracy": 0.9933916479349136, + "num_tokens": 356267120.0, + "step": 2988 + }, + { + "entropy": 0.6311572417616844, + "epoch": 6.809808953521529, + "grad_norm": 0.55859375, + "learning_rate": 1.2696241897249728e-06, + "loss": 0.021, + "mean_token_accuracy": 0.9938133656978607, + "num_tokens": 356386122.0, + "step": 2989 + }, + { + "entropy": 0.640383169054985, + "epoch": 6.812090105503279, + "grad_norm": 0.443359375, + "learning_rate": 1.2679849775741884e-06, + "loss": 0.0122, + "mean_token_accuracy": 0.9969994947314262, + "num_tokens": 356505560.0, + "step": 2990 + }, + { + "entropy": 0.6357828378677368, + "epoch": 6.81437125748503, + "grad_norm": 0.5078125, + "learning_rate": 1.266346464691346e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9955501332879066, + "num_tokens": 356624664.0, + "step": 2991 + }, + { + "entropy": 0.634650282561779, + "epoch": 6.816652409466781, + "grad_norm": 0.546875, + "learning_rate": 1.2647086520064343e-06, + "loss": 0.02, + "mean_token_accuracy": 0.9920232966542244, + "num_tokens": 356744306.0, + "step": 2992 + }, + { + "entropy": 0.6268183141946793, + "epoch": 6.818933561448532, + "grad_norm": 0.400390625, + "learning_rate": 1.2630715404490424e-06, + "loss": 0.0088, + "mean_token_accuracy": 0.9968946948647499, + "num_tokens": 356863244.0, + "step": 2993 + }, + { + "entropy": 0.6322263106703758, + "epoch": 6.821214713430282, + "grad_norm": 0.51171875, + "learning_rate": 1.2614351309483646e-06, + "loss": 0.0165, + "mean_token_accuracy": 0.9952085316181183, + "num_tokens": 356983581.0, + "step": 2994 + }, + { + "entropy": 0.6315128207206726, + "epoch": 6.823495865412033, + "grad_norm": 0.62109375, + "learning_rate": 1.259799424433196e-06, + "loss": 0.0172, + "mean_token_accuracy": 0.9945899397134781, + "num_tokens": 357102371.0, + "step": 2995 + }, + { + "entropy": 0.6323722302913666, + "epoch": 6.825777017393784, + "grad_norm": 0.43359375, + "learning_rate": 1.25816442183193e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.9948410019278526, + "num_tokens": 357221610.0, + "step": 2996 + }, + { + "entropy": 0.6343763172626495, + "epoch": 6.8280581693755344, + "grad_norm": 0.546875, + "learning_rate": 1.2565301240725636e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.9937428385019302, + "num_tokens": 357340670.0, + "step": 2997 + }, + { + "entropy": 0.629795603454113, + "epoch": 6.830339321357285, + "grad_norm": 0.421875, + "learning_rate": 1.2548965320826928e-06, + "loss": 0.0125, + "mean_token_accuracy": 0.9963724315166473, + "num_tokens": 357459693.0, + "step": 2998 + }, + { + "entropy": 0.6340207532048225, + "epoch": 6.832620473339036, + "grad_norm": 0.53125, + "learning_rate": 1.2532636467895126e-06, + "loss": 0.0227, + "mean_token_accuracy": 0.9929986447095871, + "num_tokens": 357578928.0, + "step": 2999 + }, + { + "entropy": 0.6313799023628235, + "epoch": 6.8349016253207875, + "grad_norm": 0.48828125, + "learning_rate": 1.2516314691198172e-06, + "loss": 0.0177, + "mean_token_accuracy": 0.9935583248734474, + "num_tokens": 357698533.0, + "step": 3000 + }, + { + "entropy": 0.6322964504361153, + "epoch": 6.837182777302537, + "grad_norm": 0.5, + "learning_rate": 1.2500000000000007e-06, + "loss": 0.0201, + "mean_token_accuracy": 0.9938056170940399, + "num_tokens": 357817276.0, + "step": 3001 + }, + { + "entropy": 0.6323680281639099, + "epoch": 6.839463929284289, + "grad_norm": 0.44140625, + "learning_rate": 1.2483692403560507e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.9942272305488586, + "num_tokens": 357937961.0, + "step": 3002 + }, + { + "entropy": 0.6328236162662506, + "epoch": 6.84174508126604, + "grad_norm": 0.6015625, + "learning_rate": 1.2467391911135562e-06, + "loss": 0.0202, + "mean_token_accuracy": 0.9935239255428314, + "num_tokens": 358056674.0, + "step": 3003 + }, + { + "entropy": 0.6332319602370262, + "epoch": 6.84402623324779, + "grad_norm": 0.5, + "learning_rate": 1.2451098531977015e-06, + "loss": 0.019, + "mean_token_accuracy": 0.9931394457817078, + "num_tokens": 358176604.0, + "step": 3004 + }, + { + "entropy": 0.6376748904585838, + "epoch": 6.846307385229541, + "grad_norm": 0.412109375, + "learning_rate": 1.2434812275332678e-06, + "loss": 0.0135, + "mean_token_accuracy": 0.9950798451900482, + "num_tokens": 358295880.0, + "step": 3005 + }, + { + "entropy": 0.6391732767224312, + "epoch": 6.848588537211292, + "grad_norm": 0.447265625, + "learning_rate": 1.2418533150446324e-06, + "loss": 0.015, + "mean_token_accuracy": 0.9942489191889763, + "num_tokens": 358415522.0, + "step": 3006 + }, + { + "entropy": 0.6360207051038742, + "epoch": 6.8508696891930425, + "grad_norm": 0.53515625, + "learning_rate": 1.2402261166557647e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.994352824985981, + "num_tokens": 358534006.0, + "step": 3007 + }, + { + "entropy": 0.6372492387890816, + "epoch": 6.853150841174793, + "grad_norm": 0.53515625, + "learning_rate": 1.2385996332902326e-06, + "loss": 0.0178, + "mean_token_accuracy": 0.9947217032313347, + "num_tokens": 358653137.0, + "step": 3008 + }, + { + "entropy": 0.6365557387471199, + "epoch": 6.855431993156544, + "grad_norm": 0.5390625, + "learning_rate": 1.236973865871196e-06, + "loss": 0.0168, + "mean_token_accuracy": 0.995607778429985, + "num_tokens": 358772122.0, + "step": 3009 + }, + { + "entropy": 0.6359140500426292, + "epoch": 6.857713145138295, + "grad_norm": 0.484375, + "learning_rate": 1.2353488153214096e-06, + "loss": 0.0166, + "mean_token_accuracy": 0.9941314905881882, + "num_tokens": 358892163.0, + "step": 3010 + }, + { + "entropy": 0.640624925494194, + "epoch": 6.859994297120045, + "grad_norm": 0.5703125, + "learning_rate": 1.2337244825632217e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9951541647315025, + "num_tokens": 359011628.0, + "step": 3011 + }, + { + "entropy": 0.6285162419080734, + "epoch": 6.862275449101796, + "grad_norm": 0.57421875, + "learning_rate": 1.2321008685185699e-06, + "loss": 0.019, + "mean_token_accuracy": 0.9932080954313278, + "num_tokens": 359129629.0, + "step": 3012 + }, + { + "entropy": 0.6347610652446747, + "epoch": 6.864556601083547, + "grad_norm": 0.5390625, + "learning_rate": 1.2304779741089884e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.9948434010148048, + "num_tokens": 359249131.0, + "step": 3013 + }, + { + "entropy": 0.6365630552172661, + "epoch": 6.8668377530652975, + "grad_norm": 0.6171875, + "learning_rate": 1.228855800255599e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.9943842738866806, + "num_tokens": 359367889.0, + "step": 3014 + }, + { + "entropy": 0.635319173336029, + "epoch": 6.869118905047049, + "grad_norm": 0.498046875, + "learning_rate": 1.2272343478791165e-06, + "loss": 0.0108, + "mean_token_accuracy": 0.9970289841294289, + "num_tokens": 359487456.0, + "step": 3015 + }, + { + "entropy": 0.6357793062925339, + "epoch": 6.8714000570288, + "grad_norm": 0.59375, + "learning_rate": 1.2256136178998468e-06, + "loss": 0.0209, + "mean_token_accuracy": 0.9922231882810593, + "num_tokens": 359607672.0, + "step": 3016 + }, + { + "entropy": 0.6456190198659897, + "epoch": 6.873681209010551, + "grad_norm": 0.625, + "learning_rate": 1.2239936112376858e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9940197020769119, + "num_tokens": 359727456.0, + "step": 3017 + }, + { + "entropy": 0.6355801895260811, + "epoch": 6.875962360992301, + "grad_norm": 0.51171875, + "learning_rate": 1.2223743288121155e-06, + "loss": 0.0097, + "mean_token_accuracy": 0.996520146727562, + "num_tokens": 359846491.0, + "step": 3018 + }, + { + "entropy": 0.6400950625538826, + "epoch": 6.878243512974052, + "grad_norm": 0.46484375, + "learning_rate": 1.2207557715422106e-06, + "loss": 0.0116, + "mean_token_accuracy": 0.9962030425667763, + "num_tokens": 359966499.0, + "step": 3019 + }, + { + "entropy": 0.6377749145030975, + "epoch": 6.880524664955803, + "grad_norm": 0.62109375, + "learning_rate": 1.219137940346633e-06, + "loss": 0.0245, + "mean_token_accuracy": 0.9925379157066345, + "num_tokens": 360085622.0, + "step": 3020 + }, + { + "entropy": 0.6300649717450142, + "epoch": 6.8828058169375534, + "grad_norm": 0.447265625, + "learning_rate": 1.2175208361436328e-06, + "loss": 0.0134, + "mean_token_accuracy": 0.9955899268388748, + "num_tokens": 360204975.0, + "step": 3021 + }, + { + "entropy": 0.6311782598495483, + "epoch": 6.885086968919304, + "grad_norm": 0.5234375, + "learning_rate": 1.2159044598510473e-06, + "loss": 0.0182, + "mean_token_accuracy": 0.9945155903697014, + "num_tokens": 360324203.0, + "step": 3022 + }, + { + "entropy": 0.6346739083528519, + "epoch": 6.887368120901055, + "grad_norm": 0.63671875, + "learning_rate": 1.2142888123862992e-06, + "loss": 0.0219, + "mean_token_accuracy": 0.9923786669969559, + "num_tokens": 360443821.0, + "step": 3023 + }, + { + "entropy": 0.6335613653063774, + "epoch": 6.889649272882806, + "grad_norm": 0.486328125, + "learning_rate": 1.2126738946663996e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.9943098425865173, + "num_tokens": 360562764.0, + "step": 3024 + }, + { + "entropy": 0.6275825947523117, + "epoch": 6.891930424864556, + "grad_norm": 0.51953125, + "learning_rate": 1.2110597076079448e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9941380769014359, + "num_tokens": 360681791.0, + "step": 3025 + }, + { + "entropy": 0.6330292001366615, + "epoch": 6.894211576846307, + "grad_norm": 0.37890625, + "learning_rate": 1.2094462521271156e-06, + "loss": 0.0116, + "mean_token_accuracy": 0.9964875057339668, + "num_tokens": 360800912.0, + "step": 3026 + }, + { + "entropy": 0.6370054185390472, + "epoch": 6.896492728828058, + "grad_norm": 0.47265625, + "learning_rate": 1.2078335291396798e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.9949543923139572, + "num_tokens": 360920328.0, + "step": 3027 + }, + { + "entropy": 0.636949323117733, + "epoch": 6.898773880809809, + "grad_norm": 0.427734375, + "learning_rate": 1.2062215395609856e-06, + "loss": 0.0092, + "mean_token_accuracy": 0.9973116368055344, + "num_tokens": 361039452.0, + "step": 3028 + }, + { + "entropy": 0.6361266300082207, + "epoch": 6.90105503279156, + "grad_norm": 0.48046875, + "learning_rate": 1.2046102843059681e-06, + "loss": 0.0157, + "mean_token_accuracy": 0.9941678047180176, + "num_tokens": 361158696.0, + "step": 3029 + }, + { + "entropy": 0.6395246461033821, + "epoch": 6.903336184773311, + "grad_norm": 0.4921875, + "learning_rate": 1.202999764289145e-06, + "loss": 0.0131, + "mean_token_accuracy": 0.9946194663643837, + "num_tokens": 361278400.0, + "step": 3030 + }, + { + "entropy": 0.6317725479602814, + "epoch": 6.9056173367550615, + "grad_norm": 0.5234375, + "learning_rate": 1.201389980424616e-06, + "loss": 0.0149, + "mean_token_accuracy": 0.9959233924746513, + "num_tokens": 361397368.0, + "step": 3031 + }, + { + "entropy": 0.631452813744545, + "epoch": 6.907898488736812, + "grad_norm": 0.453125, + "learning_rate": 1.1997809336260644e-06, + "loss": 0.0159, + "mean_token_accuracy": 0.9940673857927322, + "num_tokens": 361515933.0, + "step": 3032 + }, + { + "entropy": 0.6314042285084724, + "epoch": 6.910179640718563, + "grad_norm": 0.41015625, + "learning_rate": 1.1981726248067521e-06, + "loss": 0.0128, + "mean_token_accuracy": 0.9940178915858269, + "num_tokens": 361634897.0, + "step": 3033 + }, + { + "entropy": 0.6405288577079773, + "epoch": 6.912460792700314, + "grad_norm": 0.369140625, + "learning_rate": 1.1965650548795251e-06, + "loss": 0.007, + "mean_token_accuracy": 0.9978989809751511, + "num_tokens": 361754331.0, + "step": 3034 + }, + { + "entropy": 0.634571373462677, + "epoch": 6.914741944682064, + "grad_norm": 0.62109375, + "learning_rate": 1.1949582247568107e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.9941046312451363, + "num_tokens": 361873410.0, + "step": 3035 + }, + { + "entropy": 0.634027436375618, + "epoch": 6.917023096663815, + "grad_norm": 0.49609375, + "learning_rate": 1.1933521353506117e-06, + "loss": 0.0192, + "mean_token_accuracy": 0.9925772473216057, + "num_tokens": 361993460.0, + "step": 3036 + }, + { + "entropy": 0.6358236819505692, + "epoch": 6.919304248645566, + "grad_norm": 0.470703125, + "learning_rate": 1.1917467875725148e-06, + "loss": 0.0217, + "mean_token_accuracy": 0.9933085441589355, + "num_tokens": 362112615.0, + "step": 3037 + }, + { + "entropy": 0.6353302150964737, + "epoch": 6.9215854006273165, + "grad_norm": 0.46875, + "learning_rate": 1.1901421823336856e-06, + "loss": 0.0137, + "mean_token_accuracy": 0.9948729500174522, + "num_tokens": 362232196.0, + "step": 3038 + }, + { + "entropy": 0.6353921964764595, + "epoch": 6.923866552609067, + "grad_norm": 0.58203125, + "learning_rate": 1.188538320544865e-06, + "loss": 0.0233, + "mean_token_accuracy": 0.9924131035804749, + "num_tokens": 362351469.0, + "step": 3039 + }, + { + "entropy": 0.6387206688523293, + "epoch": 6.926147704590818, + "grad_norm": 0.75, + "learning_rate": 1.1869352031163746e-06, + "loss": 0.0245, + "mean_token_accuracy": 0.9925548732280731, + "num_tokens": 362470689.0, + "step": 3040 + }, + { + "entropy": 0.6291440054774284, + "epoch": 6.92842885657257, + "grad_norm": 0.41015625, + "learning_rate": 1.1853328309581139e-06, + "loss": 0.0133, + "mean_token_accuracy": 0.9956337809562683, + "num_tokens": 362589764.0, + "step": 3041 + }, + { + "entropy": 0.6348235085606575, + "epoch": 6.93071000855432, + "grad_norm": 0.6328125, + "learning_rate": 1.183731204979557e-06, + "loss": 0.0218, + "mean_token_accuracy": 0.9926017597317696, + "num_tokens": 362709752.0, + "step": 3042 + }, + { + "entropy": 0.6378262788057327, + "epoch": 6.932991160536071, + "grad_norm": 0.625, + "learning_rate": 1.182130326089758e-06, + "loss": 0.015, + "mean_token_accuracy": 0.994606539607048, + "num_tokens": 362829523.0, + "step": 3043 + }, + { + "entropy": 0.6356984674930573, + "epoch": 6.935272312517822, + "grad_norm": 0.6015625, + "learning_rate": 1.1805301951973423e-06, + "loss": 0.0195, + "mean_token_accuracy": 0.9953272044658661, + "num_tokens": 362949035.0, + "step": 3044 + }, + { + "entropy": 0.6394585743546486, + "epoch": 6.937553464499572, + "grad_norm": 0.5078125, + "learning_rate": 1.1789308132105145e-06, + "loss": 0.0127, + "mean_token_accuracy": 0.995325118303299, + "num_tokens": 363067971.0, + "step": 3045 + }, + { + "entropy": 0.6302739381790161, + "epoch": 6.939834616481323, + "grad_norm": 0.53125, + "learning_rate": 1.1773321810370527e-06, + "loss": 0.0204, + "mean_token_accuracy": 0.9937824755907059, + "num_tokens": 363186823.0, + "step": 3046 + }, + { + "entropy": 0.6366415619850159, + "epoch": 6.942115768463074, + "grad_norm": 0.43359375, + "learning_rate": 1.1757342995843103e-06, + "loss": 0.0141, + "mean_token_accuracy": 0.9945184662938118, + "num_tokens": 363305837.0, + "step": 3047 + }, + { + "entropy": 0.6384926065802574, + "epoch": 6.944396920444825, + "grad_norm": 0.443359375, + "learning_rate": 1.1741371697592134e-06, + "loss": 0.0118, + "mean_token_accuracy": 0.9957948178052902, + "num_tokens": 363425063.0, + "step": 3048 + }, + { + "entropy": 0.6349277347326279, + "epoch": 6.946678072426575, + "grad_norm": 0.56640625, + "learning_rate": 1.1725407924682628e-06, + "loss": 0.0157, + "mean_token_accuracy": 0.9951661974191666, + "num_tokens": 363545245.0, + "step": 3049 + }, + { + "entropy": 0.6379545629024506, + "epoch": 6.948959224408326, + "grad_norm": 0.451171875, + "learning_rate": 1.17094516861753e-06, + "loss": 0.0112, + "mean_token_accuracy": 0.9965931624174118, + "num_tokens": 363665530.0, + "step": 3050 + }, + { + "entropy": 0.6296952366828918, + "epoch": 6.951240376390077, + "grad_norm": 0.5546875, + "learning_rate": 1.1693502991126609e-06, + "loss": 0.0192, + "mean_token_accuracy": 0.9933000281453133, + "num_tokens": 363784770.0, + "step": 3051 + }, + { + "entropy": 0.6343287006020546, + "epoch": 6.9535215283718275, + "grad_norm": 0.53515625, + "learning_rate": 1.1677561848588734e-06, + "loss": 0.0171, + "mean_token_accuracy": 0.9941115155816078, + "num_tokens": 363904154.0, + "step": 3052 + }, + { + "entropy": 0.6360017880797386, + "epoch": 6.955802680353578, + "grad_norm": 0.54296875, + "learning_rate": 1.166162826760955e-06, + "loss": 0.0199, + "mean_token_accuracy": 0.9930659160017967, + "num_tokens": 364024028.0, + "step": 3053 + }, + { + "entropy": 0.633079931139946, + "epoch": 6.95808383233533, + "grad_norm": 0.48828125, + "learning_rate": 1.1645702257232663e-06, + "loss": 0.0149, + "mean_token_accuracy": 0.995020903646946, + "num_tokens": 364143104.0, + "step": 3054 + }, + { + "entropy": 0.6347816288471222, + "epoch": 6.9603649843170805, + "grad_norm": 0.640625, + "learning_rate": 1.1629783826497351e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.994701161980629, + "num_tokens": 364262383.0, + "step": 3055 + }, + { + "entropy": 0.6311990469694138, + "epoch": 6.962646136298831, + "grad_norm": 0.57421875, + "learning_rate": 1.161387298443863e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.9960086643695831, + "num_tokens": 364381794.0, + "step": 3056 + }, + { + "entropy": 0.6332893893122673, + "epoch": 6.964927288280582, + "grad_norm": 0.48828125, + "learning_rate": 1.1597969740087159e-06, + "loss": 0.0136, + "mean_token_accuracy": 0.9958059713244438, + "num_tokens": 364500949.0, + "step": 3057 + }, + { + "entropy": 0.634690061211586, + "epoch": 6.967208440262333, + "grad_norm": 0.7734375, + "learning_rate": 1.1582074102469332e-06, + "loss": 0.0276, + "mean_token_accuracy": 0.991781122982502, + "num_tokens": 364620843.0, + "step": 3058 + }, + { + "entropy": 0.6358847171068192, + "epoch": 6.969489592244083, + "grad_norm": 0.51171875, + "learning_rate": 1.1566186080607198e-06, + "loss": 0.0145, + "mean_token_accuracy": 0.9954819157719612, + "num_tokens": 364740586.0, + "step": 3059 + }, + { + "entropy": 0.6353931725025177, + "epoch": 6.971770744225834, + "grad_norm": 0.59765625, + "learning_rate": 1.1550305683518506e-06, + "loss": 0.0147, + "mean_token_accuracy": 0.9957445412874222, + "num_tokens": 364859973.0, + "step": 3060 + }, + { + "entropy": 0.6316233426332474, + "epoch": 6.974051896207585, + "grad_norm": 0.72265625, + "learning_rate": 1.1534432920216643e-06, + "loss": 0.0278, + "mean_token_accuracy": 0.9908296018838882, + "num_tokens": 364978699.0, + "step": 3061 + }, + { + "entropy": 0.6320228278636932, + "epoch": 6.9763330481893355, + "grad_norm": 0.376953125, + "learning_rate": 1.151856779971069e-06, + "loss": 0.0092, + "mean_token_accuracy": 0.9973373785614967, + "num_tokens": 365097808.0, + "step": 3062 + }, + { + "entropy": 0.6382093206048012, + "epoch": 6.978614200171086, + "grad_norm": 0.5390625, + "learning_rate": 1.1502710331005384e-06, + "loss": 0.0196, + "mean_token_accuracy": 0.994241289794445, + "num_tokens": 365217133.0, + "step": 3063 + }, + { + "entropy": 0.6326189190149307, + "epoch": 6.980895352152837, + "grad_norm": 0.53515625, + "learning_rate": 1.148686052310112e-06, + "loss": 0.0169, + "mean_token_accuracy": 0.9951288998126984, + "num_tokens": 365335813.0, + "step": 3064 + }, + { + "entropy": 0.635586753487587, + "epoch": 6.983176504134588, + "grad_norm": 0.5703125, + "learning_rate": 1.147101838499395e-06, + "loss": 0.0197, + "mean_token_accuracy": 0.9917220696806908, + "num_tokens": 365455114.0, + "step": 3065 + }, + { + "entropy": 0.6359457224607468, + "epoch": 6.985457656116338, + "grad_norm": 0.5078125, + "learning_rate": 1.145518392567555e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.9950269535183907, + "num_tokens": 365574120.0, + "step": 3066 + }, + { + "entropy": 0.6370173245668411, + "epoch": 6.98773880809809, + "grad_norm": 0.53515625, + "learning_rate": 1.1439357154133263e-06, + "loss": 0.016, + "mean_token_accuracy": 0.9942919835448265, + "num_tokens": 365694735.0, + "step": 3067 + }, + { + "entropy": 0.6429724767804146, + "epoch": 6.99001996007984, + "grad_norm": 0.6015625, + "learning_rate": 1.1423538079350053e-06, + "loss": 0.02, + "mean_token_accuracy": 0.9940526038408279, + "num_tokens": 365814238.0, + "step": 3068 + }, + { + "entropy": 0.6358332931995392, + "epoch": 6.992301112061591, + "grad_norm": 0.45703125, + "learning_rate": 1.1407726710304525e-06, + "loss": 0.0149, + "mean_token_accuracy": 0.9956086650490761, + "num_tokens": 365933162.0, + "step": 3069 + }, + { + "entropy": 0.6342254281044006, + "epoch": 6.994582264043342, + "grad_norm": 0.609375, + "learning_rate": 1.139192305597092e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9937140718102455, + "num_tokens": 366052166.0, + "step": 3070 + }, + { + "entropy": 0.6297565549612045, + "epoch": 6.996863416025093, + "grad_norm": 0.59375, + "learning_rate": 1.1376127125319065e-06, + "loss": 0.0163, + "mean_token_accuracy": 0.9949942454695702, + "num_tokens": 366171828.0, + "step": 3071 + }, + { + "entropy": 0.6306156441569328, + "epoch": 6.999144568006844, + "grad_norm": 0.640625, + "learning_rate": 1.1360338927314432e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.9947856292128563, + "num_tokens": 366290614.0, + "step": 3072 + }, + { + "entropy": 0.6374910076459249, + "epoch": 7.0, + "grad_norm": 0.86328125, + "learning_rate": 1.1344558470918098e-06, + "loss": 0.0233, + "mean_token_accuracy": 0.9940271178881327, + "num_tokens": 366334346.0, + "step": 3073 + }, + { + "entropy": 0.6349820122122765, + "epoch": 7.002281151981751, + "grad_norm": 0.5234375, + "learning_rate": 1.1328785765086752e-06, + "loss": 0.0163, + "mean_token_accuracy": 0.9948492646217346, + "num_tokens": 366453691.0, + "step": 3074 + }, + { + "entropy": 0.6317384392023087, + "epoch": 7.004562303963501, + "grad_norm": 0.61328125, + "learning_rate": 1.131302081877268e-06, + "loss": 0.0203, + "mean_token_accuracy": 0.9921156466007233, + "num_tokens": 366572757.0, + "step": 3075 + }, + { + "entropy": 0.6297228559851646, + "epoch": 7.006843455945252, + "grad_norm": 0.47265625, + "learning_rate": 1.1297263640923745e-06, + "loss": 0.0175, + "mean_token_accuracy": 0.9933271482586861, + "num_tokens": 366691507.0, + "step": 3076 + }, + { + "entropy": 0.6375428661704063, + "epoch": 7.009124607927003, + "grad_norm": 0.56640625, + "learning_rate": 1.1281514240483427e-06, + "loss": 0.0165, + "mean_token_accuracy": 0.9945895671844482, + "num_tokens": 366810229.0, + "step": 3077 + }, + { + "entropy": 0.637428767979145, + "epoch": 7.011405759908754, + "grad_norm": 0.48046875, + "learning_rate": 1.1265772626390786e-06, + "loss": 0.0143, + "mean_token_accuracy": 0.9955852031707764, + "num_tokens": 366929955.0, + "step": 3078 + }, + { + "entropy": 0.6337930262088776, + "epoch": 7.013686911890504, + "grad_norm": 0.40234375, + "learning_rate": 1.1250038807580449e-06, + "loss": 0.0132, + "mean_token_accuracy": 0.9961013197898865, + "num_tokens": 367048880.0, + "step": 3079 + }, + { + "entropy": 0.6415168270468712, + "epoch": 7.015968063872256, + "grad_norm": 0.41796875, + "learning_rate": 1.1234312792982627e-06, + "loss": 0.0114, + "mean_token_accuracy": 0.9957900568842888, + "num_tokens": 367168639.0, + "step": 3080 + }, + { + "epoch": 7.015968063872256, + "eval_entropy": 0.6350783553866832, + "eval_loss": 0.020568370819091797, + "eval_mean_token_accuracy": 0.9935885785650391, + "eval_num_tokens": 367168639.0, + "eval_runtime": 177.5734, + "eval_samples_per_second": 47.22, + "eval_steps_per_second": 1.481, + "step": 3080 + }, + { + "entropy": 0.6359517797827721, + "epoch": 7.018249215854007, + "grad_norm": 0.3671875, + "learning_rate": 1.1218594591523118e-06, + "loss": 0.0107, + "mean_token_accuracy": 0.9970712587237358, + "num_tokens": 367288288.0, + "step": 3081 + }, + { + "entropy": 0.6337655186653137, + "epoch": 7.020530367835757, + "grad_norm": 0.46484375, + "learning_rate": 1.120288421212325e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.9950264990329742, + "num_tokens": 367407570.0, + "step": 3082 + }, + { + "entropy": 0.6420150995254517, + "epoch": 7.022811519817508, + "grad_norm": 0.64453125, + "learning_rate": 1.1187181663699935e-06, + "loss": 0.022, + "mean_token_accuracy": 0.9937262311577797, + "num_tokens": 367527275.0, + "step": 3083 + }, + { + "entropy": 0.6352385953068733, + "epoch": 7.025092671799259, + "grad_norm": 0.5234375, + "learning_rate": 1.1171486955165645e-06, + "loss": 0.0165, + "mean_token_accuracy": 0.9952161461114883, + "num_tokens": 367646937.0, + "step": 3084 + }, + { + "entropy": 0.6362062618136406, + "epoch": 7.0273738237810095, + "grad_norm": 0.47265625, + "learning_rate": 1.115580009542839e-06, + "loss": 0.0161, + "mean_token_accuracy": 0.9954133927822113, + "num_tokens": 367766589.0, + "step": 3085 + }, + { + "entropy": 0.6314065530896187, + "epoch": 7.02965497576276, + "grad_norm": 0.470703125, + "learning_rate": 1.1140121093391736e-06, + "loss": 0.0166, + "mean_token_accuracy": 0.9945486485958099, + "num_tokens": 367886414.0, + "step": 3086 + }, + { + "entropy": 0.6334340572357178, + "epoch": 7.031936127744511, + "grad_norm": 0.53515625, + "learning_rate": 1.1124449957954764e-06, + "loss": 0.0145, + "mean_token_accuracy": 0.9945666715502739, + "num_tokens": 368005795.0, + "step": 3087 + }, + { + "entropy": 0.6338682845234871, + "epoch": 7.034217279726262, + "grad_norm": 0.41796875, + "learning_rate": 1.110878669801212e-06, + "loss": 0.0124, + "mean_token_accuracy": 0.99635399132967, + "num_tokens": 368125668.0, + "step": 3088 + }, + { + "entropy": 0.6334460005164146, + "epoch": 7.036498431708012, + "grad_norm": 0.43359375, + "learning_rate": 1.1093131322453966e-06, + "loss": 0.0125, + "mean_token_accuracy": 0.9953039661049843, + "num_tokens": 368244844.0, + "step": 3089 + }, + { + "entropy": 0.6343573108315468, + "epoch": 7.038779583689763, + "grad_norm": 0.4765625, + "learning_rate": 1.1077483840165986e-06, + "loss": 0.0123, + "mean_token_accuracy": 0.9947286769747734, + "num_tokens": 368363812.0, + "step": 3090 + }, + { + "entropy": 0.6322126090526581, + "epoch": 7.041060735671514, + "grad_norm": 0.4765625, + "learning_rate": 1.10618442600294e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9954757690429688, + "num_tokens": 368482892.0, + "step": 3091 + }, + { + "entropy": 0.6367430314421654, + "epoch": 7.0433418876532645, + "grad_norm": 0.443359375, + "learning_rate": 1.1046212590920931e-06, + "loss": 0.0155, + "mean_token_accuracy": 0.9955750554800034, + "num_tokens": 368603024.0, + "step": 3092 + }, + { + "entropy": 0.6356260925531387, + "epoch": 7.045623039635016, + "grad_norm": 0.671875, + "learning_rate": 1.10305888417128e-06, + "loss": 0.023, + "mean_token_accuracy": 0.9932549968361855, + "num_tokens": 368722155.0, + "step": 3093 + }, + { + "entropy": 0.6300321817398071, + "epoch": 7.047904191616767, + "grad_norm": 0.38671875, + "learning_rate": 1.101497302127275e-06, + "loss": 0.012, + "mean_token_accuracy": 0.9961551055312157, + "num_tokens": 368840751.0, + "step": 3094 + }, + { + "entropy": 0.634417362511158, + "epoch": 7.050185343598518, + "grad_norm": 0.5078125, + "learning_rate": 1.0999365138464024e-06, + "loss": 0.0119, + "mean_token_accuracy": 0.9964703395962715, + "num_tokens": 368960452.0, + "step": 3095 + }, + { + "entropy": 0.6360241323709488, + "epoch": 7.052466495580268, + "grad_norm": 0.64453125, + "learning_rate": 1.0983765202145351e-06, + "loss": 0.0271, + "mean_token_accuracy": 0.9927521869540215, + "num_tokens": 369079059.0, + "step": 3096 + }, + { + "entropy": 0.6379231959581375, + "epoch": 7.054747647562019, + "grad_norm": 0.5078125, + "learning_rate": 1.0968173221170966e-06, + "loss": 0.0139, + "mean_token_accuracy": 0.9975702837109566, + "num_tokens": 369198480.0, + "step": 3097 + }, + { + "entropy": 0.6256024986505508, + "epoch": 7.05702879954377, + "grad_norm": 0.478515625, + "learning_rate": 1.0952589204390557e-06, + "loss": 0.0203, + "mean_token_accuracy": 0.993292786180973, + "num_tokens": 369318487.0, + "step": 3098 + }, + { + "entropy": 0.6404053121805191, + "epoch": 7.05930995152552, + "grad_norm": 0.58203125, + "learning_rate": 1.0937013160649328e-06, + "loss": 0.0171, + "mean_token_accuracy": 0.9943149462342262, + "num_tokens": 369437848.0, + "step": 3099 + }, + { + "entropy": 0.6341419667005539, + "epoch": 7.061591103507271, + "grad_norm": 0.546875, + "learning_rate": 1.0921445098787923e-06, + "loss": 0.0169, + "mean_token_accuracy": 0.9940514639019966, + "num_tokens": 369557882.0, + "step": 3100 + }, + { + "entropy": 0.629876896739006, + "epoch": 7.063872255489022, + "grad_norm": 0.4609375, + "learning_rate": 1.0905885027642484e-06, + "loss": 0.0161, + "mean_token_accuracy": 0.994902178645134, + "num_tokens": 369677064.0, + "step": 3101 + }, + { + "entropy": 0.6353949904441833, + "epoch": 7.066153407470773, + "grad_norm": 0.48046875, + "learning_rate": 1.0890332956044614e-06, + "loss": 0.0114, + "mean_token_accuracy": 0.9962372332811356, + "num_tokens": 369796018.0, + "step": 3102 + }, + { + "entropy": 0.6372416168451309, + "epoch": 7.068434559452523, + "grad_norm": 0.490234375, + "learning_rate": 1.0874788892821354e-06, + "loss": 0.0155, + "mean_token_accuracy": 0.9955978095531464, + "num_tokens": 369915481.0, + "step": 3103 + }, + { + "entropy": 0.63969786465168, + "epoch": 7.070715711434274, + "grad_norm": 0.52734375, + "learning_rate": 1.0859252846795215e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.994732566177845, + "num_tokens": 370035131.0, + "step": 3104 + }, + { + "entropy": 0.6320174485445023, + "epoch": 7.072996863416025, + "grad_norm": 0.5, + "learning_rate": 1.0843724826784165e-06, + "loss": 0.0132, + "mean_token_accuracy": 0.9955779016017914, + "num_tokens": 370153929.0, + "step": 3105 + }, + { + "entropy": 0.6359575167298317, + "epoch": 7.0752780153977755, + "grad_norm": 0.494140625, + "learning_rate": 1.0828204841601608e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.9958814978599548, + "num_tokens": 370273252.0, + "step": 3106 + }, + { + "entropy": 0.6304692327976227, + "epoch": 7.077559167379527, + "grad_norm": 0.55859375, + "learning_rate": 1.0812692900056384e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9931458383798599, + "num_tokens": 370392602.0, + "step": 3107 + }, + { + "entropy": 0.6323088258504868, + "epoch": 7.079840319361278, + "grad_norm": 0.498046875, + "learning_rate": 1.0797189010952784e-06, + "loss": 0.0141, + "mean_token_accuracy": 0.9947322830557823, + "num_tokens": 370511700.0, + "step": 3108 + }, + { + "entropy": 0.6332127377390862, + "epoch": 7.0821214713430285, + "grad_norm": 0.57421875, + "learning_rate": 1.0781693183090495e-06, + "loss": 0.0189, + "mean_token_accuracy": 0.9941490218043327, + "num_tokens": 370631301.0, + "step": 3109 + }, + { + "entropy": 0.6264070942997932, + "epoch": 7.084402623324779, + "grad_norm": 0.466796875, + "learning_rate": 1.076620542526466e-06, + "loss": 0.0165, + "mean_token_accuracy": 0.9944610446691513, + "num_tokens": 370750702.0, + "step": 3110 + }, + { + "entropy": 0.6340319812297821, + "epoch": 7.08668377530653, + "grad_norm": 0.53515625, + "learning_rate": 1.0750725746265832e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9941121488809586, + "num_tokens": 370871223.0, + "step": 3111 + }, + { + "entropy": 0.6320077702403069, + "epoch": 7.088964927288281, + "grad_norm": 0.4140625, + "learning_rate": 1.0735254154879979e-06, + "loss": 0.0131, + "mean_token_accuracy": 0.9971850290894508, + "num_tokens": 370990096.0, + "step": 3112 + }, + { + "entropy": 0.6342846229672432, + "epoch": 7.091246079270031, + "grad_norm": 0.48828125, + "learning_rate": 1.0719790659888481e-06, + "loss": 0.0201, + "mean_token_accuracy": 0.995010532438755, + "num_tokens": 371109599.0, + "step": 3113 + }, + { + "entropy": 0.63421980291605, + "epoch": 7.093527231251782, + "grad_norm": 0.466796875, + "learning_rate": 1.070433527006811e-06, + "loss": 0.0182, + "mean_token_accuracy": 0.9938091039657593, + "num_tokens": 371228929.0, + "step": 3114 + }, + { + "entropy": 0.6361847221851349, + "epoch": 7.095808383233533, + "grad_norm": 0.56640625, + "learning_rate": 1.0688887994191049e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9932485297322273, + "num_tokens": 371348666.0, + "step": 3115 + }, + { + "entropy": 0.6374924406409264, + "epoch": 7.0980895352152835, + "grad_norm": 0.57421875, + "learning_rate": 1.0673448841024875e-06, + "loss": 0.0199, + "mean_token_accuracy": 0.9947165995836258, + "num_tokens": 371468772.0, + "step": 3116 + }, + { + "entropy": 0.6348633840680122, + "epoch": 7.100370687197034, + "grad_norm": 0.4453125, + "learning_rate": 1.0658017819332556e-06, + "loss": 0.013, + "mean_token_accuracy": 0.9968965500593185, + "num_tokens": 371588827.0, + "step": 3117 + }, + { + "entropy": 0.6340183466672897, + "epoch": 7.102651839178785, + "grad_norm": 0.5390625, + "learning_rate": 1.064259493787244e-06, + "loss": 0.0169, + "mean_token_accuracy": 0.9947556555271149, + "num_tokens": 371708393.0, + "step": 3118 + }, + { + "entropy": 0.6351324543356895, + "epoch": 7.104932991160536, + "grad_norm": 0.59375, + "learning_rate": 1.0627180205398263e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9918829575181007, + "num_tokens": 371827697.0, + "step": 3119 + }, + { + "entropy": 0.6386658698320389, + "epoch": 7.107214143142287, + "grad_norm": 0.50390625, + "learning_rate": 1.0611773630659117e-06, + "loss": 0.0142, + "mean_token_accuracy": 0.9953688159584999, + "num_tokens": 371947402.0, + "step": 3120 + }, + { + "entropy": 0.6358364969491959, + "epoch": 7.109495295124038, + "grad_norm": 0.423828125, + "learning_rate": 1.0596375222399491e-06, + "loss": 0.0137, + "mean_token_accuracy": 0.9963491037487984, + "num_tokens": 372066657.0, + "step": 3121 + }, + { + "entropy": 0.630089245736599, + "epoch": 7.111776447105789, + "grad_norm": 0.53125, + "learning_rate": 1.0580984989359205e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.9954024702310562, + "num_tokens": 372185827.0, + "step": 3122 + }, + { + "entropy": 0.6373429074883461, + "epoch": 7.114057599087539, + "grad_norm": 0.50390625, + "learning_rate": 1.0565602940273472e-06, + "loss": 0.0129, + "mean_token_accuracy": 0.9956922978162766, + "num_tokens": 372305323.0, + "step": 3123 + }, + { + "entropy": 0.6396269649267197, + "epoch": 7.11633875106929, + "grad_norm": 0.45703125, + "learning_rate": 1.055022908387285e-06, + "loss": 0.016, + "mean_token_accuracy": 0.9950522854924202, + "num_tokens": 372424681.0, + "step": 3124 + }, + { + "entropy": 0.6341502070426941, + "epoch": 7.118619903051041, + "grad_norm": 0.404296875, + "learning_rate": 1.053486342888323e-06, + "loss": 0.0135, + "mean_token_accuracy": 0.996442161500454, + "num_tokens": 372544454.0, + "step": 3125 + }, + { + "entropy": 0.632528118789196, + "epoch": 7.120901055032792, + "grad_norm": 0.5859375, + "learning_rate": 1.0519505984025865e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.9947325587272644, + "num_tokens": 372663262.0, + "step": 3126 + }, + { + "entropy": 0.6304703056812286, + "epoch": 7.123182207014542, + "grad_norm": 0.408203125, + "learning_rate": 1.050415675801735e-06, + "loss": 0.0118, + "mean_token_accuracy": 0.995544508099556, + "num_tokens": 372782890.0, + "step": 3127 + }, + { + "entropy": 0.6355683952569962, + "epoch": 7.125463358996293, + "grad_norm": 0.5390625, + "learning_rate": 1.0488815759569605e-06, + "loss": 0.0147, + "mean_token_accuracy": 0.9954581335186958, + "num_tokens": 372902265.0, + "step": 3128 + }, + { + "entropy": 0.6321379244327545, + "epoch": 7.127744510978044, + "grad_norm": 0.443359375, + "learning_rate": 1.0473482997389891e-06, + "loss": 0.0125, + "mean_token_accuracy": 0.9952060952782631, + "num_tokens": 373021484.0, + "step": 3129 + }, + { + "entropy": 0.6365072503685951, + "epoch": 7.1300256629597945, + "grad_norm": 0.404296875, + "learning_rate": 1.0458158480180777e-06, + "loss": 0.0096, + "mean_token_accuracy": 0.9967480152845383, + "num_tokens": 373140527.0, + "step": 3130 + }, + { + "entropy": 0.6390231922268867, + "epoch": 7.132306814941545, + "grad_norm": 0.51171875, + "learning_rate": 1.0442842216640168e-06, + "loss": 0.0173, + "mean_token_accuracy": 0.9936532750725746, + "num_tokens": 373260030.0, + "step": 3131 + }, + { + "entropy": 0.630491703748703, + "epoch": 7.134587966923296, + "grad_norm": 0.419921875, + "learning_rate": 1.042753421546128e-06, + "loss": 0.0143, + "mean_token_accuracy": 0.9953824803233147, + "num_tokens": 373379111.0, + "step": 3132 + }, + { + "entropy": 0.6368322893977165, + "epoch": 7.136869118905047, + "grad_norm": 0.396484375, + "learning_rate": 1.0412234485332636e-06, + "loss": 0.0099, + "mean_token_accuracy": 0.9957470968365669, + "num_tokens": 373498373.0, + "step": 3133 + }, + { + "entropy": 0.6307329162955284, + "epoch": 7.139150270886798, + "grad_norm": 0.4921875, + "learning_rate": 1.0396943034938077e-06, + "loss": 0.0157, + "mean_token_accuracy": 0.9950628206133842, + "num_tokens": 373617092.0, + "step": 3134 + }, + { + "entropy": 0.6331895440816879, + "epoch": 7.141431422868549, + "grad_norm": 0.6015625, + "learning_rate": 1.0381659872956732e-06, + "loss": 0.0156, + "mean_token_accuracy": 0.9937996864318848, + "num_tokens": 373735837.0, + "step": 3135 + }, + { + "entropy": 0.6336100921034813, + "epoch": 7.1437125748503, + "grad_norm": 0.5390625, + "learning_rate": 1.0366385008063015e-06, + "loss": 0.0178, + "mean_token_accuracy": 0.9952037557959557, + "num_tokens": 373854871.0, + "step": 3136 + }, + { + "entropy": 0.6376906409859657, + "epoch": 7.14599372683205, + "grad_norm": 0.73046875, + "learning_rate": 1.0351118448926658e-06, + "loss": 0.0251, + "mean_token_accuracy": 0.9935107156634331, + "num_tokens": 373974667.0, + "step": 3137 + }, + { + "entropy": 0.6344331875443459, + "epoch": 7.148274878813801, + "grad_norm": 0.671875, + "learning_rate": 1.0335860204212662e-06, + "loss": 0.02, + "mean_token_accuracy": 0.9928370714187622, + "num_tokens": 374093843.0, + "step": 3138 + }, + { + "entropy": 0.6327697411179543, + "epoch": 7.150556030795552, + "grad_norm": 0.53125, + "learning_rate": 1.0320610282581309e-06, + "loss": 0.0131, + "mean_token_accuracy": 0.9948916360735893, + "num_tokens": 374213062.0, + "step": 3139 + }, + { + "entropy": 0.6361751928925514, + "epoch": 7.1528371827773025, + "grad_norm": 0.5078125, + "learning_rate": 1.0305368692688175e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.992705762386322, + "num_tokens": 374332717.0, + "step": 3140 + }, + { + "entropy": 0.6365441381931305, + "epoch": 7.155118334759053, + "grad_norm": 0.51171875, + "learning_rate": 1.029013544318407e-06, + "loss": 0.0143, + "mean_token_accuracy": 0.9956522434949875, + "num_tokens": 374452372.0, + "step": 3141 + }, + { + "entropy": 0.6323189735412598, + "epoch": 7.157399486740804, + "grad_norm": 0.61328125, + "learning_rate": 1.0274910542715103e-06, + "loss": 0.021, + "mean_token_accuracy": 0.9943337365984917, + "num_tokens": 374571403.0, + "step": 3142 + }, + { + "entropy": 0.636553592979908, + "epoch": 7.159680638722555, + "grad_norm": 0.59765625, + "learning_rate": 1.025969399992264e-06, + "loss": 0.0182, + "mean_token_accuracy": 0.9928077682852745, + "num_tokens": 374690905.0, + "step": 3143 + }, + { + "entropy": 0.6364502459764481, + "epoch": 7.161961790704305, + "grad_norm": 0.5546875, + "learning_rate": 1.0244485823443281e-06, + "loss": 0.0175, + "mean_token_accuracy": 0.99508947879076, + "num_tokens": 374810386.0, + "step": 3144 + }, + { + "entropy": 0.6359213963150978, + "epoch": 7.164242942686056, + "grad_norm": 0.5390625, + "learning_rate": 1.0229286021908913e-06, + "loss": 0.0181, + "mean_token_accuracy": 0.9938142448663712, + "num_tokens": 374930242.0, + "step": 3145 + }, + { + "entropy": 0.6381103619933128, + "epoch": 7.166524094667807, + "grad_norm": 0.546875, + "learning_rate": 1.021409460394663e-06, + "loss": 0.0171, + "mean_token_accuracy": 0.9942701309919357, + "num_tokens": 375051234.0, + "step": 3146 + }, + { + "entropy": 0.6339331939816475, + "epoch": 7.168805246649558, + "grad_norm": 0.58203125, + "learning_rate": 1.0198911578178797e-06, + "loss": 0.0211, + "mean_token_accuracy": 0.9935072287917137, + "num_tokens": 375171031.0, + "step": 3147 + }, + { + "entropy": 0.6306760981678963, + "epoch": 7.171086398631309, + "grad_norm": 0.68359375, + "learning_rate": 1.0183736953223005e-06, + "loss": 0.0199, + "mean_token_accuracy": 0.993847019970417, + "num_tokens": 375290124.0, + "step": 3148 + }, + { + "entropy": 0.6349038109183311, + "epoch": 7.17336755061306, + "grad_norm": 0.65625, + "learning_rate": 1.0168570737692082e-06, + "loss": 0.0207, + "mean_token_accuracy": 0.9933270364999771, + "num_tokens": 375409775.0, + "step": 3149 + }, + { + "entropy": 0.6351282522082329, + "epoch": 7.175648702594811, + "grad_norm": 0.56640625, + "learning_rate": 1.0153412940194073e-06, + "loss": 0.0203, + "mean_token_accuracy": 0.9940589964389801, + "num_tokens": 375529979.0, + "step": 3150 + }, + { + "entropy": 0.6293105334043503, + "epoch": 7.177929854576561, + "grad_norm": 0.4765625, + "learning_rate": 1.0138263569332268e-06, + "loss": 0.0147, + "mean_token_accuracy": 0.9950124993920326, + "num_tokens": 375649205.0, + "step": 3151 + }, + { + "entropy": 0.6313612535595894, + "epoch": 7.180211006558312, + "grad_norm": 0.51953125, + "learning_rate": 1.0123122633705131e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9945074021816254, + "num_tokens": 375768034.0, + "step": 3152 + }, + { + "entropy": 0.6320651099085808, + "epoch": 7.182492158540063, + "grad_norm": 0.47265625, + "learning_rate": 1.0107990141906378e-06, + "loss": 0.0135, + "mean_token_accuracy": 0.9946425706148148, + "num_tokens": 375886885.0, + "step": 3153 + }, + { + "entropy": 0.6334495395421982, + "epoch": 7.1847733105218134, + "grad_norm": 0.390625, + "learning_rate": 1.0092866102524922e-06, + "loss": 0.0166, + "mean_token_accuracy": 0.9931411594152451, + "num_tokens": 376006312.0, + "step": 3154 + }, + { + "entropy": 0.6375441178679466, + "epoch": 7.187054462503564, + "grad_norm": 0.494140625, + "learning_rate": 1.0077750524144871e-06, + "loss": 0.0141, + "mean_token_accuracy": 0.9950477182865143, + "num_tokens": 376126011.0, + "step": 3155 + }, + { + "entropy": 0.6323321312665939, + "epoch": 7.189335614485315, + "grad_norm": 0.54296875, + "learning_rate": 1.0062643415345546e-06, + "loss": 0.02, + "mean_token_accuracy": 0.994036965072155, + "num_tokens": 376245046.0, + "step": 3156 + }, + { + "entropy": 0.6344771534204483, + "epoch": 7.191616766467066, + "grad_norm": 0.4453125, + "learning_rate": 1.0047544784701435e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9939015954732895, + "num_tokens": 376365021.0, + "step": 3157 + }, + { + "entropy": 0.6329099908471107, + "epoch": 7.193897918448816, + "grad_norm": 0.40625, + "learning_rate": 1.0032454640782232e-06, + "loss": 0.0137, + "mean_token_accuracy": 0.9947059527039528, + "num_tokens": 376483738.0, + "step": 3158 + }, + { + "entropy": 0.6365019977092743, + "epoch": 7.196179070430567, + "grad_norm": 0.51171875, + "learning_rate": 1.0017372992152819e-06, + "loss": 0.0167, + "mean_token_accuracy": 0.994713731110096, + "num_tokens": 376603727.0, + "step": 3159 + }, + { + "entropy": 0.6309832260012627, + "epoch": 7.198460222412319, + "grad_norm": 0.53125, + "learning_rate": 1.0002299847373243e-06, + "loss": 0.0169, + "mean_token_accuracy": 0.9932510554790497, + "num_tokens": 376723017.0, + "step": 3160 + }, + { + "entropy": 0.6296091228723526, + "epoch": 7.200741374394069, + "grad_norm": 0.609375, + "learning_rate": 9.987235214998741e-07, + "loss": 0.0222, + "mean_token_accuracy": 0.9930503889918327, + "num_tokens": 376841807.0, + "step": 3161 + }, + { + "entropy": 0.6354536488652229, + "epoch": 7.20302252637582, + "grad_norm": 0.43359375, + "learning_rate": 9.972179103579687e-07, + "loss": 0.0169, + "mean_token_accuracy": 0.9932919070124626, + "num_tokens": 376960835.0, + "step": 3162 + }, + { + "entropy": 0.6335585340857506, + "epoch": 7.205303678357571, + "grad_norm": 0.64453125, + "learning_rate": 9.957131521661655e-07, + "loss": 0.0216, + "mean_token_accuracy": 0.9941623210906982, + "num_tokens": 377080396.0, + "step": 3163 + }, + { + "entropy": 0.629380889236927, + "epoch": 7.2075848303393215, + "grad_norm": 0.4609375, + "learning_rate": 9.942092477785365e-07, + "loss": 0.0135, + "mean_token_accuracy": 0.995454765856266, + "num_tokens": 377199801.0, + "step": 3164 + }, + { + "entropy": 0.6326709240674973, + "epoch": 7.209865982321072, + "grad_norm": 0.51953125, + "learning_rate": 9.927061980486668e-07, + "loss": 0.0173, + "mean_token_accuracy": 0.9958153963088989, + "num_tokens": 377318687.0, + "step": 3165 + }, + { + "entropy": 0.6373377963900566, + "epoch": 7.212147134302823, + "grad_norm": 0.369140625, + "learning_rate": 9.9120400382966e-07, + "loss": 0.0103, + "mean_token_accuracy": 0.9964360073208809, + "num_tokens": 377437945.0, + "step": 3166 + }, + { + "entropy": 0.633292905986309, + "epoch": 7.214428286284574, + "grad_norm": 0.5390625, + "learning_rate": 9.897026659741328e-07, + "loss": 0.0193, + "mean_token_accuracy": 0.9948393180966377, + "num_tokens": 377557116.0, + "step": 3167 + }, + { + "entropy": 0.6359308287501335, + "epoch": 7.216709438266324, + "grad_norm": 0.421875, + "learning_rate": 9.882021853342143e-07, + "loss": 0.0126, + "mean_token_accuracy": 0.9963874667882919, + "num_tokens": 377676128.0, + "step": 3168 + }, + { + "entropy": 0.6351453438401222, + "epoch": 7.218990590248075, + "grad_norm": 0.44140625, + "learning_rate": 9.867025627615493e-07, + "loss": 0.0098, + "mean_token_accuracy": 0.9962119162082672, + "num_tokens": 377795717.0, + "step": 3169 + }, + { + "entropy": 0.6295133680105209, + "epoch": 7.221271742229826, + "grad_norm": 0.58984375, + "learning_rate": 9.852037991072941e-07, + "loss": 0.0227, + "mean_token_accuracy": 0.993132047355175, + "num_tokens": 377914919.0, + "step": 3170 + }, + { + "entropy": 0.6319722384214401, + "epoch": 7.2235528942115765, + "grad_norm": 0.55078125, + "learning_rate": 9.837058952221182e-07, + "loss": 0.019, + "mean_token_accuracy": 0.9937842860817909, + "num_tokens": 378034372.0, + "step": 3171 + }, + { + "entropy": 0.6305086314678192, + "epoch": 7.225834046193327, + "grad_norm": 0.4453125, + "learning_rate": 9.822088519562038e-07, + "loss": 0.0136, + "mean_token_accuracy": 0.9961704686284065, + "num_tokens": 378153714.0, + "step": 3172 + }, + { + "entropy": 0.6374504864215851, + "epoch": 7.228115198175079, + "grad_norm": 0.478515625, + "learning_rate": 9.80712670159242e-07, + "loss": 0.0119, + "mean_token_accuracy": 0.996232658624649, + "num_tokens": 378272831.0, + "step": 3173 + }, + { + "entropy": 0.6370948255062103, + "epoch": 7.23039635015683, + "grad_norm": 0.5546875, + "learning_rate": 9.792173506804378e-07, + "loss": 0.0161, + "mean_token_accuracy": 0.9939969107508659, + "num_tokens": 378391555.0, + "step": 3174 + }, + { + "entropy": 0.6352758333086967, + "epoch": 7.23267750213858, + "grad_norm": 0.54296875, + "learning_rate": 9.777228943685055e-07, + "loss": 0.0178, + "mean_token_accuracy": 0.9959843680262566, + "num_tokens": 378511091.0, + "step": 3175 + }, + { + "entropy": 0.6297379285097122, + "epoch": 7.234958654120331, + "grad_norm": 0.47265625, + "learning_rate": 9.762293020716696e-07, + "loss": 0.012, + "mean_token_accuracy": 0.9962622821331024, + "num_tokens": 378629920.0, + "step": 3176 + }, + { + "entropy": 0.6321572363376617, + "epoch": 7.237239806102082, + "grad_norm": 0.8046875, + "learning_rate": 9.74736574637665e-07, + "loss": 0.0163, + "mean_token_accuracy": 0.9962349757552147, + "num_tokens": 378749066.0, + "step": 3177 + }, + { + "entropy": 0.6339247152209282, + "epoch": 7.2395209580838324, + "grad_norm": 0.640625, + "learning_rate": 9.732447129137337e-07, + "loss": 0.02, + "mean_token_accuracy": 0.9931877702474594, + "num_tokens": 378868603.0, + "step": 3178 + }, + { + "entropy": 0.6316575482487679, + "epoch": 7.241802110065583, + "grad_norm": 0.7265625, + "learning_rate": 9.717537177466279e-07, + "loss": 0.0217, + "mean_token_accuracy": 0.9948482513427734, + "num_tokens": 378987792.0, + "step": 3179 + }, + { + "entropy": 0.6357111781835556, + "epoch": 7.244083262047334, + "grad_norm": 0.427734375, + "learning_rate": 9.702635899826082e-07, + "loss": 0.0153, + "mean_token_accuracy": 0.9959608092904091, + "num_tokens": 379107074.0, + "step": 3180 + }, + { + "entropy": 0.6314530596137047, + "epoch": 7.246364414029085, + "grad_norm": 0.46484375, + "learning_rate": 9.687743304674421e-07, + "loss": 0.0132, + "mean_token_accuracy": 0.9954675063490868, + "num_tokens": 379225360.0, + "step": 3181 + }, + { + "entropy": 0.6376235410571098, + "epoch": 7.248645566010835, + "grad_norm": 0.828125, + "learning_rate": 9.672859400464046e-07, + "loss": 0.0252, + "mean_token_accuracy": 0.9921997785568237, + "num_tokens": 379344325.0, + "step": 3182 + }, + { + "entropy": 0.634081058204174, + "epoch": 7.250926717992586, + "grad_norm": 0.6015625, + "learning_rate": 9.657984195642783e-07, + "loss": 0.0218, + "mean_token_accuracy": 0.9929080083966255, + "num_tokens": 379462635.0, + "step": 3183 + }, + { + "entropy": 0.6386000588536263, + "epoch": 7.253207869974337, + "grad_norm": 0.5390625, + "learning_rate": 9.64311769865349e-07, + "loss": 0.0154, + "mean_token_accuracy": 0.9957747608423233, + "num_tokens": 379581894.0, + "step": 3184 + }, + { + "entropy": 0.6321931481361389, + "epoch": 7.2554890219560875, + "grad_norm": 0.55078125, + "learning_rate": 9.628259917934118e-07, + "loss": 0.0184, + "mean_token_accuracy": 0.9934622272849083, + "num_tokens": 379701071.0, + "step": 3185 + }, + { + "entropy": 0.6350580155849457, + "epoch": 7.257770173937839, + "grad_norm": 0.57421875, + "learning_rate": 9.613410861917661e-07, + "loss": 0.0168, + "mean_token_accuracy": 0.9930899739265442, + "num_tokens": 379820321.0, + "step": 3186 + }, + { + "entropy": 0.6358660832047462, + "epoch": 7.26005132591959, + "grad_norm": 0.439453125, + "learning_rate": 9.59857053903214e-07, + "loss": 0.0136, + "mean_token_accuracy": 0.9946546405553818, + "num_tokens": 379939720.0, + "step": 3187 + }, + { + "entropy": 0.6379834413528442, + "epoch": 7.2623324779013405, + "grad_norm": 0.98828125, + "learning_rate": 9.583738957700653e-07, + "loss": 0.0175, + "mean_token_accuracy": 0.994237095117569, + "num_tokens": 380059467.0, + "step": 3188 + }, + { + "entropy": 0.6329062432050705, + "epoch": 7.264613629883091, + "grad_norm": 0.4921875, + "learning_rate": 9.568916126341305e-07, + "loss": 0.0144, + "mean_token_accuracy": 0.9948809891939163, + "num_tokens": 380178356.0, + "step": 3189 + }, + { + "entropy": 0.6385719925165176, + "epoch": 7.266894781864842, + "grad_norm": 0.46484375, + "learning_rate": 9.554102053367253e-07, + "loss": 0.0165, + "mean_token_accuracy": 0.9953120946884155, + "num_tokens": 380298060.0, + "step": 3190 + }, + { + "entropy": 0.640006385743618, + "epoch": 7.269175933846593, + "grad_norm": 0.58984375, + "learning_rate": 9.53929674718668e-07, + "loss": 0.0195, + "mean_token_accuracy": 0.9951125830411911, + "num_tokens": 380417229.0, + "step": 3191 + }, + { + "entropy": 0.6309252455830574, + "epoch": 7.271457085828343, + "grad_norm": 0.6640625, + "learning_rate": 9.524500216202795e-07, + "loss": 0.0249, + "mean_token_accuracy": 0.989452138543129, + "num_tokens": 380536601.0, + "step": 3192 + }, + { + "entropy": 0.6380420625209808, + "epoch": 7.273738237810094, + "grad_norm": 0.490234375, + "learning_rate": 9.50971246881382e-07, + "loss": 0.0173, + "mean_token_accuracy": 0.9939146265387535, + "num_tokens": 380655545.0, + "step": 3193 + }, + { + "entropy": 0.6327937543392181, + "epoch": 7.276019389791845, + "grad_norm": 0.43359375, + "learning_rate": 9.494933513413007e-07, + "loss": 0.0169, + "mean_token_accuracy": 0.9947640746831894, + "num_tokens": 380774058.0, + "step": 3194 + }, + { + "entropy": 0.6364415138959885, + "epoch": 7.2783005417735955, + "grad_norm": 0.62890625, + "learning_rate": 9.480163358388584e-07, + "loss": 0.0164, + "mean_token_accuracy": 0.9935459271073341, + "num_tokens": 380893927.0, + "step": 3195 + }, + { + "entropy": 0.6349638104438782, + "epoch": 7.280581693755346, + "grad_norm": 0.50390625, + "learning_rate": 9.465402012123818e-07, + "loss": 0.0183, + "mean_token_accuracy": 0.9936476945877075, + "num_tokens": 381013517.0, + "step": 3196 + }, + { + "entropy": 0.6359527856111526, + "epoch": 7.282862845737097, + "grad_norm": 0.46875, + "learning_rate": 9.45064948299696e-07, + "loss": 0.0203, + "mean_token_accuracy": 0.99544258415699, + "num_tokens": 381132721.0, + "step": 3197 + }, + { + "entropy": 0.6328136548399925, + "epoch": 7.285143997718848, + "grad_norm": 0.61328125, + "learning_rate": 9.435905779381265e-07, + "loss": 0.0165, + "mean_token_accuracy": 0.9953528866171837, + "num_tokens": 381251024.0, + "step": 3198 + }, + { + "entropy": 0.6350339353084564, + "epoch": 7.287425149700598, + "grad_norm": 0.455078125, + "learning_rate": 9.421170909644983e-07, + "loss": 0.0121, + "mean_token_accuracy": 0.9967426732182503, + "num_tokens": 381371271.0, + "step": 3199 + }, + { + "entropy": 0.6366783678531647, + "epoch": 7.289706301682349, + "grad_norm": 0.4453125, + "learning_rate": 9.406444882151322e-07, + "loss": 0.0138, + "mean_token_accuracy": 0.9942903742194176, + "num_tokens": 381491073.0, + "step": 3200 + }, + { + "entropy": 0.6392788514494896, + "epoch": 7.291987453664101, + "grad_norm": 0.51171875, + "learning_rate": 9.391727705258502e-07, + "loss": 0.0166, + "mean_token_accuracy": 0.9951516389846802, + "num_tokens": 381611022.0, + "step": 3201 + }, + { + "entropy": 0.6356100216507912, + "epoch": 7.2942686056458514, + "grad_norm": 0.546875, + "learning_rate": 9.377019387319705e-07, + "loss": 0.0152, + "mean_token_accuracy": 0.9944534003734589, + "num_tokens": 381730092.0, + "step": 3202 + }, + { + "entropy": 0.633549153804779, + "epoch": 7.296549757627602, + "grad_norm": 0.55078125, + "learning_rate": 9.362319936683092e-07, + "loss": 0.0176, + "mean_token_accuracy": 0.993553102016449, + "num_tokens": 381849388.0, + "step": 3203 + }, + { + "entropy": 0.6331641599535942, + "epoch": 7.298830909609353, + "grad_norm": 0.48046875, + "learning_rate": 9.347629361691795e-07, + "loss": 0.0145, + "mean_token_accuracy": 0.9955428019165993, + "num_tokens": 381969104.0, + "step": 3204 + }, + { + "entropy": 0.6346428692340851, + "epoch": 7.301112061591104, + "grad_norm": 0.44921875, + "learning_rate": 9.332947670683882e-07, + "loss": 0.013, + "mean_token_accuracy": 0.996285118162632, + "num_tokens": 382088406.0, + "step": 3205 + }, + { + "entropy": 0.6346376538276672, + "epoch": 7.303393213572854, + "grad_norm": 0.5546875, + "learning_rate": 9.318274871992408e-07, + "loss": 0.0161, + "mean_token_accuracy": 0.995022751390934, + "num_tokens": 382208749.0, + "step": 3206 + }, + { + "entropy": 0.6411733254790306, + "epoch": 7.305674365554605, + "grad_norm": 0.484375, + "learning_rate": 9.303610973945376e-07, + "loss": 0.0161, + "mean_token_accuracy": 0.9938062727451324, + "num_tokens": 382328610.0, + "step": 3207 + }, + { + "entropy": 0.627198651432991, + "epoch": 7.307955517536356, + "grad_norm": 0.51171875, + "learning_rate": 9.288955984865717e-07, + "loss": 0.0138, + "mean_token_accuracy": 0.9955696165561676, + "num_tokens": 382447625.0, + "step": 3208 + }, + { + "entropy": 0.6331163719296455, + "epoch": 7.3102366695181065, + "grad_norm": 0.75, + "learning_rate": 9.274309913071328e-07, + "loss": 0.0162, + "mean_token_accuracy": 0.994100034236908, + "num_tokens": 382566399.0, + "step": 3209 + }, + { + "entropy": 0.634904071688652, + "epoch": 7.312517821499857, + "grad_norm": 0.56640625, + "learning_rate": 9.259672766875044e-07, + "loss": 0.0197, + "mean_token_accuracy": 0.9941210672259331, + "num_tokens": 382685565.0, + "step": 3210 + }, + { + "entropy": 0.6356063559651375, + "epoch": 7.314798973481608, + "grad_norm": 0.443359375, + "learning_rate": 9.245044554584609e-07, + "loss": 0.0117, + "mean_token_accuracy": 0.9953567758202553, + "num_tokens": 382804889.0, + "step": 3211 + }, + { + "entropy": 0.63348238915205, + "epoch": 7.317080125463359, + "grad_norm": 0.453125, + "learning_rate": 9.230425284502725e-07, + "loss": 0.0184, + "mean_token_accuracy": 0.9948332831263542, + "num_tokens": 382924383.0, + "step": 3212 + }, + { + "entropy": 0.63861183822155, + "epoch": 7.319361277445109, + "grad_norm": 0.490234375, + "learning_rate": 9.215814964927005e-07, + "loss": 0.0142, + "mean_token_accuracy": 0.9954968616366386, + "num_tokens": 383043810.0, + "step": 3213 + }, + { + "entropy": 0.6318069472908974, + "epoch": 7.321642429426861, + "grad_norm": 0.51953125, + "learning_rate": 9.201213604149989e-07, + "loss": 0.0171, + "mean_token_accuracy": 0.9941596761345863, + "num_tokens": 383162341.0, + "step": 3214 + }, + { + "entropy": 0.6337085515260696, + "epoch": 7.323923581408612, + "grad_norm": 0.8046875, + "learning_rate": 9.186621210459129e-07, + "loss": 0.0304, + "mean_token_accuracy": 0.9916827082633972, + "num_tokens": 383281916.0, + "step": 3215 + }, + { + "entropy": 0.6372535228729248, + "epoch": 7.326204733390362, + "grad_norm": 0.486328125, + "learning_rate": 9.172037792136773e-07, + "loss": 0.0134, + "mean_token_accuracy": 0.9940377101302147, + "num_tokens": 383402084.0, + "step": 3216 + }, + { + "entropy": 0.6328097581863403, + "epoch": 7.328485885372113, + "grad_norm": 0.5390625, + "learning_rate": 9.157463357460194e-07, + "loss": 0.0097, + "mean_token_accuracy": 0.9959743097424507, + "num_tokens": 383521568.0, + "step": 3217 + }, + { + "entropy": 0.6325557082891464, + "epoch": 7.330767037353864, + "grad_norm": 0.69921875, + "learning_rate": 9.142897914701565e-07, + "loss": 0.0187, + "mean_token_accuracy": 0.9932654872536659, + "num_tokens": 383641339.0, + "step": 3218 + }, + { + "entropy": 0.6388049423694611, + "epoch": 7.3330481893356145, + "grad_norm": 0.58984375, + "learning_rate": 9.128341472127944e-07, + "loss": 0.0182, + "mean_token_accuracy": 0.9949917197227478, + "num_tokens": 383760966.0, + "step": 3219 + }, + { + "entropy": 0.6356083676218987, + "epoch": 7.335329341317365, + "grad_norm": 0.55859375, + "learning_rate": 9.113794038001298e-07, + "loss": 0.0239, + "mean_token_accuracy": 0.9919374585151672, + "num_tokens": 383880785.0, + "step": 3220 + }, + { + "entropy": 0.630994901061058, + "epoch": 7.337610493299116, + "grad_norm": 0.515625, + "learning_rate": 9.099255620578451e-07, + "loss": 0.0129, + "mean_token_accuracy": 0.9955395683646202, + "num_tokens": 384000401.0, + "step": 3221 + }, + { + "entropy": 0.6376608312129974, + "epoch": 7.339891645280867, + "grad_norm": 0.458984375, + "learning_rate": 9.084726228111141e-07, + "loss": 0.0145, + "mean_token_accuracy": 0.9955434501171112, + "num_tokens": 384119171.0, + "step": 3222 + }, + { + "entropy": 0.6318785399198532, + "epoch": 7.342172797262617, + "grad_norm": 0.48828125, + "learning_rate": 9.070205868845966e-07, + "loss": 0.0118, + "mean_token_accuracy": 0.9962022677063942, + "num_tokens": 384238137.0, + "step": 3223 + }, + { + "entropy": 0.6338217854499817, + "epoch": 7.344453949244368, + "grad_norm": 0.5234375, + "learning_rate": 9.055694551024402e-07, + "loss": 0.0139, + "mean_token_accuracy": 0.9947351217269897, + "num_tokens": 384358036.0, + "step": 3224 + }, + { + "entropy": 0.6330504268407822, + "epoch": 7.346735101226119, + "grad_norm": 0.59765625, + "learning_rate": 9.041192282882796e-07, + "loss": 0.0214, + "mean_token_accuracy": 0.9932188764214516, + "num_tokens": 384477625.0, + "step": 3225 + }, + { + "entropy": 0.6360602304339409, + "epoch": 7.3490162532078696, + "grad_norm": 0.5, + "learning_rate": 9.026699072652361e-07, + "loss": 0.0115, + "mean_token_accuracy": 0.9957920908927917, + "num_tokens": 384596790.0, + "step": 3226 + }, + { + "entropy": 0.6386269479990005, + "epoch": 7.351297405189621, + "grad_norm": 0.546875, + "learning_rate": 9.012214928559149e-07, + "loss": 0.0167, + "mean_token_accuracy": 0.9935852363705635, + "num_tokens": 384717303.0, + "step": 3227 + }, + { + "entropy": 0.6351408734917641, + "epoch": 7.353578557171372, + "grad_norm": 0.421875, + "learning_rate": 8.997739858824083e-07, + "loss": 0.0157, + "mean_token_accuracy": 0.9963550269603729, + "num_tokens": 384836723.0, + "step": 3228 + }, + { + "entropy": 0.6333401501178741, + "epoch": 7.355859709153123, + "grad_norm": 0.515625, + "learning_rate": 8.983273871662951e-07, + "loss": 0.0101, + "mean_token_accuracy": 0.9961800426244736, + "num_tokens": 384956692.0, + "step": 3229 + }, + { + "entropy": 0.6344352215528488, + "epoch": 7.358140861134873, + "grad_norm": 0.412109375, + "learning_rate": 8.968816975286346e-07, + "loss": 0.0143, + "mean_token_accuracy": 0.9942577332258224, + "num_tokens": 385075732.0, + "step": 3230 + }, + { + "entropy": 0.6347010880708694, + "epoch": 7.360422013116624, + "grad_norm": 0.66015625, + "learning_rate": 8.954369177899727e-07, + "loss": 0.0305, + "mean_token_accuracy": 0.9928447753190994, + "num_tokens": 385195524.0, + "step": 3231 + }, + { + "entropy": 0.6362119168043137, + "epoch": 7.362703165098375, + "grad_norm": 0.62890625, + "learning_rate": 8.939930487703402e-07, + "loss": 0.0253, + "mean_token_accuracy": 0.9927683845162392, + "num_tokens": 385315349.0, + "step": 3232 + }, + { + "entropy": 0.6377596035599709, + "epoch": 7.3649843170801255, + "grad_norm": 0.421875, + "learning_rate": 8.925500912892471e-07, + "loss": 0.0151, + "mean_token_accuracy": 0.9939955621957779, + "num_tokens": 385434133.0, + "step": 3233 + }, + { + "entropy": 0.6373363733291626, + "epoch": 7.367265469061876, + "grad_norm": 0.474609375, + "learning_rate": 8.911080461656893e-07, + "loss": 0.014, + "mean_token_accuracy": 0.9963131323456764, + "num_tokens": 385552799.0, + "step": 3234 + }, + { + "entropy": 0.6400044113397598, + "epoch": 7.369546621043627, + "grad_norm": 0.498046875, + "learning_rate": 8.896669142181436e-07, + "loss": 0.0173, + "mean_token_accuracy": 0.9962252750992775, + "num_tokens": 385672773.0, + "step": 3235 + }, + { + "entropy": 0.6335572823882103, + "epoch": 7.371827773025378, + "grad_norm": 0.486328125, + "learning_rate": 8.882266962645695e-07, + "loss": 0.0195, + "mean_token_accuracy": 0.9940142259001732, + "num_tokens": 385792492.0, + "step": 3236 + }, + { + "entropy": 0.6325846016407013, + "epoch": 7.374108925007128, + "grad_norm": 0.55859375, + "learning_rate": 8.867873931224053e-07, + "loss": 0.0175, + "mean_token_accuracy": 0.993807964026928, + "num_tokens": 385912237.0, + "step": 3237 + }, + { + "entropy": 0.6321521922945976, + "epoch": 7.376390076988879, + "grad_norm": 0.5546875, + "learning_rate": 8.853490056085723e-07, + "loss": 0.0203, + "mean_token_accuracy": 0.9921271726489067, + "num_tokens": 386031403.0, + "step": 3238 + }, + { + "entropy": 0.6346258893609047, + "epoch": 7.37867122897063, + "grad_norm": 0.53125, + "learning_rate": 8.839115345394716e-07, + "loss": 0.016, + "mean_token_accuracy": 0.994689330458641, + "num_tokens": 386150571.0, + "step": 3239 + }, + { + "entropy": 0.6347269788384438, + "epoch": 7.380952380952381, + "grad_norm": 0.58984375, + "learning_rate": 8.824749807309846e-07, + "loss": 0.0198, + "mean_token_accuracy": 0.9951428696513176, + "num_tokens": 386270471.0, + "step": 3240 + }, + { + "entropy": 0.6345096454024315, + "epoch": 7.383233532934132, + "grad_norm": 0.609375, + "learning_rate": 8.810393449984706e-07, + "loss": 0.0229, + "mean_token_accuracy": 0.9930067658424377, + "num_tokens": 386389457.0, + "step": 3241 + }, + { + "entropy": 0.6353055983781815, + "epoch": 7.385514684915883, + "grad_norm": 0.41796875, + "learning_rate": 8.7960462815677e-07, + "loss": 0.0142, + "mean_token_accuracy": 0.9961632937192917, + "num_tokens": 386508105.0, + "step": 3242 + }, + { + "entropy": 0.6352842003107071, + "epoch": 7.3877958368976335, + "grad_norm": 0.462890625, + "learning_rate": 8.781708310201989e-07, + "loss": 0.014, + "mean_token_accuracy": 0.9958514124155045, + "num_tokens": 386628297.0, + "step": 3243 + }, + { + "entropy": 0.6372435390949249, + "epoch": 7.390076988879384, + "grad_norm": 0.8515625, + "learning_rate": 8.767379544025531e-07, + "loss": 0.0311, + "mean_token_accuracy": 0.9898180440068245, + "num_tokens": 386747176.0, + "step": 3244 + }, + { + "entropy": 0.6374886259436607, + "epoch": 7.392358140861135, + "grad_norm": 0.5546875, + "learning_rate": 8.753059991171065e-07, + "loss": 0.0152, + "mean_token_accuracy": 0.993800476193428, + "num_tokens": 386865876.0, + "step": 3245 + }, + { + "entropy": 0.6305877864360809, + "epoch": 7.394639292842886, + "grad_norm": 0.65625, + "learning_rate": 8.738749659766085e-07, + "loss": 0.0216, + "mean_token_accuracy": 0.9945859983563423, + "num_tokens": 386985272.0, + "step": 3246 + }, + { + "entropy": 0.6330234035849571, + "epoch": 7.396920444824636, + "grad_norm": 0.67578125, + "learning_rate": 8.724448557932874e-07, + "loss": 0.0266, + "mean_token_accuracy": 0.9918053597211838, + "num_tokens": 387105130.0, + "step": 3247 + }, + { + "entropy": 0.6295353397727013, + "epoch": 7.399201596806387, + "grad_norm": 0.58203125, + "learning_rate": 8.71015669378844e-07, + "loss": 0.0169, + "mean_token_accuracy": 0.9933076202869415, + "num_tokens": 387224646.0, + "step": 3248 + }, + { + "entropy": 0.631928838789463, + "epoch": 7.401482748788138, + "grad_norm": 0.49609375, + "learning_rate": 8.69587407544458e-07, + "loss": 0.0119, + "mean_token_accuracy": 0.9971795231103897, + "num_tokens": 387343502.0, + "step": 3249 + }, + { + "entropy": 0.6375774070620537, + "epoch": 7.4037639007698886, + "grad_norm": 0.4921875, + "learning_rate": 8.681600711007832e-07, + "loss": 0.0126, + "mean_token_accuracy": 0.9959183931350708, + "num_tokens": 387462308.0, + "step": 3250 + }, + { + "entropy": 0.635082334280014, + "epoch": 7.406045052751639, + "grad_norm": 0.5078125, + "learning_rate": 8.667336608579488e-07, + "loss": 0.0153, + "mean_token_accuracy": 0.9957398101687431, + "num_tokens": 387581767.0, + "step": 3251 + }, + { + "entropy": 0.6346960812807083, + "epoch": 7.40832620473339, + "grad_norm": 0.392578125, + "learning_rate": 8.653081776255562e-07, + "loss": 0.0124, + "mean_token_accuracy": 0.9949546307325363, + "num_tokens": 387701742.0, + "step": 3252 + }, + { + "entropy": 0.6333270445466042, + "epoch": 7.410607356715142, + "grad_norm": 0.365234375, + "learning_rate": 8.638836222126839e-07, + "loss": 0.009, + "mean_token_accuracy": 0.9969546422362328, + "num_tokens": 387821653.0, + "step": 3253 + }, + { + "entropy": 0.6392737850546837, + "epoch": 7.412888508696892, + "grad_norm": 0.5234375, + "learning_rate": 8.624599954278803e-07, + "loss": 0.0155, + "mean_token_accuracy": 0.9951820895075798, + "num_tokens": 387940434.0, + "step": 3254 + }, + { + "entropy": 0.6379840523004532, + "epoch": 7.415169660678643, + "grad_norm": 0.470703125, + "learning_rate": 8.610372980791695e-07, + "loss": 0.0157, + "mean_token_accuracy": 0.99410729855299, + "num_tokens": 388059798.0, + "step": 3255 + }, + { + "entropy": 0.6380589231848717, + "epoch": 7.417450812660394, + "grad_norm": 0.4765625, + "learning_rate": 8.59615530974047e-07, + "loss": 0.013, + "mean_token_accuracy": 0.9954761937260628, + "num_tokens": 388179536.0, + "step": 3256 + }, + { + "entropy": 0.6443532407283783, + "epoch": 7.4197319646421445, + "grad_norm": 0.6171875, + "learning_rate": 8.581946949194802e-07, + "loss": 0.023, + "mean_token_accuracy": 0.9936416670680046, + "num_tokens": 388298692.0, + "step": 3257 + }, + { + "entropy": 0.6326147392392159, + "epoch": 7.422013116623895, + "grad_norm": 0.4609375, + "learning_rate": 8.56774790721909e-07, + "loss": 0.0102, + "mean_token_accuracy": 0.9955715835094452, + "num_tokens": 388418417.0, + "step": 3258 + }, + { + "entropy": 0.6352388560771942, + "epoch": 7.424294268605646, + "grad_norm": 0.52734375, + "learning_rate": 8.553558191872422e-07, + "loss": 0.0146, + "mean_token_accuracy": 0.9942400008440018, + "num_tokens": 388537604.0, + "step": 3259 + }, + { + "entropy": 0.6309987083077431, + "epoch": 7.426575420587397, + "grad_norm": 0.65234375, + "learning_rate": 8.539377811208613e-07, + "loss": 0.02, + "mean_token_accuracy": 0.9937703907489777, + "num_tokens": 388657069.0, + "step": 3260 + }, + { + "entropy": 0.633402444422245, + "epoch": 7.428856572569147, + "grad_norm": 0.51953125, + "learning_rate": 8.525206773276173e-07, + "loss": 0.0202, + "mean_token_accuracy": 0.9959453418850899, + "num_tokens": 388776022.0, + "step": 3261 + }, + { + "entropy": 0.6389647647738457, + "epoch": 7.431137724550898, + "grad_norm": 0.451171875, + "learning_rate": 8.511045086118311e-07, + "loss": 0.0132, + "mean_token_accuracy": 0.9952664077281952, + "num_tokens": 388895964.0, + "step": 3262 + }, + { + "entropy": 0.6333517953753471, + "epoch": 7.433418876532649, + "grad_norm": 0.56640625, + "learning_rate": 8.496892757772934e-07, + "loss": 0.0135, + "mean_token_accuracy": 0.9948504939675331, + "num_tokens": 389016068.0, + "step": 3263 + }, + { + "entropy": 0.6351913511753082, + "epoch": 7.4357000285143995, + "grad_norm": 0.5703125, + "learning_rate": 8.482749796272613e-07, + "loss": 0.0157, + "mean_token_accuracy": 0.9954832792282104, + "num_tokens": 389134926.0, + "step": 3264 + }, + { + "entropy": 0.6386267766356468, + "epoch": 7.43798118049615, + "grad_norm": 0.443359375, + "learning_rate": 8.468616209644634e-07, + "loss": 0.0119, + "mean_token_accuracy": 0.9963128343224525, + "num_tokens": 389253892.0, + "step": 3265 + }, + { + "entropy": 0.6346601843833923, + "epoch": 7.440262332477902, + "grad_norm": 0.4296875, + "learning_rate": 8.454492005910942e-07, + "loss": 0.0106, + "mean_token_accuracy": 0.9970691725611687, + "num_tokens": 389373312.0, + "step": 3266 + }, + { + "entropy": 0.6362580955028534, + "epoch": 7.4425434844596525, + "grad_norm": 0.578125, + "learning_rate": 8.440377193088162e-07, + "loss": 0.0139, + "mean_token_accuracy": 0.995345413684845, + "num_tokens": 389492905.0, + "step": 3267 + }, + { + "entropy": 0.6391911655664444, + "epoch": 7.444824636441403, + "grad_norm": 0.54296875, + "learning_rate": 8.426271779187592e-07, + "loss": 0.0162, + "mean_token_accuracy": 0.9939671754837036, + "num_tokens": 389612899.0, + "step": 3268 + }, + { + "entropy": 0.6327624246478081, + "epoch": 7.447105788423154, + "grad_norm": 0.52734375, + "learning_rate": 8.4121757722152e-07, + "loss": 0.0136, + "mean_token_accuracy": 0.9965995252132416, + "num_tokens": 389732257.0, + "step": 3269 + }, + { + "entropy": 0.6319688111543655, + "epoch": 7.449386940404905, + "grad_norm": 0.51171875, + "learning_rate": 8.398089180171592e-07, + "loss": 0.0141, + "mean_token_accuracy": 0.9955659508705139, + "num_tokens": 389851656.0, + "step": 3270 + }, + { + "entropy": 0.6421779692173004, + "epoch": 7.451668092386655, + "grad_norm": 0.482421875, + "learning_rate": 8.384012011052053e-07, + "loss": 0.0157, + "mean_token_accuracy": 0.9939877614378929, + "num_tokens": 389971723.0, + "step": 3271 + }, + { + "entropy": 0.6325620785355568, + "epoch": 7.453949244368406, + "grad_norm": 0.54296875, + "learning_rate": 8.369944272846522e-07, + "loss": 0.019, + "mean_token_accuracy": 0.9935583621263504, + "num_tokens": 390091167.0, + "step": 3272 + }, + { + "entropy": 0.6413876265287399, + "epoch": 7.456230396350157, + "grad_norm": 0.5546875, + "learning_rate": 8.355885973539557e-07, + "loss": 0.0156, + "mean_token_accuracy": 0.9957999363541603, + "num_tokens": 390210282.0, + "step": 3273 + }, + { + "entropy": 0.6364572569727898, + "epoch": 7.4585115483319075, + "grad_norm": 0.54296875, + "learning_rate": 8.341837121110386e-07, + "loss": 0.0194, + "mean_token_accuracy": 0.9950196892023087, + "num_tokens": 390330139.0, + "step": 3274 + }, + { + "entropy": 0.635563313961029, + "epoch": 7.460792700313658, + "grad_norm": 0.419921875, + "learning_rate": 8.327797723532874e-07, + "loss": 0.0118, + "mean_token_accuracy": 0.9961223304271698, + "num_tokens": 390449989.0, + "step": 3275 + }, + { + "entropy": 0.6369268670678139, + "epoch": 7.463073852295409, + "grad_norm": 0.58203125, + "learning_rate": 8.313767788775498e-07, + "loss": 0.019, + "mean_token_accuracy": 0.9948150143027306, + "num_tokens": 390568871.0, + "step": 3276 + }, + { + "entropy": 0.6377203688025475, + "epoch": 7.46535500427716, + "grad_norm": 0.62109375, + "learning_rate": 8.299747324801385e-07, + "loss": 0.0161, + "mean_token_accuracy": 0.9942247867584229, + "num_tokens": 390687955.0, + "step": 3277 + }, + { + "entropy": 0.6313562765717506, + "epoch": 7.46763615625891, + "grad_norm": 0.361328125, + "learning_rate": 8.285736339568279e-07, + "loss": 0.0138, + "mean_token_accuracy": 0.9954286441206932, + "num_tokens": 390807073.0, + "step": 3278 + }, + { + "entropy": 0.6312079280614853, + "epoch": 7.469917308240661, + "grad_norm": 0.369140625, + "learning_rate": 8.271734841028553e-07, + "loss": 0.0123, + "mean_token_accuracy": 0.9963722229003906, + "num_tokens": 390926414.0, + "step": 3279 + }, + { + "entropy": 0.6336323395371437, + "epoch": 7.472198460222412, + "grad_norm": 0.5390625, + "learning_rate": 8.25774283712917e-07, + "loss": 0.0196, + "mean_token_accuracy": 0.9933202415704727, + "num_tokens": 391046820.0, + "step": 3280 + }, + { + "entropy": 0.6319271102547646, + "epoch": 7.4744796122041635, + "grad_norm": 0.6171875, + "learning_rate": 8.243760335811734e-07, + "loss": 0.0171, + "mean_token_accuracy": 0.9945070892572403, + "num_tokens": 391165565.0, + "step": 3281 + }, + { + "entropy": 0.6401942372322083, + "epoch": 7.476760764185914, + "grad_norm": 0.55078125, + "learning_rate": 8.229787345012439e-07, + "loss": 0.017, + "mean_token_accuracy": 0.9939602538943291, + "num_tokens": 391284970.0, + "step": 3282 + }, + { + "entropy": 0.636204794049263, + "epoch": 7.479041916167665, + "grad_norm": 0.6875, + "learning_rate": 8.215823872662084e-07, + "loss": 0.0226, + "mean_token_accuracy": 0.9921455234289169, + "num_tokens": 391404987.0, + "step": 3283 + }, + { + "entropy": 0.6345239356160164, + "epoch": 7.481323068149416, + "grad_norm": 0.4765625, + "learning_rate": 8.201869926686068e-07, + "loss": 0.0139, + "mean_token_accuracy": 0.9954396709799767, + "num_tokens": 391524649.0, + "step": 3284 + }, + { + "entropy": 0.6376892849802971, + "epoch": 7.483604220131166, + "grad_norm": 0.56640625, + "learning_rate": 8.187925515004391e-07, + "loss": 0.0148, + "mean_token_accuracy": 0.994087889790535, + "num_tokens": 391643903.0, + "step": 3285 + }, + { + "entropy": 0.6386966705322266, + "epoch": 7.485885372112917, + "grad_norm": 0.63671875, + "learning_rate": 8.173990645531612e-07, + "loss": 0.0159, + "mean_token_accuracy": 0.9953105002641678, + "num_tokens": 391763570.0, + "step": 3286 + }, + { + "entropy": 0.6364014223217964, + "epoch": 7.488166524094668, + "grad_norm": 0.48046875, + "learning_rate": 8.160065326176905e-07, + "loss": 0.0147, + "mean_token_accuracy": 0.9958532750606537, + "num_tokens": 391882484.0, + "step": 3287 + }, + { + "entropy": 0.6359819173812866, + "epoch": 7.4904476760764185, + "grad_norm": 0.51953125, + "learning_rate": 8.14614956484401e-07, + "loss": 0.0153, + "mean_token_accuracy": 0.9956971853971481, + "num_tokens": 392001707.0, + "step": 3288 + }, + { + "entropy": 0.6328353211283684, + "epoch": 7.492728828058169, + "grad_norm": 0.5703125, + "learning_rate": 8.132243369431248e-07, + "loss": 0.0154, + "mean_token_accuracy": 0.9936883226037025, + "num_tokens": 392122203.0, + "step": 3289 + }, + { + "entropy": 0.6391125097870827, + "epoch": 7.49500998003992, + "grad_norm": 0.57421875, + "learning_rate": 8.11834674783151e-07, + "loss": 0.0213, + "mean_token_accuracy": 0.9930542260408401, + "num_tokens": 392241404.0, + "step": 3290 + }, + { + "entropy": 0.6353010758757591, + "epoch": 7.497291132021671, + "grad_norm": 0.56640625, + "learning_rate": 8.104459707932238e-07, + "loss": 0.0137, + "mean_token_accuracy": 0.9949190840125084, + "num_tokens": 392360656.0, + "step": 3291 + }, + { + "entropy": 0.6264548525214195, + "epoch": 7.499572284003421, + "grad_norm": 0.54296875, + "learning_rate": 8.090582257615456e-07, + "loss": 0.0149, + "mean_token_accuracy": 0.9951038360595703, + "num_tokens": 392479931.0, + "step": 3292 + }, + { + "entropy": 0.6325851753354073, + "epoch": 7.501853435985172, + "grad_norm": 0.5703125, + "learning_rate": 8.076714404757735e-07, + "loss": 0.0176, + "mean_token_accuracy": 0.9947512149810791, + "num_tokens": 392599137.0, + "step": 3293 + }, + { + "entropy": 0.6379763409495354, + "epoch": 7.504134587966924, + "grad_norm": 0.44921875, + "learning_rate": 8.062856157230209e-07, + "loss": 0.0182, + "mean_token_accuracy": 0.9940667673945427, + "num_tokens": 392718085.0, + "step": 3294 + }, + { + "entropy": 0.637290321290493, + "epoch": 7.506415739948674, + "grad_norm": 0.6171875, + "learning_rate": 8.049007522898536e-07, + "loss": 0.0164, + "mean_token_accuracy": 0.9938595741987228, + "num_tokens": 392837169.0, + "step": 3295 + }, + { + "entropy": 0.6293230727314949, + "epoch": 7.508696891930425, + "grad_norm": 0.5546875, + "learning_rate": 8.035168509622948e-07, + "loss": 0.0125, + "mean_token_accuracy": 0.9961462169885635, + "num_tokens": 392956009.0, + "step": 3296 + }, + { + "entropy": 0.6390967145562172, + "epoch": 7.510978043912176, + "grad_norm": 0.59375, + "learning_rate": 8.02133912525819e-07, + "loss": 0.0241, + "mean_token_accuracy": 0.9932846873998642, + "num_tokens": 393076593.0, + "step": 3297 + }, + { + "entropy": 0.6323270201683044, + "epoch": 7.5132591958939265, + "grad_norm": 0.4765625, + "learning_rate": 8.007519377653558e-07, + "loss": 0.012, + "mean_token_accuracy": 0.995063066482544, + "num_tokens": 393195186.0, + "step": 3298 + }, + { + "entropy": 0.6341218128800392, + "epoch": 7.515540347875677, + "grad_norm": 0.5703125, + "learning_rate": 7.993709274652872e-07, + "loss": 0.0127, + "mean_token_accuracy": 0.996982179582119, + "num_tokens": 393313915.0, + "step": 3299 + }, + { + "entropy": 0.6330108717083931, + "epoch": 7.517821499857428, + "grad_norm": 0.53125, + "learning_rate": 7.979908824094484e-07, + "loss": 0.0138, + "mean_token_accuracy": 0.9954887330532074, + "num_tokens": 393434262.0, + "step": 3300 + }, + { + "epoch": 7.517821499857428, + "eval_entropy": 0.6338581610541834, + "eval_loss": 0.020540663972496986, + "eval_mean_token_accuracy": 0.9936253652826461, + "eval_num_tokens": 393434262.0, + "eval_runtime": 177.5269, + "eval_samples_per_second": 47.232, + "eval_steps_per_second": 1.481, + "step": 3300 + }, + { + "entropy": 0.6322118490934372, + "epoch": 7.520102651839179, + "grad_norm": 0.6796875, + "learning_rate": 7.966118033811271e-07, + "loss": 0.0204, + "mean_token_accuracy": 0.9937509447336197, + "num_tokens": 393554341.0, + "step": 3301 + }, + { + "entropy": 0.6306112185120583, + "epoch": 7.522383803820929, + "grad_norm": 0.58984375, + "learning_rate": 7.952336911630604e-07, + "loss": 0.0176, + "mean_token_accuracy": 0.9952610805630684, + "num_tokens": 393674069.0, + "step": 3302 + }, + { + "entropy": 0.6350048333406448, + "epoch": 7.52466495580268, + "grad_norm": 0.52734375, + "learning_rate": 7.938565465374384e-07, + "loss": 0.014, + "mean_token_accuracy": 0.995182104408741, + "num_tokens": 393793768.0, + "step": 3303 + }, + { + "entropy": 0.6375582441687584, + "epoch": 7.526946107784431, + "grad_norm": 0.53515625, + "learning_rate": 7.924803702859024e-07, + "loss": 0.0207, + "mean_token_accuracy": 0.9938037768006325, + "num_tokens": 393913129.0, + "step": 3304 + }, + { + "entropy": 0.6360964849591255, + "epoch": 7.529227259766182, + "grad_norm": 0.5078125, + "learning_rate": 7.911051631895433e-07, + "loss": 0.0196, + "mean_token_accuracy": 0.9954104498028755, + "num_tokens": 394032519.0, + "step": 3305 + }, + { + "entropy": 0.6301134526729584, + "epoch": 7.531508411747932, + "grad_norm": 0.52734375, + "learning_rate": 7.897309260289027e-07, + "loss": 0.0145, + "mean_token_accuracy": 0.9944526553153992, + "num_tokens": 394151618.0, + "step": 3306 + }, + { + "entropy": 0.6343382596969604, + "epoch": 7.533789563729684, + "grad_norm": 0.51953125, + "learning_rate": 7.883576595839698e-07, + "loss": 0.0188, + "mean_token_accuracy": 0.9941438362002373, + "num_tokens": 394271036.0, + "step": 3307 + }, + { + "entropy": 0.6273718774318695, + "epoch": 7.536070715711435, + "grad_norm": 0.6953125, + "learning_rate": 7.869853646341849e-07, + "loss": 0.0218, + "mean_token_accuracy": 0.9934746250510216, + "num_tokens": 394389329.0, + "step": 3308 + }, + { + "entropy": 0.6314441859722137, + "epoch": 7.538351867693185, + "grad_norm": 0.7265625, + "learning_rate": 7.856140419584357e-07, + "loss": 0.0263, + "mean_token_accuracy": 0.9943873658776283, + "num_tokens": 394508400.0, + "step": 3309 + }, + { + "entropy": 0.6366250216960907, + "epoch": 7.540633019674936, + "grad_norm": 0.498046875, + "learning_rate": 7.842436923350591e-07, + "loss": 0.0146, + "mean_token_accuracy": 0.9941807314753532, + "num_tokens": 394628200.0, + "step": 3310 + }, + { + "entropy": 0.6341413259506226, + "epoch": 7.542914171656687, + "grad_norm": 0.59765625, + "learning_rate": 7.828743165418393e-07, + "loss": 0.0205, + "mean_token_accuracy": 0.9930286929011345, + "num_tokens": 394747111.0, + "step": 3311 + }, + { + "entropy": 0.6353755444288254, + "epoch": 7.5451953236384375, + "grad_norm": 0.59765625, + "learning_rate": 7.815059153560065e-07, + "loss": 0.0171, + "mean_token_accuracy": 0.9940722361207008, + "num_tokens": 394867002.0, + "step": 3312 + }, + { + "entropy": 0.6353518068790436, + "epoch": 7.547476475620188, + "grad_norm": 0.47265625, + "learning_rate": 7.801384895542391e-07, + "loss": 0.0114, + "mean_token_accuracy": 0.9964521750807762, + "num_tokens": 394986587.0, + "step": 3313 + }, + { + "entropy": 0.6361161097884178, + "epoch": 7.549757627601939, + "grad_norm": 0.515625, + "learning_rate": 7.78772039912662e-07, + "loss": 0.014, + "mean_token_accuracy": 0.995022788643837, + "num_tokens": 395105957.0, + "step": 3314 + }, + { + "entropy": 0.6315803304314613, + "epoch": 7.55203877958369, + "grad_norm": 0.4453125, + "learning_rate": 7.774065672068463e-07, + "loss": 0.0124, + "mean_token_accuracy": 0.9955874532461166, + "num_tokens": 395225286.0, + "step": 3315 + }, + { + "entropy": 0.6311120539903641, + "epoch": 7.55431993156544, + "grad_norm": 0.48828125, + "learning_rate": 7.760420722118059e-07, + "loss": 0.0129, + "mean_token_accuracy": 0.9950401410460472, + "num_tokens": 395344617.0, + "step": 3316 + }, + { + "entropy": 0.638682596385479, + "epoch": 7.556601083547191, + "grad_norm": 0.4296875, + "learning_rate": 7.746785557020034e-07, + "loss": 0.0164, + "mean_token_accuracy": 0.9935962036252022, + "num_tokens": 395464856.0, + "step": 3317 + }, + { + "entropy": 0.6361925378441811, + "epoch": 7.558882235528942, + "grad_norm": 0.5078125, + "learning_rate": 7.733160184513447e-07, + "loss": 0.0142, + "mean_token_accuracy": 0.9950376227498055, + "num_tokens": 395583742.0, + "step": 3318 + }, + { + "entropy": 0.6427748128771782, + "epoch": 7.5611633875106925, + "grad_norm": 0.423828125, + "learning_rate": 7.719544612331781e-07, + "loss": 0.0132, + "mean_token_accuracy": 0.9950528144836426, + "num_tokens": 395702987.0, + "step": 3319 + }, + { + "entropy": 0.6336889937520027, + "epoch": 7.563444539492444, + "grad_norm": 0.53515625, + "learning_rate": 7.705938848202985e-07, + "loss": 0.0133, + "mean_token_accuracy": 0.9947771653532982, + "num_tokens": 395821773.0, + "step": 3320 + }, + { + "entropy": 0.6386550068855286, + "epoch": 7.565725691474195, + "grad_norm": 0.51953125, + "learning_rate": 7.692342899849419e-07, + "loss": 0.0145, + "mean_token_accuracy": 0.994300402700901, + "num_tokens": 395941485.0, + "step": 3321 + }, + { + "entropy": 0.642134964466095, + "epoch": 7.5680068434559455, + "grad_norm": 0.482421875, + "learning_rate": 7.678756774987897e-07, + "loss": 0.0184, + "mean_token_accuracy": 0.9947060272097588, + "num_tokens": 396061116.0, + "step": 3322 + }, + { + "entropy": 0.6371975615620613, + "epoch": 7.570287995437696, + "grad_norm": 0.46484375, + "learning_rate": 7.665180481329621e-07, + "loss": 0.0152, + "mean_token_accuracy": 0.9952811151742935, + "num_tokens": 396180054.0, + "step": 3323 + }, + { + "entropy": 0.6350036785006523, + "epoch": 7.572569147419447, + "grad_norm": 0.52734375, + "learning_rate": 7.651614026580243e-07, + "loss": 0.0144, + "mean_token_accuracy": 0.9946000054478645, + "num_tokens": 396298880.0, + "step": 3324 + }, + { + "entropy": 0.6338519603013992, + "epoch": 7.574850299401198, + "grad_norm": 0.5859375, + "learning_rate": 7.638057418439818e-07, + "loss": 0.0185, + "mean_token_accuracy": 0.9947049990296364, + "num_tokens": 396418329.0, + "step": 3325 + }, + { + "entropy": 0.6354232132434845, + "epoch": 7.577131451382948, + "grad_norm": 0.439453125, + "learning_rate": 7.624510664602819e-07, + "loss": 0.0128, + "mean_token_accuracy": 0.9960441067814827, + "num_tokens": 396537632.0, + "step": 3326 + }, + { + "entropy": 0.6356603503227234, + "epoch": 7.579412603364699, + "grad_norm": 0.443359375, + "learning_rate": 7.610973772758118e-07, + "loss": 0.0126, + "mean_token_accuracy": 0.9964862763881683, + "num_tokens": 396656458.0, + "step": 3327 + }, + { + "entropy": 0.6297992244362831, + "epoch": 7.58169375534645, + "grad_norm": 0.416015625, + "learning_rate": 7.597446750589005e-07, + "loss": 0.0149, + "mean_token_accuracy": 0.9942820966243744, + "num_tokens": 396775394.0, + "step": 3328 + }, + { + "entropy": 0.6363051906228065, + "epoch": 7.583974907328201, + "grad_norm": 0.59765625, + "learning_rate": 7.583929605773138e-07, + "loss": 0.0155, + "mean_token_accuracy": 0.9946112185716629, + "num_tokens": 396894531.0, + "step": 3329 + }, + { + "entropy": 0.6350549086928368, + "epoch": 7.586256059309951, + "grad_norm": 0.58203125, + "learning_rate": 7.570422345982598e-07, + "loss": 0.0243, + "mean_token_accuracy": 0.9934422522783279, + "num_tokens": 397014115.0, + "step": 3330 + }, + { + "entropy": 0.6281477734446526, + "epoch": 7.588537211291702, + "grad_norm": 0.4609375, + "learning_rate": 7.556924978883843e-07, + "loss": 0.0135, + "mean_token_accuracy": 0.9957791790366173, + "num_tokens": 397133170.0, + "step": 3331 + }, + { + "entropy": 0.6273889914155006, + "epoch": 7.590818363273453, + "grad_norm": 0.52734375, + "learning_rate": 7.543437512137717e-07, + "loss": 0.0237, + "mean_token_accuracy": 0.9949691072106361, + "num_tokens": 397252467.0, + "step": 3332 + }, + { + "entropy": 0.6353944689035416, + "epoch": 7.593099515255204, + "grad_norm": 0.427734375, + "learning_rate": 7.529959953399455e-07, + "loss": 0.0129, + "mean_token_accuracy": 0.9958343431353569, + "num_tokens": 397372646.0, + "step": 3333 + }, + { + "entropy": 0.6340312957763672, + "epoch": 7.595380667236955, + "grad_norm": 0.54296875, + "learning_rate": 7.516492310318643e-07, + "loss": 0.0134, + "mean_token_accuracy": 0.9957044869661331, + "num_tokens": 397491498.0, + "step": 3334 + }, + { + "entropy": 0.6318125128746033, + "epoch": 7.597661819218706, + "grad_norm": 0.435546875, + "learning_rate": 7.503034590539266e-07, + "loss": 0.012, + "mean_token_accuracy": 0.9964729696512222, + "num_tokens": 397610303.0, + "step": 3335 + }, + { + "entropy": 0.6322143897414207, + "epoch": 7.5999429712004565, + "grad_norm": 0.494140625, + "learning_rate": 7.489586801699661e-07, + "loss": 0.012, + "mean_token_accuracy": 0.9962347969412804, + "num_tokens": 397729148.0, + "step": 3336 + }, + { + "entropy": 0.6301423981785774, + "epoch": 7.602224123182207, + "grad_norm": 0.5859375, + "learning_rate": 7.476148951432543e-07, + "loss": 0.0169, + "mean_token_accuracy": 0.9942048713564873, + "num_tokens": 397848245.0, + "step": 3337 + }, + { + "entropy": 0.6382800489664078, + "epoch": 7.604505275163958, + "grad_norm": 0.75390625, + "learning_rate": 7.462721047364965e-07, + "loss": 0.0268, + "mean_token_accuracy": 0.9919798970222473, + "num_tokens": 397967969.0, + "step": 3338 + }, + { + "entropy": 0.6317357420921326, + "epoch": 7.606786427145709, + "grad_norm": 0.60546875, + "learning_rate": 7.449303097118355e-07, + "loss": 0.0165, + "mean_token_accuracy": 0.9936380088329315, + "num_tokens": 398087344.0, + "step": 3339 + }, + { + "entropy": 0.6414213180541992, + "epoch": 7.609067579127459, + "grad_norm": 0.57421875, + "learning_rate": 7.435895108308472e-07, + "loss": 0.0202, + "mean_token_accuracy": 0.9937860444188118, + "num_tokens": 398207293.0, + "step": 3340 + }, + { + "entropy": 0.6344632655382156, + "epoch": 7.61134873110921, + "grad_norm": 0.62109375, + "learning_rate": 7.422497088545436e-07, + "loss": 0.0185, + "mean_token_accuracy": 0.9937090128660202, + "num_tokens": 398326741.0, + "step": 3341 + }, + { + "entropy": 0.6333240121603012, + "epoch": 7.613629883090961, + "grad_norm": 0.6484375, + "learning_rate": 7.409109045433704e-07, + "loss": 0.0206, + "mean_token_accuracy": 0.9942409619688988, + "num_tokens": 398446295.0, + "step": 3342 + }, + { + "entropy": 0.6355526074767113, + "epoch": 7.6159110350727115, + "grad_norm": 0.57421875, + "learning_rate": 7.395730986572075e-07, + "loss": 0.0194, + "mean_token_accuracy": 0.9936162680387497, + "num_tokens": 398565217.0, + "step": 3343 + }, + { + "entropy": 0.6374208033084869, + "epoch": 7.618192187054462, + "grad_norm": 0.4140625, + "learning_rate": 7.382362919553682e-07, + "loss": 0.0139, + "mean_token_accuracy": 0.995328314602375, + "num_tokens": 398684765.0, + "step": 3344 + }, + { + "entropy": 0.6364301517605782, + "epoch": 7.620473339036213, + "grad_norm": 0.515625, + "learning_rate": 7.369004851965966e-07, + "loss": 0.0172, + "mean_token_accuracy": 0.9944184571504593, + "num_tokens": 398804596.0, + "step": 3345 + }, + { + "entropy": 0.6371932327747345, + "epoch": 7.6227544910179645, + "grad_norm": 0.5546875, + "learning_rate": 7.355656791390717e-07, + "loss": 0.0189, + "mean_token_accuracy": 0.9948907941579819, + "num_tokens": 398924835.0, + "step": 3346 + }, + { + "entropy": 0.6303799971938133, + "epoch": 7.625035642999714, + "grad_norm": 0.458984375, + "learning_rate": 7.342318745404034e-07, + "loss": 0.0172, + "mean_token_accuracy": 0.9951719045639038, + "num_tokens": 399044610.0, + "step": 3347 + }, + { + "entropy": 0.6347168982028961, + "epoch": 7.627316794981466, + "grad_norm": 0.71875, + "learning_rate": 7.32899072157634e-07, + "loss": 0.0189, + "mean_token_accuracy": 0.9938200041651726, + "num_tokens": 399163935.0, + "step": 3348 + }, + { + "entropy": 0.6384136229753494, + "epoch": 7.629597946963217, + "grad_norm": 0.48046875, + "learning_rate": 7.315672727472365e-07, + "loss": 0.0157, + "mean_token_accuracy": 0.9967293366789818, + "num_tokens": 399283712.0, + "step": 3349 + }, + { + "entropy": 0.6300618052482605, + "epoch": 7.631879098944967, + "grad_norm": 0.390625, + "learning_rate": 7.302364770651132e-07, + "loss": 0.0131, + "mean_token_accuracy": 0.9958118498325348, + "num_tokens": 399403017.0, + "step": 3350 + }, + { + "entropy": 0.6328745484352112, + "epoch": 7.634160250926718, + "grad_norm": 0.5234375, + "learning_rate": 7.289066858665991e-07, + "loss": 0.0149, + "mean_token_accuracy": 0.9939709678292274, + "num_tokens": 399522741.0, + "step": 3351 + }, + { + "entropy": 0.6339434757828712, + "epoch": 7.636441402908469, + "grad_norm": 0.66015625, + "learning_rate": 7.275778999064578e-07, + "loss": 0.0206, + "mean_token_accuracy": 0.9944528192281723, + "num_tokens": 399642443.0, + "step": 3352 + }, + { + "entropy": 0.6351944133639336, + "epoch": 7.63872255489022, + "grad_norm": 0.453125, + "learning_rate": 7.262501199388827e-07, + "loss": 0.0131, + "mean_token_accuracy": 0.9959475696086884, + "num_tokens": 399761914.0, + "step": 3353 + }, + { + "entropy": 0.6358554735779762, + "epoch": 7.64100370687197, + "grad_norm": 0.498046875, + "learning_rate": 7.249233467174965e-07, + "loss": 0.0169, + "mean_token_accuracy": 0.9942122772336006, + "num_tokens": 399880422.0, + "step": 3354 + }, + { + "entropy": 0.6318546012043953, + "epoch": 7.643284858853721, + "grad_norm": 0.41796875, + "learning_rate": 7.235975809953491e-07, + "loss": 0.0093, + "mean_token_accuracy": 0.9971444308757782, + "num_tokens": 399999896.0, + "step": 3355 + }, + { + "entropy": 0.6346433386206627, + "epoch": 7.645566010835472, + "grad_norm": 0.470703125, + "learning_rate": 7.222728235249196e-07, + "loss": 0.0138, + "mean_token_accuracy": 0.9957666546106339, + "num_tokens": 400119418.0, + "step": 3356 + }, + { + "entropy": 0.6383068040013313, + "epoch": 7.647847162817222, + "grad_norm": 0.47265625, + "learning_rate": 7.209490750581152e-07, + "loss": 0.0134, + "mean_token_accuracy": 0.9950955137610435, + "num_tokens": 400238679.0, + "step": 3357 + }, + { + "entropy": 0.635070689022541, + "epoch": 7.650128314798973, + "grad_norm": 0.6015625, + "learning_rate": 7.196263363462699e-07, + "loss": 0.0198, + "mean_token_accuracy": 0.992848314344883, + "num_tokens": 400357808.0, + "step": 3358 + }, + { + "entropy": 0.6287651136517525, + "epoch": 7.652409466780725, + "grad_norm": 0.443359375, + "learning_rate": 7.183046081401454e-07, + "loss": 0.0123, + "mean_token_accuracy": 0.9951979443430901, + "num_tokens": 400476978.0, + "step": 3359 + }, + { + "entropy": 0.6345900967717171, + "epoch": 7.654690618762475, + "grad_norm": 0.62109375, + "learning_rate": 7.169838911899276e-07, + "loss": 0.0234, + "mean_token_accuracy": 0.9920513778924942, + "num_tokens": 400596570.0, + "step": 3360 + }, + { + "entropy": 0.6335230022668839, + "epoch": 7.656971770744226, + "grad_norm": 0.609375, + "learning_rate": 7.156641862452316e-07, + "loss": 0.0188, + "mean_token_accuracy": 0.9948164820671082, + "num_tokens": 400715436.0, + "step": 3361 + }, + { + "entropy": 0.638471432030201, + "epoch": 7.659252922725977, + "grad_norm": 0.65625, + "learning_rate": 7.143454940550948e-07, + "loss": 0.0133, + "mean_token_accuracy": 0.9958316758275032, + "num_tokens": 400834816.0, + "step": 3362 + }, + { + "entropy": 0.6324852481484413, + "epoch": 7.661534074707728, + "grad_norm": 0.53515625, + "learning_rate": 7.13027815367982e-07, + "loss": 0.0176, + "mean_token_accuracy": 0.9943631514906883, + "num_tokens": 400953811.0, + "step": 3363 + }, + { + "entropy": 0.6350883319973946, + "epoch": 7.663815226689478, + "grad_norm": 0.375, + "learning_rate": 7.117111509317823e-07, + "loss": 0.0104, + "mean_token_accuracy": 0.9970263913273811, + "num_tokens": 401073233.0, + "step": 3364 + }, + { + "entropy": 0.636774331331253, + "epoch": 7.666096378671229, + "grad_norm": 0.5703125, + "learning_rate": 7.103955014938099e-07, + "loss": 0.0203, + "mean_token_accuracy": 0.9919825121760368, + "num_tokens": 401192484.0, + "step": 3365 + }, + { + "entropy": 0.6282460913062096, + "epoch": 7.66837753065298, + "grad_norm": 0.53515625, + "learning_rate": 7.090808678008005e-07, + "loss": 0.014, + "mean_token_accuracy": 0.9951673969626427, + "num_tokens": 401311199.0, + "step": 3366 + }, + { + "entropy": 0.628217488527298, + "epoch": 7.6706586826347305, + "grad_norm": 0.65625, + "learning_rate": 7.077672505989155e-07, + "loss": 0.0238, + "mean_token_accuracy": 0.9935095682740211, + "num_tokens": 401430971.0, + "step": 3367 + }, + { + "entropy": 0.6346548646688461, + "epoch": 7.672939834616481, + "grad_norm": 0.44921875, + "learning_rate": 7.064546506337386e-07, + "loss": 0.0153, + "mean_token_accuracy": 0.9944638907909393, + "num_tokens": 401550364.0, + "step": 3368 + }, + { + "entropy": 0.63689224421978, + "epoch": 7.675220986598232, + "grad_norm": 0.62890625, + "learning_rate": 7.051430686502764e-07, + "loss": 0.0201, + "mean_token_accuracy": 0.9935038834810257, + "num_tokens": 401670536.0, + "step": 3369 + }, + { + "entropy": 0.6370474547147751, + "epoch": 7.677502138579983, + "grad_norm": 0.47265625, + "learning_rate": 7.038325053929582e-07, + "loss": 0.013, + "mean_token_accuracy": 0.9953282922506332, + "num_tokens": 401789973.0, + "step": 3370 + }, + { + "entropy": 0.6350683569908142, + "epoch": 7.679783290561733, + "grad_norm": 0.52734375, + "learning_rate": 7.025229616056326e-07, + "loss": 0.0103, + "mean_token_accuracy": 0.9966486617922783, + "num_tokens": 401908696.0, + "step": 3371 + }, + { + "entropy": 0.6339009255170822, + "epoch": 7.682064442543484, + "grad_norm": 0.61328125, + "learning_rate": 7.012144380315724e-07, + "loss": 0.0186, + "mean_token_accuracy": 0.9931662455201149, + "num_tokens": 402027684.0, + "step": 3372 + }, + { + "entropy": 0.6344879791140556, + "epoch": 7.684345594525235, + "grad_norm": 0.4765625, + "learning_rate": 6.999069354134703e-07, + "loss": 0.0168, + "mean_token_accuracy": 0.994738832116127, + "num_tokens": 402147972.0, + "step": 3373 + }, + { + "entropy": 0.6361728012561798, + "epoch": 7.686626746506986, + "grad_norm": 0.58984375, + "learning_rate": 6.986004544934394e-07, + "loss": 0.0169, + "mean_token_accuracy": 0.9937233999371529, + "num_tokens": 402267151.0, + "step": 3374 + }, + { + "entropy": 0.6346034184098244, + "epoch": 7.688907898488737, + "grad_norm": 0.5390625, + "learning_rate": 6.972949960130135e-07, + "loss": 0.0167, + "mean_token_accuracy": 0.9948048889636993, + "num_tokens": 402386568.0, + "step": 3375 + }, + { + "entropy": 0.6367208734154701, + "epoch": 7.691189050470488, + "grad_norm": 0.453125, + "learning_rate": 6.959905607131457e-07, + "loss": 0.0107, + "mean_token_accuracy": 0.9962816163897514, + "num_tokens": 402505557.0, + "step": 3376 + }, + { + "entropy": 0.6347191035747528, + "epoch": 7.693470202452239, + "grad_norm": 0.52734375, + "learning_rate": 6.946871493342072e-07, + "loss": 0.0192, + "mean_token_accuracy": 0.9932654872536659, + "num_tokens": 402625318.0, + "step": 3377 + }, + { + "entropy": 0.6328184753656387, + "epoch": 7.695751354433989, + "grad_norm": 0.44140625, + "learning_rate": 6.933847626159898e-07, + "loss": 0.0104, + "mean_token_accuracy": 0.9957331642508507, + "num_tokens": 402744688.0, + "step": 3378 + }, + { + "entropy": 0.6352939456701279, + "epoch": 7.69803250641574, + "grad_norm": 0.76953125, + "learning_rate": 6.920834012977032e-07, + "loss": 0.0233, + "mean_token_accuracy": 0.9920494630932808, + "num_tokens": 402863774.0, + "step": 3379 + }, + { + "entropy": 0.6310678198933601, + "epoch": 7.700313658397491, + "grad_norm": 0.50390625, + "learning_rate": 6.907830661179757e-07, + "loss": 0.0224, + "mean_token_accuracy": 0.9946008920669556, + "num_tokens": 402982903.0, + "step": 3380 + }, + { + "entropy": 0.6376622915267944, + "epoch": 7.702594810379241, + "grad_norm": 0.435546875, + "learning_rate": 6.894837578148505e-07, + "loss": 0.0122, + "mean_token_accuracy": 0.9959247633814812, + "num_tokens": 403103274.0, + "step": 3381 + }, + { + "entropy": 0.6274687051773071, + "epoch": 7.704875962360992, + "grad_norm": 0.478515625, + "learning_rate": 6.881854771257912e-07, + "loss": 0.0174, + "mean_token_accuracy": 0.9952769055962563, + "num_tokens": 403222422.0, + "step": 3382 + }, + { + "entropy": 0.6313759237527847, + "epoch": 7.707157114342743, + "grad_norm": 0.6875, + "learning_rate": 6.868882247876776e-07, + "loss": 0.0184, + "mean_token_accuracy": 0.992721900343895, + "num_tokens": 403341297.0, + "step": 3383 + }, + { + "entropy": 0.6328517347574234, + "epoch": 7.709438266324494, + "grad_norm": 0.54296875, + "learning_rate": 6.855920015368032e-07, + "loss": 0.0145, + "mean_token_accuracy": 0.9963301494717598, + "num_tokens": 403460408.0, + "step": 3384 + }, + { + "entropy": 0.6313546746969223, + "epoch": 7.711719418306244, + "grad_norm": 0.51953125, + "learning_rate": 6.8429680810888e-07, + "loss": 0.0135, + "mean_token_accuracy": 0.9966804310679436, + "num_tokens": 403579222.0, + "step": 3385 + }, + { + "entropy": 0.635981097817421, + "epoch": 7.714000570287995, + "grad_norm": 0.5078125, + "learning_rate": 6.830026452390354e-07, + "loss": 0.0146, + "mean_token_accuracy": 0.9955038651823997, + "num_tokens": 403698888.0, + "step": 3386 + }, + { + "entropy": 0.6356370002031326, + "epoch": 7.716281722269747, + "grad_norm": 0.5234375, + "learning_rate": 6.817095136618113e-07, + "loss": 0.0138, + "mean_token_accuracy": 0.9954708069562912, + "num_tokens": 403818437.0, + "step": 3387 + }, + { + "entropy": 0.6412869691848755, + "epoch": 7.718562874251497, + "grad_norm": 0.60546875, + "learning_rate": 6.804174141111631e-07, + "loss": 0.021, + "mean_token_accuracy": 0.9936181902885437, + "num_tokens": 403938458.0, + "step": 3388 + }, + { + "entropy": 0.6328496634960175, + "epoch": 7.720844026233248, + "grad_norm": 0.546875, + "learning_rate": 6.791263473204624e-07, + "loss": 0.0142, + "mean_token_accuracy": 0.9959841370582581, + "num_tokens": 404057665.0, + "step": 3389 + }, + { + "entropy": 0.633496955037117, + "epoch": 7.723125178214999, + "grad_norm": 0.58203125, + "learning_rate": 6.778363140224933e-07, + "loss": 0.0189, + "mean_token_accuracy": 0.9933216646313667, + "num_tokens": 404177569.0, + "step": 3390 + }, + { + "entropy": 0.6353738456964493, + "epoch": 7.7254063301967495, + "grad_norm": 0.63671875, + "learning_rate": 6.765473149494545e-07, + "loss": 0.0175, + "mean_token_accuracy": 0.9931190833449364, + "num_tokens": 404297619.0, + "step": 3391 + }, + { + "entropy": 0.6345236450433731, + "epoch": 7.7276874821785, + "grad_norm": 0.5078125, + "learning_rate": 6.752593508329572e-07, + "loss": 0.0153, + "mean_token_accuracy": 0.9954070001840591, + "num_tokens": 404417194.0, + "step": 3392 + }, + { + "entropy": 0.639557734131813, + "epoch": 7.729968634160251, + "grad_norm": 0.58203125, + "learning_rate": 6.739724224040236e-07, + "loss": 0.0212, + "mean_token_accuracy": 0.9942702725529671, + "num_tokens": 404537093.0, + "step": 3393 + }, + { + "entropy": 0.6304379180073738, + "epoch": 7.732249786142002, + "grad_norm": 0.7265625, + "learning_rate": 6.726865303930905e-07, + "loss": 0.0198, + "mean_token_accuracy": 0.99413101375103, + "num_tokens": 404656240.0, + "step": 3394 + }, + { + "entropy": 0.6367425546050072, + "epoch": 7.734530938123752, + "grad_norm": 0.54296875, + "learning_rate": 6.714016755300048e-07, + "loss": 0.0179, + "mean_token_accuracy": 0.995092898607254, + "num_tokens": 404775486.0, + "step": 3395 + }, + { + "entropy": 0.6365671902894974, + "epoch": 7.736812090105503, + "grad_norm": 0.49609375, + "learning_rate": 6.701178585440257e-07, + "loss": 0.0157, + "mean_token_accuracy": 0.9945843145251274, + "num_tokens": 404895263.0, + "step": 3396 + }, + { + "entropy": 0.6361879706382751, + "epoch": 7.739093242087254, + "grad_norm": 0.470703125, + "learning_rate": 6.688350801638235e-07, + "loss": 0.0175, + "mean_token_accuracy": 0.9947721511125565, + "num_tokens": 405014339.0, + "step": 3397 + }, + { + "entropy": 0.6343268305063248, + "epoch": 7.7413743940690045, + "grad_norm": 0.5625, + "learning_rate": 6.67553341117477e-07, + "loss": 0.0184, + "mean_token_accuracy": 0.9932208508253098, + "num_tokens": 405134033.0, + "step": 3398 + }, + { + "entropy": 0.6339136436581612, + "epoch": 7.743655546050755, + "grad_norm": 0.62890625, + "learning_rate": 6.662726421324775e-07, + "loss": 0.0138, + "mean_token_accuracy": 0.9953488931059837, + "num_tokens": 405253453.0, + "step": 3399 + }, + { + "entropy": 0.6365788877010345, + "epoch": 7.745936698032507, + "grad_norm": 0.51953125, + "learning_rate": 6.649929839357247e-07, + "loss": 0.0151, + "mean_token_accuracy": 0.9965000301599503, + "num_tokens": 405373160.0, + "step": 3400 + }, + { + "entropy": 0.6335291936993599, + "epoch": 7.748217850014258, + "grad_norm": 0.51953125, + "learning_rate": 6.637143672535282e-07, + "loss": 0.0157, + "mean_token_accuracy": 0.9952194765210152, + "num_tokens": 405491805.0, + "step": 3401 + }, + { + "entropy": 0.6326406747102737, + "epoch": 7.750499001996008, + "grad_norm": 0.55859375, + "learning_rate": 6.624367928116066e-07, + "loss": 0.0185, + "mean_token_accuracy": 0.9949883371591568, + "num_tokens": 405610979.0, + "step": 3402 + }, + { + "entropy": 0.6375214830040932, + "epoch": 7.752780153977759, + "grad_norm": 0.6328125, + "learning_rate": 6.611602613350854e-07, + "loss": 0.018, + "mean_token_accuracy": 0.9930341392755508, + "num_tokens": 405730765.0, + "step": 3403 + }, + { + "entropy": 0.6309774294495583, + "epoch": 7.75506130595951, + "grad_norm": 0.455078125, + "learning_rate": 6.598847735485001e-07, + "loss": 0.0155, + "mean_token_accuracy": 0.9956400021910667, + "num_tokens": 405850224.0, + "step": 3404 + }, + { + "entropy": 0.6405323818325996, + "epoch": 7.75734245794126, + "grad_norm": 0.466796875, + "learning_rate": 6.586103301757918e-07, + "loss": 0.0129, + "mean_token_accuracy": 0.9957956299185753, + "num_tokens": 405969469.0, + "step": 3405 + }, + { + "entropy": 0.6332292184233665, + "epoch": 7.759623609923011, + "grad_norm": 0.52734375, + "learning_rate": 6.573369319403108e-07, + "loss": 0.0147, + "mean_token_accuracy": 0.9963762164115906, + "num_tokens": 406089360.0, + "step": 3406 + }, + { + "entropy": 0.6308197230100632, + "epoch": 7.761904761904762, + "grad_norm": 0.6640625, + "learning_rate": 6.560645795648132e-07, + "loss": 0.022, + "mean_token_accuracy": 0.9929729774594307, + "num_tokens": 406208676.0, + "step": 3407 + }, + { + "entropy": 0.6336059793829918, + "epoch": 7.764185913886513, + "grad_norm": 0.56640625, + "learning_rate": 6.547932737714624e-07, + "loss": 0.0209, + "mean_token_accuracy": 0.9942584484815598, + "num_tokens": 406328723.0, + "step": 3408 + }, + { + "entropy": 0.6349048465490341, + "epoch": 7.766467065868263, + "grad_norm": 0.435546875, + "learning_rate": 6.535230152818256e-07, + "loss": 0.0143, + "mean_token_accuracy": 0.9957463443279266, + "num_tokens": 406448164.0, + "step": 3409 + }, + { + "entropy": 0.6361136063933372, + "epoch": 7.768748217850014, + "grad_norm": 0.4453125, + "learning_rate": 6.522538048168777e-07, + "loss": 0.0161, + "mean_token_accuracy": 0.994721531867981, + "num_tokens": 406567614.0, + "step": 3410 + }, + { + "entropy": 0.6309549957513809, + "epoch": 7.771029369831765, + "grad_norm": 0.59765625, + "learning_rate": 6.509856430969982e-07, + "loss": 0.0184, + "mean_token_accuracy": 0.994589626789093, + "num_tokens": 406686495.0, + "step": 3411 + }, + { + "entropy": 0.6350840553641319, + "epoch": 7.7733105218135155, + "grad_norm": 0.63671875, + "learning_rate": 6.49718530841971e-07, + "loss": 0.0218, + "mean_token_accuracy": 0.9937118664383888, + "num_tokens": 406806167.0, + "step": 3412 + }, + { + "entropy": 0.6294358745217323, + "epoch": 7.775591673795267, + "grad_norm": 0.404296875, + "learning_rate": 6.484524687709853e-07, + "loss": 0.0111, + "mean_token_accuracy": 0.9968230649828911, + "num_tokens": 406925169.0, + "step": 3413 + }, + { + "entropy": 0.6322370320558548, + "epoch": 7.777872825777018, + "grad_norm": 0.482421875, + "learning_rate": 6.471874576026321e-07, + "loss": 0.0139, + "mean_token_accuracy": 0.9940994754433632, + "num_tokens": 407044399.0, + "step": 3414 + }, + { + "entropy": 0.6319432705640793, + "epoch": 7.7801539777587685, + "grad_norm": 0.5546875, + "learning_rate": 6.459234980549081e-07, + "loss": 0.0184, + "mean_token_accuracy": 0.9929343312978745, + "num_tokens": 407163200.0, + "step": 3415 + }, + { + "entropy": 0.6358423084020615, + "epoch": 7.782435129740519, + "grad_norm": 0.43359375, + "learning_rate": 6.446605908452122e-07, + "loss": 0.0147, + "mean_token_accuracy": 0.9951678663492203, + "num_tokens": 407282337.0, + "step": 3416 + }, + { + "entropy": 0.6378575935959816, + "epoch": 7.78471628172227, + "grad_norm": 0.640625, + "learning_rate": 6.433987366903461e-07, + "loss": 0.0161, + "mean_token_accuracy": 0.993573822081089, + "num_tokens": 407401877.0, + "step": 3417 + }, + { + "entropy": 0.6382683590054512, + "epoch": 7.786997433704021, + "grad_norm": 0.51171875, + "learning_rate": 6.421379363065142e-07, + "loss": 0.0185, + "mean_token_accuracy": 0.9930720031261444, + "num_tokens": 407521998.0, + "step": 3418 + }, + { + "entropy": 0.6370548009872437, + "epoch": 7.789278585685771, + "grad_norm": 0.4296875, + "learning_rate": 6.408781904093228e-07, + "loss": 0.0127, + "mean_token_accuracy": 0.9952062889933586, + "num_tokens": 407640810.0, + "step": 3419 + }, + { + "entropy": 0.6335834562778473, + "epoch": 7.791559737667522, + "grad_norm": 0.6484375, + "learning_rate": 6.39619499713778e-07, + "loss": 0.0153, + "mean_token_accuracy": 0.9944994896650314, + "num_tokens": 407760382.0, + "step": 3420 + }, + { + "entropy": 0.6313471049070358, + "epoch": 7.793840889649273, + "grad_norm": 0.56640625, + "learning_rate": 6.383618649342894e-07, + "loss": 0.0201, + "mean_token_accuracy": 0.9937909990549088, + "num_tokens": 407878754.0, + "step": 3421 + }, + { + "entropy": 0.6353915631771088, + "epoch": 7.7961220416310235, + "grad_norm": 0.474609375, + "learning_rate": 6.371052867846658e-07, + "loss": 0.0118, + "mean_token_accuracy": 0.9955379217863083, + "num_tokens": 407998029.0, + "step": 3422 + }, + { + "entropy": 0.6352147981524467, + "epoch": 7.798403193612774, + "grad_norm": 0.51171875, + "learning_rate": 6.358497659781177e-07, + "loss": 0.0145, + "mean_token_accuracy": 0.9952563792467117, + "num_tokens": 408117232.0, + "step": 3423 + }, + { + "entropy": 0.631833128631115, + "epoch": 7.800684345594525, + "grad_norm": 0.40625, + "learning_rate": 6.345953032272525e-07, + "loss": 0.0115, + "mean_token_accuracy": 0.9971388056874275, + "num_tokens": 408236758.0, + "step": 3424 + }, + { + "entropy": 0.6348193734884262, + "epoch": 7.802965497576276, + "grad_norm": 0.419921875, + "learning_rate": 6.333418992440804e-07, + "loss": 0.0128, + "mean_token_accuracy": 0.9967619329690933, + "num_tokens": 408356140.0, + "step": 3425 + }, + { + "entropy": 0.6368216052651405, + "epoch": 7.805246649558027, + "grad_norm": 0.484375, + "learning_rate": 6.3208955474001e-07, + "loss": 0.0139, + "mean_token_accuracy": 0.9947201013565063, + "num_tokens": 408475679.0, + "step": 3426 + }, + { + "entropy": 0.6294946670532227, + "epoch": 7.807527801539777, + "grad_norm": 0.5546875, + "learning_rate": 6.308382704258459e-07, + "loss": 0.0166, + "mean_token_accuracy": 0.9950311928987503, + "num_tokens": 408595000.0, + "step": 3427 + }, + { + "entropy": 0.6416565775871277, + "epoch": 7.809808953521529, + "grad_norm": 0.6015625, + "learning_rate": 6.29588047011794e-07, + "loss": 0.0176, + "mean_token_accuracy": 0.9938788041472435, + "num_tokens": 408714637.0, + "step": 3428 + }, + { + "entropy": 0.6321659311652184, + "epoch": 7.812090105503279, + "grad_norm": 0.447265625, + "learning_rate": 6.283388852074576e-07, + "loss": 0.0127, + "mean_token_accuracy": 0.9957041889429092, + "num_tokens": 408834036.0, + "step": 3429 + }, + { + "entropy": 0.639438733458519, + "epoch": 7.81437125748503, + "grad_norm": 0.63671875, + "learning_rate": 6.270907857218356e-07, + "loss": 0.018, + "mean_token_accuracy": 0.9937725961208344, + "num_tokens": 408953633.0, + "step": 3430 + }, + { + "entropy": 0.6344861835241318, + "epoch": 7.816652409466781, + "grad_norm": 0.482421875, + "learning_rate": 6.258437492633254e-07, + "loss": 0.0148, + "mean_token_accuracy": 0.9961202517151833, + "num_tokens": 409072813.0, + "step": 3431 + }, + { + "entropy": 0.6336042582988739, + "epoch": 7.818933561448532, + "grad_norm": 0.46484375, + "learning_rate": 6.245977765397216e-07, + "loss": 0.0166, + "mean_token_accuracy": 0.9948833286762238, + "num_tokens": 409192422.0, + "step": 3432 + }, + { + "entropy": 0.6366801410913467, + "epoch": 7.821214713430282, + "grad_norm": 0.47265625, + "learning_rate": 6.233528682582132e-07, + "loss": 0.0122, + "mean_token_accuracy": 0.9964569658041, + "num_tokens": 409311681.0, + "step": 3433 + }, + { + "entropy": 0.6296543478965759, + "epoch": 7.823495865412033, + "grad_norm": 0.546875, + "learning_rate": 6.221090251253872e-07, + "loss": 0.019, + "mean_token_accuracy": 0.9949755966663361, + "num_tokens": 409431959.0, + "step": 3434 + }, + { + "entropy": 0.6330264210700989, + "epoch": 7.825777017393784, + "grad_norm": 0.5546875, + "learning_rate": 6.208662478472249e-07, + "loss": 0.0191, + "mean_token_accuracy": 0.992968462407589, + "num_tokens": 409551317.0, + "step": 3435 + }, + { + "entropy": 0.6322203204035759, + "epoch": 7.8280581693755344, + "grad_norm": 0.70703125, + "learning_rate": 6.196245371291015e-07, + "loss": 0.0238, + "mean_token_accuracy": 0.9924449697136879, + "num_tokens": 409670733.0, + "step": 3436 + }, + { + "entropy": 0.6321752443909645, + "epoch": 7.830339321357285, + "grad_norm": 0.4921875, + "learning_rate": 6.183838936757891e-07, + "loss": 0.0186, + "mean_token_accuracy": 0.9957587569952011, + "num_tokens": 409790509.0, + "step": 3437 + }, + { + "entropy": 0.63224658370018, + "epoch": 7.832620473339036, + "grad_norm": 0.478515625, + "learning_rate": 6.171443181914524e-07, + "loss": 0.0197, + "mean_token_accuracy": 0.9943910911679268, + "num_tokens": 409910705.0, + "step": 3438 + }, + { + "entropy": 0.6364821344614029, + "epoch": 7.8349016253207875, + "grad_norm": 0.61328125, + "learning_rate": 6.159058113796507e-07, + "loss": 0.0178, + "mean_token_accuracy": 0.9942322298884392, + "num_tokens": 410030238.0, + "step": 3439 + }, + { + "entropy": 0.6368167102336884, + "epoch": 7.837182777302537, + "grad_norm": 0.5390625, + "learning_rate": 6.146683739433374e-07, + "loss": 0.019, + "mean_token_accuracy": 0.9932166039943695, + "num_tokens": 410149249.0, + "step": 3440 + }, + { + "entropy": 0.6337881907820702, + "epoch": 7.839463929284289, + "grad_norm": 0.39453125, + "learning_rate": 6.134320065848564e-07, + "loss": 0.0108, + "mean_token_accuracy": 0.9966397807002068, + "num_tokens": 410268658.0, + "step": 3441 + }, + { + "entropy": 0.6332950592041016, + "epoch": 7.84174508126604, + "grad_norm": 0.5625, + "learning_rate": 6.121967100059473e-07, + "loss": 0.022, + "mean_token_accuracy": 0.9927376955747604, + "num_tokens": 410387599.0, + "step": 3442 + }, + { + "entropy": 0.6386866047978401, + "epoch": 7.84402623324779, + "grad_norm": 0.62109375, + "learning_rate": 6.109624849077397e-07, + "loss": 0.0207, + "mean_token_accuracy": 0.9947901144623756, + "num_tokens": 410506899.0, + "step": 3443 + }, + { + "entropy": 0.6314909383654594, + "epoch": 7.846307385229541, + "grad_norm": 0.5234375, + "learning_rate": 6.097293319907566e-07, + "loss": 0.0178, + "mean_token_accuracy": 0.994565524160862, + "num_tokens": 410625949.0, + "step": 3444 + }, + { + "entropy": 0.6397695019841194, + "epoch": 7.848588537211292, + "grad_norm": 0.65234375, + "learning_rate": 6.084972519549123e-07, + "loss": 0.0233, + "mean_token_accuracy": 0.9932422265410423, + "num_tokens": 410744876.0, + "step": 3445 + }, + { + "entropy": 0.6380265727639198, + "epoch": 7.8508696891930425, + "grad_norm": 0.58984375, + "learning_rate": 6.072662454995101e-07, + "loss": 0.0187, + "mean_token_accuracy": 0.9937006384134293, + "num_tokens": 410864156.0, + "step": 3446 + }, + { + "entropy": 0.6371433138847351, + "epoch": 7.853150841174793, + "grad_norm": 0.60546875, + "learning_rate": 6.060363133232472e-07, + "loss": 0.022, + "mean_token_accuracy": 0.9929639771580696, + "num_tokens": 410984092.0, + "step": 3447 + }, + { + "entropy": 0.6338736414909363, + "epoch": 7.855431993156544, + "grad_norm": 0.419921875, + "learning_rate": 6.048074561242076e-07, + "loss": 0.0112, + "mean_token_accuracy": 0.9962293282151222, + "num_tokens": 411103938.0, + "step": 3448 + }, + { + "entropy": 0.6348715722560883, + "epoch": 7.857713145138295, + "grad_norm": 0.42578125, + "learning_rate": 6.035796745998679e-07, + "loss": 0.0153, + "mean_token_accuracy": 0.9956468641757965, + "num_tokens": 411223611.0, + "step": 3449 + }, + { + "entropy": 0.632002130150795, + "epoch": 7.859994297120045, + "grad_norm": 0.4921875, + "learning_rate": 6.023529694470931e-07, + "loss": 0.0085, + "mean_token_accuracy": 0.9966758415102959, + "num_tokens": 411342608.0, + "step": 3450 + }, + { + "entropy": 0.6399052515625954, + "epoch": 7.862275449101796, + "grad_norm": 0.7890625, + "learning_rate": 6.01127341362138e-07, + "loss": 0.0227, + "mean_token_accuracy": 0.9940595105290413, + "num_tokens": 411461647.0, + "step": 3451 + }, + { + "entropy": 0.6338033676147461, + "epoch": 7.864556601083547, + "grad_norm": 0.515625, + "learning_rate": 5.999027910406441e-07, + "loss": 0.0173, + "mean_token_accuracy": 0.995192788541317, + "num_tokens": 411581003.0, + "step": 3452 + }, + { + "entropy": 0.6314221918582916, + "epoch": 7.8668377530652975, + "grad_norm": 0.65234375, + "learning_rate": 5.98679319177643e-07, + "loss": 0.0195, + "mean_token_accuracy": 0.9932462573051453, + "num_tokens": 411700217.0, + "step": 3453 + }, + { + "entropy": 0.6378269866108894, + "epoch": 7.869118905047049, + "grad_norm": 0.4453125, + "learning_rate": 5.974569264675542e-07, + "loss": 0.0161, + "mean_token_accuracy": 0.9940430000424385, + "num_tokens": 411819600.0, + "step": 3454 + }, + { + "entropy": 0.6365525275468826, + "epoch": 7.8714000570288, + "grad_norm": 0.388671875, + "learning_rate": 5.962356136041835e-07, + "loss": 0.011, + "mean_token_accuracy": 0.9966035410761833, + "num_tokens": 411938494.0, + "step": 3455 + }, + { + "entropy": 0.636716902256012, + "epoch": 7.873681209010551, + "grad_norm": 0.51953125, + "learning_rate": 5.95015381280726e-07, + "loss": 0.0141, + "mean_token_accuracy": 0.9952518716454506, + "num_tokens": 412058214.0, + "step": 3456 + }, + { + "entropy": 0.6379837244749069, + "epoch": 7.875962360992301, + "grad_norm": 0.52734375, + "learning_rate": 5.937962301897604e-07, + "loss": 0.0221, + "mean_token_accuracy": 0.9930263608694077, + "num_tokens": 412178385.0, + "step": 3457 + }, + { + "entropy": 0.6328062862157822, + "epoch": 7.878243512974052, + "grad_norm": 0.71484375, + "learning_rate": 5.925781610232534e-07, + "loss": 0.0206, + "mean_token_accuracy": 0.9939941912889481, + "num_tokens": 412298108.0, + "step": 3458 + }, + { + "entropy": 0.6357534229755402, + "epoch": 7.880524664955803, + "grad_norm": 0.4765625, + "learning_rate": 5.913611744725584e-07, + "loss": 0.0151, + "mean_token_accuracy": 0.9949315711855888, + "num_tokens": 412417971.0, + "step": 3459 + }, + { + "entropy": 0.6376344114542007, + "epoch": 7.8828058169375534, + "grad_norm": 0.60546875, + "learning_rate": 5.901452712284128e-07, + "loss": 0.021, + "mean_token_accuracy": 0.994343563914299, + "num_tokens": 412537758.0, + "step": 3460 + }, + { + "entropy": 0.6329527720808983, + "epoch": 7.885086968919304, + "grad_norm": 0.55859375, + "learning_rate": 5.889304519809402e-07, + "loss": 0.0215, + "mean_token_accuracy": 0.9947630763053894, + "num_tokens": 412658028.0, + "step": 3461 + }, + { + "entropy": 0.6355074197053909, + "epoch": 7.887368120901055, + "grad_norm": 0.6328125, + "learning_rate": 5.877167174196491e-07, + "loss": 0.0149, + "mean_token_accuracy": 0.9954705387353897, + "num_tokens": 412777515.0, + "step": 3462 + }, + { + "entropy": 0.63480494171381, + "epoch": 7.889649272882806, + "grad_norm": 0.57421875, + "learning_rate": 5.865040682334303e-07, + "loss": 0.0173, + "mean_token_accuracy": 0.9938396513462067, + "num_tokens": 412896253.0, + "step": 3463 + }, + { + "entropy": 0.6330574452877045, + "epoch": 7.891930424864556, + "grad_norm": 0.46875, + "learning_rate": 5.852925051105609e-07, + "loss": 0.0161, + "mean_token_accuracy": 0.9943116307258606, + "num_tokens": 413015178.0, + "step": 3464 + }, + { + "entropy": 0.6333870217204094, + "epoch": 7.894211576846307, + "grad_norm": 0.5078125, + "learning_rate": 5.840820287387009e-07, + "loss": 0.0162, + "mean_token_accuracy": 0.993760421872139, + "num_tokens": 413134431.0, + "step": 3465 + }, + { + "entropy": 0.6333759427070618, + "epoch": 7.896492728828058, + "grad_norm": 0.74609375, + "learning_rate": 5.828726398048939e-07, + "loss": 0.0204, + "mean_token_accuracy": 0.9940017908811569, + "num_tokens": 413252902.0, + "step": 3466 + }, + { + "entropy": 0.6356987506151199, + "epoch": 7.898773880809809, + "grad_norm": 0.3828125, + "learning_rate": 5.816643389955642e-07, + "loss": 0.0128, + "mean_token_accuracy": 0.995942659676075, + "num_tokens": 413371810.0, + "step": 3467 + }, + { + "entropy": 0.6350524872541428, + "epoch": 7.90105503279156, + "grad_norm": 0.6484375, + "learning_rate": 5.804571269965206e-07, + "loss": 0.0167, + "mean_token_accuracy": 0.9935454577207565, + "num_tokens": 413491020.0, + "step": 3468 + }, + { + "entropy": 0.6377769857645035, + "epoch": 7.903336184773311, + "grad_norm": 0.75390625, + "learning_rate": 5.792510044929545e-07, + "loss": 0.0185, + "mean_token_accuracy": 0.9944734275341034, + "num_tokens": 413610144.0, + "step": 3469 + }, + { + "entropy": 0.6353226974606514, + "epoch": 7.9056173367550615, + "grad_norm": 0.54296875, + "learning_rate": 5.780459721694359e-07, + "loss": 0.0176, + "mean_token_accuracy": 0.9940384402871132, + "num_tokens": 413729382.0, + "step": 3470 + }, + { + "entropy": 0.6349055245518684, + "epoch": 7.907898488736812, + "grad_norm": 0.494140625, + "learning_rate": 5.768420307099188e-07, + "loss": 0.0153, + "mean_token_accuracy": 0.9954587444663048, + "num_tokens": 413848978.0, + "step": 3471 + }, + { + "entropy": 0.6339629143476486, + "epoch": 7.910179640718563, + "grad_norm": 0.52734375, + "learning_rate": 5.756391807977377e-07, + "loss": 0.0149, + "mean_token_accuracy": 0.993511751294136, + "num_tokens": 413968310.0, + "step": 3472 + }, + { + "entropy": 0.625716432929039, + "epoch": 7.912460792700314, + "grad_norm": 0.6015625, + "learning_rate": 5.744374231156056e-07, + "loss": 0.022, + "mean_token_accuracy": 0.9942318424582481, + "num_tokens": 414086896.0, + "step": 3473 + }, + { + "entropy": 0.6347675547003746, + "epoch": 7.914741944682064, + "grad_norm": 0.54296875, + "learning_rate": 5.732367583456177e-07, + "loss": 0.0163, + "mean_token_accuracy": 0.9950410276651382, + "num_tokens": 414206051.0, + "step": 3474 + }, + { + "entropy": 0.6366858556866646, + "epoch": 7.917023096663815, + "grad_norm": 0.462890625, + "learning_rate": 5.720371871692484e-07, + "loss": 0.0172, + "mean_token_accuracy": 0.9950160384178162, + "num_tokens": 414325410.0, + "step": 3475 + }, + { + "entropy": 0.6388410553336143, + "epoch": 7.919304248645566, + "grad_norm": 0.41015625, + "learning_rate": 5.708387102673507e-07, + "loss": 0.0113, + "mean_token_accuracy": 0.9963899105787277, + "num_tokens": 414444680.0, + "step": 3476 + }, + { + "entropy": 0.6342408284544945, + "epoch": 7.9215854006273165, + "grad_norm": 0.671875, + "learning_rate": 5.696413283201571e-07, + "loss": 0.0176, + "mean_token_accuracy": 0.9934411570429802, + "num_tokens": 414563986.0, + "step": 3477 + }, + { + "entropy": 0.634798564016819, + "epoch": 7.923866552609067, + "grad_norm": 0.388671875, + "learning_rate": 5.684450420072792e-07, + "loss": 0.0124, + "mean_token_accuracy": 0.9960343763232231, + "num_tokens": 414683508.0, + "step": 3478 + }, + { + "entropy": 0.6359194964170456, + "epoch": 7.926147704590818, + "grad_norm": 0.421875, + "learning_rate": 5.67249852007705e-07, + "loss": 0.0101, + "mean_token_accuracy": 0.9964630380272865, + "num_tokens": 414803368.0, + "step": 3479 + }, + { + "entropy": 0.6474344730377197, + "epoch": 7.92842885657257, + "grad_norm": 0.5390625, + "learning_rate": 5.660557589998014e-07, + "loss": 0.0172, + "mean_token_accuracy": 0.9945859536528587, + "num_tokens": 414923645.0, + "step": 3480 + }, + { + "entropy": 0.6305102780461311, + "epoch": 7.93071000855432, + "grad_norm": 0.63671875, + "learning_rate": 5.648627636613127e-07, + "loss": 0.0224, + "mean_token_accuracy": 0.9936139285564423, + "num_tokens": 415043170.0, + "step": 3481 + }, + { + "entropy": 0.6399151459336281, + "epoch": 7.932991160536071, + "grad_norm": 0.365234375, + "learning_rate": 5.636708666693599e-07, + "loss": 0.0121, + "mean_token_accuracy": 0.9966028928756714, + "num_tokens": 415163054.0, + "step": 3482 + }, + { + "entropy": 0.638505719602108, + "epoch": 7.935272312517822, + "grad_norm": 0.5625, + "learning_rate": 5.62480068700442e-07, + "loss": 0.0194, + "mean_token_accuracy": 0.9936505556106567, + "num_tokens": 415282160.0, + "step": 3483 + }, + { + "entropy": 0.6338230818510056, + "epoch": 7.937553464499572, + "grad_norm": 0.5, + "learning_rate": 5.612903704304309e-07, + "loss": 0.016, + "mean_token_accuracy": 0.9949586987495422, + "num_tokens": 415402148.0, + "step": 3484 + }, + { + "entropy": 0.6349026411771774, + "epoch": 7.939834616481323, + "grad_norm": 0.6171875, + "learning_rate": 5.601017725345772e-07, + "loss": 0.0161, + "mean_token_accuracy": 0.995113343000412, + "num_tokens": 415521965.0, + "step": 3485 + }, + { + "entropy": 0.6332695037126541, + "epoch": 7.942115768463074, + "grad_norm": 0.38671875, + "learning_rate": 5.589142756875065e-07, + "loss": 0.0142, + "mean_token_accuracy": 0.9959475174546242, + "num_tokens": 415640867.0, + "step": 3486 + }, + { + "entropy": 0.6376106813549995, + "epoch": 7.944396920444825, + "grad_norm": 0.6015625, + "learning_rate": 5.577278805632186e-07, + "loss": 0.0262, + "mean_token_accuracy": 0.9924160167574883, + "num_tokens": 415760518.0, + "step": 3487 + }, + { + "entropy": 0.6290957108139992, + "epoch": 7.946678072426575, + "grad_norm": 0.65234375, + "learning_rate": 5.565425878350895e-07, + "loss": 0.0262, + "mean_token_accuracy": 0.9925666749477386, + "num_tokens": 415879618.0, + "step": 3488 + }, + { + "entropy": 0.6365744471549988, + "epoch": 7.948959224408326, + "grad_norm": 0.4453125, + "learning_rate": 5.553583981758668e-07, + "loss": 0.0137, + "mean_token_accuracy": 0.9946264773607254, + "num_tokens": 415999108.0, + "step": 3489 + }, + { + "entropy": 0.6275183483958244, + "epoch": 7.951240376390077, + "grad_norm": 0.5625, + "learning_rate": 5.541753122576746e-07, + "loss": 0.0206, + "mean_token_accuracy": 0.9940836355090141, + "num_tokens": 416118359.0, + "step": 3490 + }, + { + "entropy": 0.6317881271243095, + "epoch": 7.9535215283718275, + "grad_norm": 0.4765625, + "learning_rate": 5.529933307520102e-07, + "loss": 0.01, + "mean_token_accuracy": 0.9968279153108597, + "num_tokens": 416237773.0, + "step": 3491 + }, + { + "entropy": 0.6317457482218742, + "epoch": 7.955802680353578, + "grad_norm": 0.462890625, + "learning_rate": 5.518124543297423e-07, + "loss": 0.0134, + "mean_token_accuracy": 0.9955916106700897, + "num_tokens": 416356833.0, + "step": 3492 + }, + { + "entropy": 0.6345725655555725, + "epoch": 7.95808383233533, + "grad_norm": 0.78125, + "learning_rate": 5.506326836611139e-07, + "loss": 0.0199, + "mean_token_accuracy": 0.9939513951539993, + "num_tokens": 416476666.0, + "step": 3493 + }, + { + "entropy": 0.6310638561844826, + "epoch": 7.9603649843170805, + "grad_norm": 0.46484375, + "learning_rate": 5.494540194157411e-07, + "loss": 0.0134, + "mean_token_accuracy": 0.9951734766364098, + "num_tokens": 416595532.0, + "step": 3494 + }, + { + "entropy": 0.6370683237910271, + "epoch": 7.962646136298831, + "grad_norm": 0.66015625, + "learning_rate": 5.482764622626094e-07, + "loss": 0.0202, + "mean_token_accuracy": 0.9938688054680824, + "num_tokens": 416714450.0, + "step": 3495 + }, + { + "entropy": 0.6354637295007706, + "epoch": 7.964927288280582, + "grad_norm": 0.4453125, + "learning_rate": 5.471000128700784e-07, + "loss": 0.0155, + "mean_token_accuracy": 0.9949403926730156, + "num_tokens": 416834277.0, + "step": 3496 + }, + { + "entropy": 0.6336948201060295, + "epoch": 7.967208440262333, + "grad_norm": 0.5546875, + "learning_rate": 5.459246719058778e-07, + "loss": 0.0161, + "mean_token_accuracy": 0.993808887898922, + "num_tokens": 416953676.0, + "step": 3497 + }, + { + "entropy": 0.6385904848575592, + "epoch": 7.969489592244083, + "grad_norm": 0.44140625, + "learning_rate": 5.447504400371084e-07, + "loss": 0.0141, + "mean_token_accuracy": 0.9952010735869408, + "num_tokens": 417072823.0, + "step": 3498 + }, + { + "entropy": 0.6310571283102036, + "epoch": 7.971770744225834, + "grad_norm": 0.45703125, + "learning_rate": 5.435773179302426e-07, + "loss": 0.0143, + "mean_token_accuracy": 0.9957203269004822, + "num_tokens": 417191661.0, + "step": 3499 + }, + { + "entropy": 0.6347384229302406, + "epoch": 7.974051896207585, + "grad_norm": 0.482421875, + "learning_rate": 5.4240530625112e-07, + "loss": 0.0135, + "mean_token_accuracy": 0.9950878769159317, + "num_tokens": 417312059.0, + "step": 3500 + }, + { + "entropy": 0.6359045132994652, + "epoch": 7.9763330481893355, + "grad_norm": 0.53125, + "learning_rate": 5.412344056649527e-07, + "loss": 0.0154, + "mean_token_accuracy": 0.9950065314769745, + "num_tokens": 417430155.0, + "step": 3501 + }, + { + "entropy": 0.634592205286026, + "epoch": 7.978614200171086, + "grad_norm": 0.58984375, + "learning_rate": 5.400646168363216e-07, + "loss": 0.0216, + "mean_token_accuracy": 0.9935535117983818, + "num_tokens": 417549809.0, + "step": 3502 + }, + { + "entropy": 0.6363679990172386, + "epoch": 7.980895352152837, + "grad_norm": 0.48046875, + "learning_rate": 5.388959404291757e-07, + "loss": 0.0138, + "mean_token_accuracy": 0.9956032335758209, + "num_tokens": 417668718.0, + "step": 3503 + }, + { + "entropy": 0.6409248411655426, + "epoch": 7.983176504134588, + "grad_norm": 0.71484375, + "learning_rate": 5.377283771068342e-07, + "loss": 0.0282, + "mean_token_accuracy": 0.9923953711986542, + "num_tokens": 417788092.0, + "step": 3504 + }, + { + "entropy": 0.634217232465744, + "epoch": 7.985457656116338, + "grad_norm": 0.466796875, + "learning_rate": 5.365619275319823e-07, + "loss": 0.0079, + "mean_token_accuracy": 0.9977793246507645, + "num_tokens": 417906956.0, + "step": 3505 + }, + { + "entropy": 0.6328699886798859, + "epoch": 7.98773880809809, + "grad_norm": 0.46484375, + "learning_rate": 5.353965923666743e-07, + "loss": 0.0157, + "mean_token_accuracy": 0.9955780878663063, + "num_tokens": 418026290.0, + "step": 3506 + }, + { + "entropy": 0.6305505856871605, + "epoch": 7.99001996007984, + "grad_norm": 0.61328125, + "learning_rate": 5.342323722723324e-07, + "loss": 0.0194, + "mean_token_accuracy": 0.9928989559412003, + "num_tokens": 418144988.0, + "step": 3507 + }, + { + "entropy": 0.6354324221611023, + "epoch": 7.992301112061591, + "grad_norm": 0.5234375, + "learning_rate": 5.330692679097457e-07, + "loss": 0.0164, + "mean_token_accuracy": 0.993457742035389, + "num_tokens": 418264817.0, + "step": 3508 + }, + { + "entropy": 0.6312143728137016, + "epoch": 7.994582264043342, + "grad_norm": 0.5390625, + "learning_rate": 5.319072799390693e-07, + "loss": 0.0193, + "mean_token_accuracy": 0.9929672554135323, + "num_tokens": 418384059.0, + "step": 3509 + }, + { + "entropy": 0.6362127140164375, + "epoch": 7.996863416025093, + "grad_norm": 0.56640625, + "learning_rate": 5.307464090198258e-07, + "loss": 0.0187, + "mean_token_accuracy": 0.9946375414729118, + "num_tokens": 418503882.0, + "step": 3510 + }, + { + "entropy": 0.6372129246592522, + "epoch": 7.999144568006844, + "grad_norm": 0.4453125, + "learning_rate": 5.295866558109023e-07, + "loss": 0.0159, + "mean_token_accuracy": 0.9962585270404816, + "num_tokens": 418624147.0, + "step": 3511 + }, + { + "entropy": 0.6363813281059265, + "epoch": 8.0, + "grad_norm": 1.3828125, + "learning_rate": 5.284280209705531e-07, + "loss": 0.0301, + "mean_token_accuracy": 0.9907250006993612, + "num_tokens": 418667824.0, + "step": 3512 + }, + { + "entropy": 0.6295195966959, + "epoch": 8.002281151981752, + "grad_norm": 0.62890625, + "learning_rate": 5.272705051563959e-07, + "loss": 0.0211, + "mean_token_accuracy": 0.9931184649467468, + "num_tokens": 418787392.0, + "step": 3513 + }, + { + "entropy": 0.6313819736242294, + "epoch": 8.004562303963501, + "grad_norm": 0.55859375, + "learning_rate": 5.261141090254149e-07, + "loss": 0.0189, + "mean_token_accuracy": 0.9930616691708565, + "num_tokens": 418906936.0, + "step": 3514 + }, + { + "entropy": 0.6318344697356224, + "epoch": 8.006843455945253, + "grad_norm": 0.52734375, + "learning_rate": 5.249588332339589e-07, + "loss": 0.0201, + "mean_token_accuracy": 0.9944902285933495, + "num_tokens": 419026586.0, + "step": 3515 + }, + { + "entropy": 0.6397242173552513, + "epoch": 8.009124607927003, + "grad_norm": 0.490234375, + "learning_rate": 5.238046784377388e-07, + "loss": 0.0146, + "mean_token_accuracy": 0.9948445335030556, + "num_tokens": 419145081.0, + "step": 3516 + }, + { + "entropy": 0.636722020804882, + "epoch": 8.011405759908754, + "grad_norm": 0.70703125, + "learning_rate": 5.226516452918315e-07, + "loss": 0.0186, + "mean_token_accuracy": 0.9939193576574326, + "num_tokens": 419264380.0, + "step": 3517 + }, + { + "entropy": 0.6340345144271851, + "epoch": 8.013686911890504, + "grad_norm": 0.51171875, + "learning_rate": 5.214997344506758e-07, + "loss": 0.0172, + "mean_token_accuracy": 0.9942486584186554, + "num_tokens": 419383235.0, + "step": 3518 + }, + { + "entropy": 0.6376915946602821, + "epoch": 8.015968063872256, + "grad_norm": 0.59765625, + "learning_rate": 5.203489465680747e-07, + "loss": 0.0182, + "mean_token_accuracy": 0.9920329824090004, + "num_tokens": 419502125.0, + "step": 3519 + }, + { + "entropy": 0.6411778852343559, + "epoch": 8.018249215854006, + "grad_norm": 0.48046875, + "learning_rate": 5.19199282297193e-07, + "loss": 0.019, + "mean_token_accuracy": 0.9939900636672974, + "num_tokens": 419622568.0, + "step": 3520 + }, + { + "epoch": 8.018249215854006, + "eval_entropy": 0.6343205102043007, + "eval_loss": 0.020490380004048347, + "eval_mean_token_accuracy": 0.9936303453300389, + "eval_num_tokens": 419622568.0, + "eval_runtime": 177.4849, + "eval_samples_per_second": 47.243, + "eval_steps_per_second": 1.482, + "step": 3520 + }, + { + "entropy": 0.6356026157736778, + "epoch": 8.020530367835757, + "grad_norm": 0.498046875, + "learning_rate": 5.180507422905585e-07, + "loss": 0.0185, + "mean_token_accuracy": 0.9929445460438728, + "num_tokens": 419741655.0, + "step": 3521 + }, + { + "entropy": 0.6350171566009521, + "epoch": 8.022811519817507, + "grad_norm": 0.474609375, + "learning_rate": 5.169033272000587e-07, + "loss": 0.0134, + "mean_token_accuracy": 0.9956403449177742, + "num_tokens": 419860606.0, + "step": 3522 + }, + { + "entropy": 0.629441075026989, + "epoch": 8.025092671799259, + "grad_norm": 0.46875, + "learning_rate": 5.157570376769452e-07, + "loss": 0.0194, + "mean_token_accuracy": 0.9948098659515381, + "num_tokens": 419979551.0, + "step": 3523 + }, + { + "entropy": 0.639604352414608, + "epoch": 8.027373823781009, + "grad_norm": 0.51171875, + "learning_rate": 5.146118743718301e-07, + "loss": 0.0113, + "mean_token_accuracy": 0.9959367737174034, + "num_tokens": 420100451.0, + "step": 3524 + }, + { + "entropy": 0.6355204433202744, + "epoch": 8.02965497576276, + "grad_norm": 0.50390625, + "learning_rate": 5.134678379346856e-07, + "loss": 0.0191, + "mean_token_accuracy": 0.9934121966362, + "num_tokens": 420219467.0, + "step": 3525 + }, + { + "entropy": 0.6324359029531479, + "epoch": 8.031936127744512, + "grad_norm": 0.5078125, + "learning_rate": 5.123249290148452e-07, + "loss": 0.0182, + "mean_token_accuracy": 0.9953311756253242, + "num_tokens": 420338925.0, + "step": 3526 + }, + { + "entropy": 0.6379518732428551, + "epoch": 8.034217279726262, + "grad_norm": 0.443359375, + "learning_rate": 5.111831482610011e-07, + "loss": 0.0114, + "mean_token_accuracy": 0.9959170296788216, + "num_tokens": 420458398.0, + "step": 3527 + }, + { + "entropy": 0.6306264773011208, + "epoch": 8.036498431708013, + "grad_norm": 0.53515625, + "learning_rate": 5.100424963212064e-07, + "loss": 0.0151, + "mean_token_accuracy": 0.9952834397554398, + "num_tokens": 420576954.0, + "step": 3528 + }, + { + "entropy": 0.633255161345005, + "epoch": 8.038779583689763, + "grad_norm": 0.58984375, + "learning_rate": 5.089029738428733e-07, + "loss": 0.0176, + "mean_token_accuracy": 0.993297316133976, + "num_tokens": 420696353.0, + "step": 3529 + }, + { + "entropy": 0.638888843357563, + "epoch": 8.041060735671515, + "grad_norm": 0.5390625, + "learning_rate": 5.077645814727725e-07, + "loss": 0.0186, + "mean_token_accuracy": 0.993000864982605, + "num_tokens": 420815376.0, + "step": 3530 + }, + { + "entropy": 0.6334759891033173, + "epoch": 8.043341887653265, + "grad_norm": 0.62109375, + "learning_rate": 5.066273198570343e-07, + "loss": 0.0191, + "mean_token_accuracy": 0.9945726692676544, + "num_tokens": 420933591.0, + "step": 3531 + }, + { + "entropy": 0.6376611888408661, + "epoch": 8.045623039635016, + "grad_norm": 0.52734375, + "learning_rate": 5.054911896411452e-07, + "loss": 0.0143, + "mean_token_accuracy": 0.9952151477336884, + "num_tokens": 421053491.0, + "step": 3532 + }, + { + "entropy": 0.6374474316835403, + "epoch": 8.047904191616766, + "grad_norm": 0.45703125, + "learning_rate": 5.043561914699513e-07, + "loss": 0.0147, + "mean_token_accuracy": 0.9944631010293961, + "num_tokens": 421173052.0, + "step": 3533 + }, + { + "entropy": 0.632668524980545, + "epoch": 8.050185343598518, + "grad_norm": 0.5859375, + "learning_rate": 5.032223259876565e-07, + "loss": 0.0144, + "mean_token_accuracy": 0.9944843500852585, + "num_tokens": 421292057.0, + "step": 3534 + }, + { + "entropy": 0.6319850459694862, + "epoch": 8.052466495580267, + "grad_norm": 0.486328125, + "learning_rate": 5.020895938378195e-07, + "loss": 0.0134, + "mean_token_accuracy": 0.9960121437907219, + "num_tokens": 421411790.0, + "step": 3535 + }, + { + "entropy": 0.6353977844119072, + "epoch": 8.054747647562019, + "grad_norm": 0.60546875, + "learning_rate": 5.009579956633578e-07, + "loss": 0.0224, + "mean_token_accuracy": 0.9934924095869064, + "num_tokens": 421531618.0, + "step": 3536 + }, + { + "entropy": 0.6385964751243591, + "epoch": 8.057028799543769, + "grad_norm": 0.5234375, + "learning_rate": 4.998275321065454e-07, + "loss": 0.0165, + "mean_token_accuracy": 0.9947995617985725, + "num_tokens": 421651269.0, + "step": 3537 + }, + { + "entropy": 0.6287341862916946, + "epoch": 8.05930995152552, + "grad_norm": 0.57421875, + "learning_rate": 4.986982038090104e-07, + "loss": 0.0195, + "mean_token_accuracy": 0.9945827573537827, + "num_tokens": 421769822.0, + "step": 3538 + }, + { + "entropy": 0.6297082751989365, + "epoch": 8.061591103507272, + "grad_norm": 0.435546875, + "learning_rate": 4.975700114117385e-07, + "loss": 0.0172, + "mean_token_accuracy": 0.9946072548627853, + "num_tokens": 421889243.0, + "step": 3539 + }, + { + "entropy": 0.6371785029768944, + "epoch": 8.063872255489022, + "grad_norm": 0.56640625, + "learning_rate": 4.964429555550693e-07, + "loss": 0.0171, + "mean_token_accuracy": 0.9950065538287163, + "num_tokens": 422008421.0, + "step": 3540 + }, + { + "entropy": 0.6351531893014908, + "epoch": 8.066153407470773, + "grad_norm": 0.5859375, + "learning_rate": 4.953170368786985e-07, + "loss": 0.0226, + "mean_token_accuracy": 0.9929912239313126, + "num_tokens": 422127835.0, + "step": 3541 + }, + { + "entropy": 0.6354996860027313, + "epoch": 8.068434559452523, + "grad_norm": 0.4609375, + "learning_rate": 4.941922560216764e-07, + "loss": 0.0172, + "mean_token_accuracy": 0.9952095746994019, + "num_tokens": 422247035.0, + "step": 3542 + }, + { + "entropy": 0.6341025680303574, + "epoch": 8.070715711434275, + "grad_norm": 0.49609375, + "learning_rate": 4.930686136224056e-07, + "loss": 0.0165, + "mean_token_accuracy": 0.9941012039780617, + "num_tokens": 422365839.0, + "step": 3543 + }, + { + "entropy": 0.6352396011352539, + "epoch": 8.072996863416025, + "grad_norm": 0.53515625, + "learning_rate": 4.91946110318644e-07, + "loss": 0.0162, + "mean_token_accuracy": 0.9951505213975906, + "num_tokens": 422485470.0, + "step": 3544 + }, + { + "entropy": 0.6345069259405136, + "epoch": 8.075278015397776, + "grad_norm": 0.52734375, + "learning_rate": 4.908247467475036e-07, + "loss": 0.0141, + "mean_token_accuracy": 0.9945297539234161, + "num_tokens": 422605649.0, + "step": 3545 + }, + { + "entropy": 0.6333846598863602, + "epoch": 8.077559167379526, + "grad_norm": 0.640625, + "learning_rate": 4.897045235454481e-07, + "loss": 0.0212, + "mean_token_accuracy": 0.9922196567058563, + "num_tokens": 422725107.0, + "step": 3546 + }, + { + "entropy": 0.6376156434416771, + "epoch": 8.079840319361278, + "grad_norm": 0.51171875, + "learning_rate": 4.885854413482955e-07, + "loss": 0.0138, + "mean_token_accuracy": 0.9964442253112793, + "num_tokens": 422844857.0, + "step": 3547 + }, + { + "entropy": 0.6363424956798553, + "epoch": 8.082121471343028, + "grad_norm": 0.60546875, + "learning_rate": 4.874675007912138e-07, + "loss": 0.0181, + "mean_token_accuracy": 0.9943671897053719, + "num_tokens": 422964705.0, + "step": 3548 + }, + { + "entropy": 0.6347638443112373, + "epoch": 8.08440262332478, + "grad_norm": 0.50390625, + "learning_rate": 4.863507025087255e-07, + "loss": 0.0197, + "mean_token_accuracy": 0.9943272471427917, + "num_tokens": 423084479.0, + "step": 3549 + }, + { + "entropy": 0.6338000595569611, + "epoch": 8.086683775306529, + "grad_norm": 0.5703125, + "learning_rate": 4.852350471347031e-07, + "loss": 0.0215, + "mean_token_accuracy": 0.9944526255130768, + "num_tokens": 423203322.0, + "step": 3550 + }, + { + "entropy": 0.6346802935004234, + "epoch": 8.08896492728828, + "grad_norm": 0.56640625, + "learning_rate": 4.841205353023715e-07, + "loss": 0.0198, + "mean_token_accuracy": 0.9921191483736038, + "num_tokens": 423322569.0, + "step": 3551 + }, + { + "entropy": 0.6327783763408661, + "epoch": 8.091246079270032, + "grad_norm": 0.458984375, + "learning_rate": 4.83007167644306e-07, + "loss": 0.0155, + "mean_token_accuracy": 0.996185913681984, + "num_tokens": 423441763.0, + "step": 3552 + }, + { + "entropy": 0.634330078959465, + "epoch": 8.093527231251782, + "grad_norm": 0.45703125, + "learning_rate": 4.818949447924334e-07, + "loss": 0.0133, + "mean_token_accuracy": 0.9961518049240112, + "num_tokens": 423560840.0, + "step": 3553 + }, + { + "entropy": 0.6351498886942863, + "epoch": 8.095808383233534, + "grad_norm": 0.55859375, + "learning_rate": 4.807838673780282e-07, + "loss": 0.017, + "mean_token_accuracy": 0.9941232800483704, + "num_tokens": 423680426.0, + "step": 3554 + }, + { + "entropy": 0.6338286176323891, + "epoch": 8.098089535215284, + "grad_norm": 0.57421875, + "learning_rate": 4.796739360317181e-07, + "loss": 0.02, + "mean_token_accuracy": 0.9942050129175186, + "num_tokens": 423799533.0, + "step": 3555 + }, + { + "entropy": 0.6316585689783096, + "epoch": 8.100370687197035, + "grad_norm": 0.43359375, + "learning_rate": 4.785651513834774e-07, + "loss": 0.015, + "mean_token_accuracy": 0.9953683391213417, + "num_tokens": 423918521.0, + "step": 3556 + }, + { + "entropy": 0.631285272538662, + "epoch": 8.102651839178785, + "grad_norm": 0.59375, + "learning_rate": 4.774575140626317e-07, + "loss": 0.0193, + "mean_token_accuracy": 0.9940491244196892, + "num_tokens": 424038066.0, + "step": 3557 + }, + { + "entropy": 0.6376054212450981, + "epoch": 8.104932991160537, + "grad_norm": 0.515625, + "learning_rate": 4.763510246978548e-07, + "loss": 0.017, + "mean_token_accuracy": 0.9955966994166374, + "num_tokens": 424158023.0, + "step": 3558 + }, + { + "entropy": 0.6372072920203209, + "epoch": 8.107214143142286, + "grad_norm": 0.486328125, + "learning_rate": 4.7524568391716736e-07, + "loss": 0.0126, + "mean_token_accuracy": 0.9964739456772804, + "num_tokens": 424277743.0, + "step": 3559 + }, + { + "entropy": 0.6408190280199051, + "epoch": 8.109495295124038, + "grad_norm": 0.50390625, + "learning_rate": 4.7414149234794064e-07, + "loss": 0.0156, + "mean_token_accuracy": 0.9936943352222443, + "num_tokens": 424397465.0, + "step": 3560 + }, + { + "entropy": 0.6374931186437607, + "epoch": 8.111776447105788, + "grad_norm": 0.4921875, + "learning_rate": 4.7303845061689197e-07, + "loss": 0.0167, + "mean_token_accuracy": 0.993336945772171, + "num_tokens": 424516757.0, + "step": 3561 + }, + { + "entropy": 0.6385602056980133, + "epoch": 8.11405759908754, + "grad_norm": 0.45703125, + "learning_rate": 4.719365593500866e-07, + "loss": 0.0137, + "mean_token_accuracy": 0.9956723675131798, + "num_tokens": 424636307.0, + "step": 3562 + }, + { + "entropy": 0.6382851973176003, + "epoch": 8.11633875106929, + "grad_norm": 0.52734375, + "learning_rate": 4.7083581917293784e-07, + "loss": 0.013, + "mean_token_accuracy": 0.9952649250626564, + "num_tokens": 424755582.0, + "step": 3563 + }, + { + "entropy": 0.6285621672868729, + "epoch": 8.11861990305104, + "grad_norm": 0.515625, + "learning_rate": 4.6973623071020267e-07, + "loss": 0.018, + "mean_token_accuracy": 0.9955193102359772, + "num_tokens": 424874504.0, + "step": 3564 + }, + { + "entropy": 0.637930266559124, + "epoch": 8.12090105503279, + "grad_norm": 0.359375, + "learning_rate": 4.686377945859874e-07, + "loss": 0.0121, + "mean_token_accuracy": 0.9973889067769051, + "num_tokens": 424994062.0, + "step": 3565 + }, + { + "entropy": 0.6357327401638031, + "epoch": 8.123182207014542, + "grad_norm": 0.482421875, + "learning_rate": 4.6754051142374275e-07, + "loss": 0.0112, + "mean_token_accuracy": 0.9961701259016991, + "num_tokens": 425113503.0, + "step": 3566 + }, + { + "entropy": 0.6424869373440742, + "epoch": 8.125463358996294, + "grad_norm": 0.56640625, + "learning_rate": 4.664443818462658e-07, + "loss": 0.0185, + "mean_token_accuracy": 0.9929245784878731, + "num_tokens": 425233453.0, + "step": 3567 + }, + { + "entropy": 0.6321180835366249, + "epoch": 8.127744510978044, + "grad_norm": 0.52734375, + "learning_rate": 4.653494064756983e-07, + "loss": 0.0164, + "mean_token_accuracy": 0.9948494136333466, + "num_tokens": 425352874.0, + "step": 3568 + }, + { + "entropy": 0.6303884163498878, + "epoch": 8.130025662959795, + "grad_norm": 0.466796875, + "learning_rate": 4.6425558593352796e-07, + "loss": 0.0101, + "mean_token_accuracy": 0.9962014928460121, + "num_tokens": 425472273.0, + "step": 3569 + }, + { + "entropy": 0.6376355811953545, + "epoch": 8.132306814941545, + "grad_norm": 0.474609375, + "learning_rate": 4.631629208405847e-07, + "loss": 0.0169, + "mean_token_accuracy": 0.9949484094977379, + "num_tokens": 425591946.0, + "step": 3570 + }, + { + "entropy": 0.6366995275020599, + "epoch": 8.134587966923297, + "grad_norm": 0.46875, + "learning_rate": 4.620714118170452e-07, + "loss": 0.0165, + "mean_token_accuracy": 0.9940685927867889, + "num_tokens": 425711898.0, + "step": 3571 + }, + { + "entropy": 0.6362486630678177, + "epoch": 8.136869118905047, + "grad_norm": 0.486328125, + "learning_rate": 4.609810594824282e-07, + "loss": 0.0142, + "mean_token_accuracy": 0.9953118562698364, + "num_tokens": 425831306.0, + "step": 3572 + }, + { + "entropy": 0.6345850899815559, + "epoch": 8.139150270886798, + "grad_norm": 0.53515625, + "learning_rate": 4.598918644555975e-07, + "loss": 0.0156, + "mean_token_accuracy": 0.9949739649891853, + "num_tokens": 425950971.0, + "step": 3573 + }, + { + "entropy": 0.6387654170393944, + "epoch": 8.141431422868548, + "grad_norm": 0.470703125, + "learning_rate": 4.58803827354759e-07, + "loss": 0.0132, + "mean_token_accuracy": 0.9950261041522026, + "num_tokens": 426070111.0, + "step": 3574 + }, + { + "entropy": 0.6322121694684029, + "epoch": 8.1437125748503, + "grad_norm": 0.462890625, + "learning_rate": 4.5771694879746087e-07, + "loss": 0.016, + "mean_token_accuracy": 0.9947661459445953, + "num_tokens": 426188945.0, + "step": 3575 + }, + { + "entropy": 0.638969212770462, + "epoch": 8.14599372683205, + "grad_norm": 0.5, + "learning_rate": 4.566312294005948e-07, + "loss": 0.0097, + "mean_token_accuracy": 0.9963188841938972, + "num_tokens": 426307950.0, + "step": 3576 + }, + { + "entropy": 0.6338545829057693, + "epoch": 8.148274878813801, + "grad_norm": 0.486328125, + "learning_rate": 4.5554666978039455e-07, + "loss": 0.0148, + "mean_token_accuracy": 0.9953985214233398, + "num_tokens": 426427248.0, + "step": 3577 + }, + { + "entropy": 0.6293626800179482, + "epoch": 8.150556030795551, + "grad_norm": 0.4453125, + "learning_rate": 4.544632705524343e-07, + "loss": 0.0127, + "mean_token_accuracy": 0.9958158507943153, + "num_tokens": 426546496.0, + "step": 3578 + }, + { + "entropy": 0.6349634006619453, + "epoch": 8.152837182777303, + "grad_norm": 0.431640625, + "learning_rate": 4.5338103233163175e-07, + "loss": 0.0137, + "mean_token_accuracy": 0.9959702789783478, + "num_tokens": 426665303.0, + "step": 3579 + }, + { + "entropy": 0.6307752355933189, + "epoch": 8.155118334759054, + "grad_norm": 0.44140625, + "learning_rate": 4.522999557322433e-07, + "loss": 0.0147, + "mean_token_accuracy": 0.9958847090601921, + "num_tokens": 426784066.0, + "step": 3580 + }, + { + "entropy": 0.6398071944713593, + "epoch": 8.157399486740804, + "grad_norm": 0.6484375, + "learning_rate": 4.512200413678672e-07, + "loss": 0.0155, + "mean_token_accuracy": 0.9940467402338982, + "num_tokens": 426903656.0, + "step": 3581 + }, + { + "entropy": 0.6346627622842789, + "epoch": 8.159680638722556, + "grad_norm": 0.50390625, + "learning_rate": 4.501412898514426e-07, + "loss": 0.011, + "mean_token_accuracy": 0.9966484680771828, + "num_tokens": 427022126.0, + "step": 3582 + }, + { + "entropy": 0.6347939819097519, + "epoch": 8.161961790704305, + "grad_norm": 0.68359375, + "learning_rate": 4.490637017952479e-07, + "loss": 0.0186, + "mean_token_accuracy": 0.9946636185050011, + "num_tokens": 427141850.0, + "step": 3583 + }, + { + "entropy": 0.6323034688830376, + "epoch": 8.164242942686057, + "grad_norm": 0.5625, + "learning_rate": 4.4798727781090096e-07, + "loss": 0.0206, + "mean_token_accuracy": 0.9939329773187637, + "num_tokens": 427260457.0, + "step": 3584 + }, + { + "entropy": 0.6389393731951714, + "epoch": 8.166524094667807, + "grad_norm": 0.61328125, + "learning_rate": 4.4691201850936034e-07, + "loss": 0.0222, + "mean_token_accuracy": 0.9937165901064873, + "num_tokens": 427379326.0, + "step": 3585 + }, + { + "entropy": 0.6331617310643196, + "epoch": 8.168805246649558, + "grad_norm": 0.416015625, + "learning_rate": 4.458379245009209e-07, + "loss": 0.0158, + "mean_token_accuracy": 0.9946288540959358, + "num_tokens": 427498690.0, + "step": 3586 + }, + { + "entropy": 0.6367922723293304, + "epoch": 8.171086398631308, + "grad_norm": 0.53125, + "learning_rate": 4.447649963952183e-07, + "loss": 0.0178, + "mean_token_accuracy": 0.9945474043488503, + "num_tokens": 427618047.0, + "step": 3587 + }, + { + "entropy": 0.6368299201130867, + "epoch": 8.17336755061306, + "grad_norm": 0.5859375, + "learning_rate": 4.43693234801226e-07, + "loss": 0.0203, + "mean_token_accuracy": 0.994398333132267, + "num_tokens": 427737465.0, + "step": 3588 + }, + { + "entropy": 0.6362158805131912, + "epoch": 8.17564870259481, + "grad_norm": 0.6015625, + "learning_rate": 4.4262264032725517e-07, + "loss": 0.0192, + "mean_token_accuracy": 0.993162177503109, + "num_tokens": 427856894.0, + "step": 3589 + }, + { + "entropy": 0.6343085765838623, + "epoch": 8.177929854576561, + "grad_norm": 0.640625, + "learning_rate": 4.41553213580955e-07, + "loss": 0.0229, + "mean_token_accuracy": 0.9936042353510857, + "num_tokens": 427976085.0, + "step": 3590 + }, + { + "entropy": 0.6363385394215584, + "epoch": 8.180211006558311, + "grad_norm": 0.4453125, + "learning_rate": 4.404849551693102e-07, + "loss": 0.0126, + "mean_token_accuracy": 0.9958407357335091, + "num_tokens": 428095845.0, + "step": 3591 + }, + { + "entropy": 0.6302459612488747, + "epoch": 8.182492158540063, + "grad_norm": 0.55078125, + "learning_rate": 4.394178656986448e-07, + "loss": 0.0253, + "mean_token_accuracy": 0.9916606098413467, + "num_tokens": 428215985.0, + "step": 3592 + }, + { + "entropy": 0.6328221336007118, + "epoch": 8.184773310521814, + "grad_norm": 0.515625, + "learning_rate": 4.383519457746174e-07, + "loss": 0.013, + "mean_token_accuracy": 0.9959599301218987, + "num_tokens": 428335593.0, + "step": 3593 + }, + { + "entropy": 0.6352027282118797, + "epoch": 8.187054462503564, + "grad_norm": 0.50390625, + "learning_rate": 4.3728719600222374e-07, + "loss": 0.0187, + "mean_token_accuracy": 0.9942102059721947, + "num_tokens": 428455261.0, + "step": 3594 + }, + { + "entropy": 0.6291707903146744, + "epoch": 8.189335614485316, + "grad_norm": 0.49609375, + "learning_rate": 4.3622361698579586e-07, + "loss": 0.0155, + "mean_token_accuracy": 0.9961117953062057, + "num_tokens": 428573979.0, + "step": 3595 + }, + { + "entropy": 0.6286094933748245, + "epoch": 8.191616766467066, + "grad_norm": 0.44921875, + "learning_rate": 4.351612093290006e-07, + "loss": 0.0147, + "mean_token_accuracy": 0.9955262839794159, + "num_tokens": 428693641.0, + "step": 3596 + }, + { + "entropy": 0.6383709013462067, + "epoch": 8.193897918448817, + "grad_norm": 0.4140625, + "learning_rate": 4.340999736348389e-07, + "loss": 0.0139, + "mean_token_accuracy": 0.995856337249279, + "num_tokens": 428813250.0, + "step": 3597 + }, + { + "entropy": 0.6315968111157417, + "epoch": 8.196179070430567, + "grad_norm": 0.515625, + "learning_rate": 4.3303991050564877e-07, + "loss": 0.0209, + "mean_token_accuracy": 0.9934088885784149, + "num_tokens": 428932653.0, + "step": 3598 + }, + { + "entropy": 0.639477327466011, + "epoch": 8.198460222412319, + "grad_norm": 0.69921875, + "learning_rate": 4.3198102054310157e-07, + "loss": 0.0223, + "mean_token_accuracy": 0.99273432046175, + "num_tokens": 429052140.0, + "step": 3599 + }, + { + "entropy": 0.6337134018540382, + "epoch": 8.200741374394068, + "grad_norm": 0.5546875, + "learning_rate": 4.30923304348202e-07, + "loss": 0.016, + "mean_token_accuracy": 0.9949889928102493, + "num_tokens": 429171696.0, + "step": 3600 + }, + { + "entropy": 0.6362425684928894, + "epoch": 8.20302252637582, + "grad_norm": 0.82421875, + "learning_rate": 4.2986676252129047e-07, + "loss": 0.0208, + "mean_token_accuracy": 0.9950012713670731, + "num_tokens": 429290858.0, + "step": 3601 + }, + { + "entropy": 0.6339386031031609, + "epoch": 8.20530367835757, + "grad_norm": 0.55078125, + "learning_rate": 4.288113956620382e-07, + "loss": 0.0157, + "mean_token_accuracy": 0.9948957562446594, + "num_tokens": 429410294.0, + "step": 3602 + }, + { + "entropy": 0.6358930766582489, + "epoch": 8.207584830339322, + "grad_norm": 0.478515625, + "learning_rate": 4.2775720436945225e-07, + "loss": 0.0151, + "mean_token_accuracy": 0.9950143694877625, + "num_tokens": 429529927.0, + "step": 3603 + }, + { + "entropy": 0.6388140320777893, + "epoch": 8.209865982321071, + "grad_norm": 0.65234375, + "learning_rate": 4.267041892418705e-07, + "loss": 0.0242, + "mean_token_accuracy": 0.9935542568564415, + "num_tokens": 429649520.0, + "step": 3604 + }, + { + "entropy": 0.6354804039001465, + "epoch": 8.212147134302823, + "grad_norm": 0.431640625, + "learning_rate": 4.256523508769647e-07, + "loss": 0.0119, + "mean_token_accuracy": 0.9954667538404465, + "num_tokens": 429769021.0, + "step": 3605 + }, + { + "entropy": 0.6309514120221138, + "epoch": 8.214428286284575, + "grad_norm": 0.5703125, + "learning_rate": 4.246016898717381e-07, + "loss": 0.0144, + "mean_token_accuracy": 0.9957665577530861, + "num_tokens": 429887804.0, + "step": 3606 + }, + { + "entropy": 0.6414297372102737, + "epoch": 8.216709438266324, + "grad_norm": 0.51171875, + "learning_rate": 4.235522068225248e-07, + "loss": 0.0126, + "mean_token_accuracy": 0.9950456470251083, + "num_tokens": 430006996.0, + "step": 3607 + }, + { + "entropy": 0.6332867443561554, + "epoch": 8.218990590248076, + "grad_norm": 0.64453125, + "learning_rate": 4.225039023249916e-07, + "loss": 0.0263, + "mean_token_accuracy": 0.9943082928657532, + "num_tokens": 430126251.0, + "step": 3608 + }, + { + "entropy": 0.629414439201355, + "epoch": 8.221271742229826, + "grad_norm": 0.578125, + "learning_rate": 4.2145677697413566e-07, + "loss": 0.0179, + "mean_token_accuracy": 0.9926956817507744, + "num_tokens": 430245190.0, + "step": 3609 + }, + { + "entropy": 0.6320610493421555, + "epoch": 8.223552894211577, + "grad_norm": 0.44921875, + "learning_rate": 4.204108313642852e-07, + "loss": 0.0103, + "mean_token_accuracy": 0.9973148480057716, + "num_tokens": 430364264.0, + "step": 3610 + }, + { + "entropy": 0.6310309693217278, + "epoch": 8.225834046193327, + "grad_norm": 0.58984375, + "learning_rate": 4.1936606608909887e-07, + "loss": 0.0217, + "mean_token_accuracy": 0.9938569143414497, + "num_tokens": 430483800.0, + "step": 3611 + }, + { + "entropy": 0.6371408551931381, + "epoch": 8.228115198175079, + "grad_norm": 0.44921875, + "learning_rate": 4.1832248174156597e-07, + "loss": 0.0119, + "mean_token_accuracy": 0.9962255284190178, + "num_tokens": 430603104.0, + "step": 3612 + }, + { + "entropy": 0.638577289879322, + "epoch": 8.230396350156829, + "grad_norm": 0.6171875, + "learning_rate": 4.1728007891400356e-07, + "loss": 0.0226, + "mean_token_accuracy": 0.9928141832351685, + "num_tokens": 430722459.0, + "step": 3613 + }, + { + "entropy": 0.6333236172795296, + "epoch": 8.23267750213858, + "grad_norm": 0.5546875, + "learning_rate": 4.1623885819805977e-07, + "loss": 0.0206, + "mean_token_accuracy": 0.9944396018981934, + "num_tokens": 430842073.0, + "step": 3614 + }, + { + "entropy": 0.6341728419065475, + "epoch": 8.23495865412033, + "grad_norm": 0.6015625, + "learning_rate": 4.151988201847112e-07, + "loss": 0.0216, + "mean_token_accuracy": 0.9931081235408783, + "num_tokens": 430961608.0, + "step": 3615 + }, + { + "entropy": 0.6291255727410316, + "epoch": 8.237239806102082, + "grad_norm": 0.5078125, + "learning_rate": 4.141599654642642e-07, + "loss": 0.0153, + "mean_token_accuracy": 0.995839960873127, + "num_tokens": 431081059.0, + "step": 3616 + }, + { + "entropy": 0.6372723802924156, + "epoch": 8.239520958083832, + "grad_norm": 0.61328125, + "learning_rate": 4.1312229462635243e-07, + "loss": 0.0194, + "mean_token_accuracy": 0.994607724249363, + "num_tokens": 431200404.0, + "step": 3617 + }, + { + "entropy": 0.631127156317234, + "epoch": 8.241802110065583, + "grad_norm": 0.5078125, + "learning_rate": 4.1208580825993686e-07, + "loss": 0.0191, + "mean_token_accuracy": 0.9933553487062454, + "num_tokens": 431319663.0, + "step": 3618 + }, + { + "entropy": 0.6342224478721619, + "epoch": 8.244083262047335, + "grad_norm": 0.5859375, + "learning_rate": 4.1105050695330774e-07, + "loss": 0.014, + "mean_token_accuracy": 0.995700977742672, + "num_tokens": 431439111.0, + "step": 3619 + }, + { + "entropy": 0.6323237866163254, + "epoch": 8.246364414029085, + "grad_norm": 0.52734375, + "learning_rate": 4.100163912940827e-07, + "loss": 0.0144, + "mean_token_accuracy": 0.9947383105754852, + "num_tokens": 431558120.0, + "step": 3620 + }, + { + "entropy": 0.6348285004496574, + "epoch": 8.248645566010836, + "grad_norm": 0.4296875, + "learning_rate": 4.0898346186920484e-07, + "loss": 0.0132, + "mean_token_accuracy": 0.9968567490577698, + "num_tokens": 431677086.0, + "step": 3621 + }, + { + "entropy": 0.6346550062298775, + "epoch": 8.250926717992586, + "grad_norm": 0.50390625, + "learning_rate": 4.0795171926494543e-07, + "loss": 0.0114, + "mean_token_accuracy": 0.9961873814463615, + "num_tokens": 431796709.0, + "step": 3622 + }, + { + "entropy": 0.6356236860156059, + "epoch": 8.253207869974338, + "grad_norm": 0.6484375, + "learning_rate": 4.0692116406690214e-07, + "loss": 0.0166, + "mean_token_accuracy": 0.9944038167595863, + "num_tokens": 431915468.0, + "step": 3623 + }, + { + "entropy": 0.6380622833967209, + "epoch": 8.255489021956087, + "grad_norm": 0.58203125, + "learning_rate": 4.058917968599968e-07, + "loss": 0.0223, + "mean_token_accuracy": 0.9926895499229431, + "num_tokens": 432034750.0, + "step": 3624 + }, + { + "entropy": 0.6334963440895081, + "epoch": 8.257770173937839, + "grad_norm": 0.396484375, + "learning_rate": 4.048636182284796e-07, + "loss": 0.0123, + "mean_token_accuracy": 0.9949152991175652, + "num_tokens": 432154214.0, + "step": 3625 + }, + { + "entropy": 0.6328727900981903, + "epoch": 8.260051325919589, + "grad_norm": 0.578125, + "learning_rate": 4.038366287559245e-07, + "loss": 0.0188, + "mean_token_accuracy": 0.9925613030791283, + "num_tokens": 432273556.0, + "step": 3626 + }, + { + "entropy": 0.6332607120275497, + "epoch": 8.26233247790134, + "grad_norm": 0.474609375, + "learning_rate": 4.0281082902523055e-07, + "loss": 0.0128, + "mean_token_accuracy": 0.9960195794701576, + "num_tokens": 432393146.0, + "step": 3627 + }, + { + "entropy": 0.6344685405492783, + "epoch": 8.26461362988309, + "grad_norm": 0.3828125, + "learning_rate": 4.0178621961862315e-07, + "loss": 0.0112, + "mean_token_accuracy": 0.9967414885759354, + "num_tokens": 432512103.0, + "step": 3628 + }, + { + "entropy": 0.6348953619599342, + "epoch": 8.266894781864842, + "grad_norm": 0.474609375, + "learning_rate": 4.0076280111764927e-07, + "loss": 0.0176, + "mean_token_accuracy": 0.9939457550644875, + "num_tokens": 432631241.0, + "step": 3629 + }, + { + "entropy": 0.6335016638040543, + "epoch": 8.269175933846592, + "grad_norm": 0.53515625, + "learning_rate": 3.997405741031821e-07, + "loss": 0.0184, + "mean_token_accuracy": 0.9935881271958351, + "num_tokens": 432750733.0, + "step": 3630 + }, + { + "entropy": 0.6346899643540382, + "epoch": 8.271457085828343, + "grad_norm": 0.62890625, + "learning_rate": 3.98719539155418e-07, + "loss": 0.0169, + "mean_token_accuracy": 0.9941022619605064, + "num_tokens": 432870879.0, + "step": 3631 + }, + { + "entropy": 0.6336653009057045, + "epoch": 8.273738237810093, + "grad_norm": 0.52734375, + "learning_rate": 3.9769969685387684e-07, + "loss": 0.0153, + "mean_token_accuracy": 0.9954018816351891, + "num_tokens": 432989998.0, + "step": 3632 + }, + { + "entropy": 0.633271649479866, + "epoch": 8.276019389791845, + "grad_norm": 0.53125, + "learning_rate": 3.966810477774016e-07, + "loss": 0.0159, + "mean_token_accuracy": 0.9949168786406517, + "num_tokens": 433109602.0, + "step": 3633 + }, + { + "entropy": 0.6343715414404869, + "epoch": 8.278300541773596, + "grad_norm": 0.640625, + "learning_rate": 3.9566359250415686e-07, + "loss": 0.0206, + "mean_token_accuracy": 0.9938369169831276, + "num_tokens": 433228375.0, + "step": 3634 + }, + { + "entropy": 0.6296771839261055, + "epoch": 8.280581693755346, + "grad_norm": 0.59375, + "learning_rate": 3.9464733161163144e-07, + "loss": 0.0163, + "mean_token_accuracy": 0.9924634099006653, + "num_tokens": 433347049.0, + "step": 3635 + }, + { + "entropy": 0.6276212558150291, + "epoch": 8.282862845737098, + "grad_norm": 0.69921875, + "learning_rate": 3.9363226567663503e-07, + "loss": 0.0246, + "mean_token_accuracy": 0.9912154600024223, + "num_tokens": 433466067.0, + "step": 3636 + }, + { + "entropy": 0.6377044096589088, + "epoch": 8.285143997718848, + "grad_norm": 0.421875, + "learning_rate": 3.926183952752999e-07, + "loss": 0.0159, + "mean_token_accuracy": 0.9959944561123848, + "num_tokens": 433585353.0, + "step": 3637 + }, + { + "entropy": 0.6399420127272606, + "epoch": 8.2874251497006, + "grad_norm": 0.50390625, + "learning_rate": 3.9160572098307923e-07, + "loss": 0.0163, + "mean_token_accuracy": 0.9960668757557869, + "num_tokens": 433704973.0, + "step": 3638 + }, + { + "entropy": 0.638011246919632, + "epoch": 8.28970630168235, + "grad_norm": 0.6328125, + "learning_rate": 3.90594243374747e-07, + "loss": 0.0181, + "mean_token_accuracy": 0.9954337179660797, + "num_tokens": 433824894.0, + "step": 3639 + }, + { + "entropy": 0.632746011018753, + "epoch": 8.2919874536641, + "grad_norm": 0.44921875, + "learning_rate": 3.895839630243983e-07, + "loss": 0.0131, + "mean_token_accuracy": 0.9955785870552063, + "num_tokens": 433943644.0, + "step": 3640 + }, + { + "entropy": 0.6373278126120567, + "epoch": 8.29426860564585, + "grad_norm": 0.71484375, + "learning_rate": 3.8857488050544903e-07, + "loss": 0.0223, + "mean_token_accuracy": 0.9939525946974754, + "num_tokens": 434062470.0, + "step": 3641 + }, + { + "entropy": 0.635734848678112, + "epoch": 8.296549757627602, + "grad_norm": 0.4140625, + "learning_rate": 3.875669963906356e-07, + "loss": 0.0134, + "mean_token_accuracy": 0.9943856373429298, + "num_tokens": 434182224.0, + "step": 3642 + }, + { + "entropy": 0.6362305358052254, + "epoch": 8.298830909609352, + "grad_norm": 0.4375, + "learning_rate": 3.865603112520125e-07, + "loss": 0.0157, + "mean_token_accuracy": 0.9971660599112511, + "num_tokens": 434301827.0, + "step": 3643 + }, + { + "entropy": 0.6344234943389893, + "epoch": 8.301112061591104, + "grad_norm": 0.451171875, + "learning_rate": 3.855548256609556e-07, + "loss": 0.0149, + "mean_token_accuracy": 0.9958141148090363, + "num_tokens": 434422310.0, + "step": 3644 + }, + { + "entropy": 0.6289426609873772, + "epoch": 8.303393213572853, + "grad_norm": 0.56640625, + "learning_rate": 3.8455054018815803e-07, + "loss": 0.0197, + "mean_token_accuracy": 0.9935206472873688, + "num_tokens": 434541336.0, + "step": 3645 + }, + { + "entropy": 0.6387055814266205, + "epoch": 8.305674365554605, + "grad_norm": 0.71875, + "learning_rate": 3.8354745540363364e-07, + "loss": 0.0194, + "mean_token_accuracy": 0.9933138638734818, + "num_tokens": 434661360.0, + "step": 3646 + }, + { + "entropy": 0.6378220841288567, + "epoch": 8.307955517536357, + "grad_norm": 0.44140625, + "learning_rate": 3.8254557187671374e-07, + "loss": 0.0129, + "mean_token_accuracy": 0.9960588961839676, + "num_tokens": 434780763.0, + "step": 3647 + }, + { + "entropy": 0.6342916265130043, + "epoch": 8.310236669518106, + "grad_norm": 0.59375, + "learning_rate": 3.815448901760485e-07, + "loss": 0.0166, + "mean_token_accuracy": 0.9934190064668655, + "num_tokens": 434900328.0, + "step": 3648 + }, + { + "entropy": 0.6350668519735336, + "epoch": 8.312517821499858, + "grad_norm": 0.62109375, + "learning_rate": 3.805454108696055e-07, + "loss": 0.0139, + "mean_token_accuracy": 0.995753325521946, + "num_tokens": 435019013.0, + "step": 3649 + }, + { + "entropy": 0.6363730430603027, + "epoch": 8.314798973481608, + "grad_norm": 0.6015625, + "learning_rate": 3.7954713452466927e-07, + "loss": 0.0207, + "mean_token_accuracy": 0.9948305264115334, + "num_tokens": 435138728.0, + "step": 3650 + }, + { + "entropy": 0.6358460113406181, + "epoch": 8.31708012546336, + "grad_norm": 0.498046875, + "learning_rate": 3.785500617078425e-07, + "loss": 0.0144, + "mean_token_accuracy": 0.9952556565403938, + "num_tokens": 435257652.0, + "step": 3651 + }, + { + "entropy": 0.6389608383178711, + "epoch": 8.31936127744511, + "grad_norm": 0.55078125, + "learning_rate": 3.775541929850443e-07, + "loss": 0.0194, + "mean_token_accuracy": 0.99556465446949, + "num_tokens": 435377925.0, + "step": 3652 + }, + { + "entropy": 0.6348562389612198, + "epoch": 8.321642429426861, + "grad_norm": 0.5703125, + "learning_rate": 3.76559528921511e-07, + "loss": 0.017, + "mean_token_accuracy": 0.993946261703968, + "num_tokens": 435497536.0, + "step": 3653 + }, + { + "entropy": 0.6336989924311638, + "epoch": 8.32392358140861, + "grad_norm": 0.447265625, + "learning_rate": 3.7556607008179454e-07, + "loss": 0.0139, + "mean_token_accuracy": 0.9953256696462631, + "num_tokens": 435616574.0, + "step": 3654 + }, + { + "entropy": 0.6355843096971512, + "epoch": 8.326204733390362, + "grad_norm": 0.57421875, + "learning_rate": 3.745738170297633e-07, + "loss": 0.0236, + "mean_token_accuracy": 0.991997666656971, + "num_tokens": 435735062.0, + "step": 3655 + }, + { + "entropy": 0.6325106397271156, + "epoch": 8.328485885372112, + "grad_norm": 0.50390625, + "learning_rate": 3.7358277032860016e-07, + "loss": 0.0136, + "mean_token_accuracy": 0.9954073652625084, + "num_tokens": 435854108.0, + "step": 3656 + }, + { + "entropy": 0.6379479691386223, + "epoch": 8.330767037353864, + "grad_norm": 0.578125, + "learning_rate": 3.7259293054080435e-07, + "loss": 0.0211, + "mean_token_accuracy": 0.9919442906975746, + "num_tokens": 435973764.0, + "step": 3657 + }, + { + "entropy": 0.6324946954846382, + "epoch": 8.333048189335614, + "grad_norm": 0.55859375, + "learning_rate": 3.7160429822819003e-07, + "loss": 0.0213, + "mean_token_accuracy": 0.993585430085659, + "num_tokens": 436093203.0, + "step": 3658 + }, + { + "entropy": 0.6335916668176651, + "epoch": 8.335329341317365, + "grad_norm": 0.5390625, + "learning_rate": 3.706168739518859e-07, + "loss": 0.0196, + "mean_token_accuracy": 0.9937543794512749, + "num_tokens": 436213156.0, + "step": 3659 + }, + { + "entropy": 0.6358487457036972, + "epoch": 8.337610493299117, + "grad_norm": 0.69140625, + "learning_rate": 3.6963065827233524e-07, + "loss": 0.0177, + "mean_token_accuracy": 0.9934941977262497, + "num_tokens": 436332922.0, + "step": 3660 + }, + { + "entropy": 0.6399563401937485, + "epoch": 8.339891645280867, + "grad_norm": 0.51953125, + "learning_rate": 3.6864565174929393e-07, + "loss": 0.0212, + "mean_token_accuracy": 0.9940471723675728, + "num_tokens": 436452488.0, + "step": 3661 + }, + { + "entropy": 0.6343127638101578, + "epoch": 8.342172797262618, + "grad_norm": 0.59375, + "learning_rate": 3.676618549418334e-07, + "loss": 0.022, + "mean_token_accuracy": 0.9931092709302902, + "num_tokens": 436572630.0, + "step": 3662 + }, + { + "entropy": 0.6341215595602989, + "epoch": 8.344453949244368, + "grad_norm": 0.453125, + "learning_rate": 3.666792684083381e-07, + "loss": 0.0132, + "mean_token_accuracy": 0.9970972314476967, + "num_tokens": 436691584.0, + "step": 3663 + }, + { + "entropy": 0.6442408040165901, + "epoch": 8.34673510122612, + "grad_norm": 0.6640625, + "learning_rate": 3.656978927065041e-07, + "loss": 0.0208, + "mean_token_accuracy": 0.9940182194113731, + "num_tokens": 436812283.0, + "step": 3664 + }, + { + "entropy": 0.6372558772563934, + "epoch": 8.34901625320787, + "grad_norm": 0.494140625, + "learning_rate": 3.64717728393342e-07, + "loss": 0.0137, + "mean_token_accuracy": 0.9948115050792694, + "num_tokens": 436931964.0, + "step": 3665 + }, + { + "entropy": 0.6371241733431816, + "epoch": 8.351297405189621, + "grad_norm": 0.451171875, + "learning_rate": 3.6373877602517457e-07, + "loss": 0.0139, + "mean_token_accuracy": 0.9945999905467033, + "num_tokens": 437051292.0, + "step": 3666 + }, + { + "entropy": 0.6355748549103737, + "epoch": 8.353578557171371, + "grad_norm": 0.478515625, + "learning_rate": 3.627610361576353e-07, + "loss": 0.0157, + "mean_token_accuracy": 0.9940038844943047, + "num_tokens": 437170221.0, + "step": 3667 + }, + { + "entropy": 0.634530819952488, + "epoch": 8.355859709153123, + "grad_norm": 0.70703125, + "learning_rate": 3.6178450934567065e-07, + "loss": 0.0217, + "mean_token_accuracy": 0.9925642907619476, + "num_tokens": 437289645.0, + "step": 3668 + }, + { + "entropy": 0.6345864534378052, + "epoch": 8.358140861134872, + "grad_norm": 0.515625, + "learning_rate": 3.6080919614353895e-07, + "loss": 0.0154, + "mean_token_accuracy": 0.9951695650815964, + "num_tokens": 437408082.0, + "step": 3669 + }, + { + "entropy": 0.6392272934317589, + "epoch": 8.360422013116624, + "grad_norm": 0.58984375, + "learning_rate": 3.598350971048087e-07, + "loss": 0.0168, + "mean_token_accuracy": 0.9952940791845322, + "num_tokens": 437527557.0, + "step": 3670 + }, + { + "entropy": 0.6368323862552643, + "epoch": 8.362703165098374, + "grad_norm": 0.57421875, + "learning_rate": 3.5886221278236045e-07, + "loss": 0.021, + "mean_token_accuracy": 0.9930471032857895, + "num_tokens": 437646880.0, + "step": 3671 + }, + { + "entropy": 0.6331295147538185, + "epoch": 8.364984317080125, + "grad_norm": 0.498046875, + "learning_rate": 3.578905437283833e-07, + "loss": 0.0162, + "mean_token_accuracy": 0.99416184425354, + "num_tokens": 437765192.0, + "step": 3672 + }, + { + "entropy": 0.6384186372160912, + "epoch": 8.367265469061877, + "grad_norm": 0.515625, + "learning_rate": 3.569200904943784e-07, + "loss": 0.0132, + "mean_token_accuracy": 0.9953002482652664, + "num_tokens": 437884218.0, + "step": 3673 + }, + { + "entropy": 0.6395587772130966, + "epoch": 8.369546621043627, + "grad_norm": 0.515625, + "learning_rate": 3.559508536311568e-07, + "loss": 0.0169, + "mean_token_accuracy": 0.9946721717715263, + "num_tokens": 438003722.0, + "step": 3674 + }, + { + "entropy": 0.637555256485939, + "epoch": 8.371827773025379, + "grad_norm": 0.462890625, + "learning_rate": 3.549828336888378e-07, + "loss": 0.0126, + "mean_token_accuracy": 0.9963499382138252, + "num_tokens": 438123874.0, + "step": 3675 + }, + { + "entropy": 0.6295384243130684, + "epoch": 8.374108925007128, + "grad_norm": 0.53125, + "learning_rate": 3.5401603121685197e-07, + "loss": 0.0171, + "mean_token_accuracy": 0.9936123937368393, + "num_tokens": 438243579.0, + "step": 3676 + }, + { + "entropy": 0.628864049911499, + "epoch": 8.37639007698888, + "grad_norm": 0.52734375, + "learning_rate": 3.5305044676393645e-07, + "loss": 0.0171, + "mean_token_accuracy": 0.9944480955600739, + "num_tokens": 438363362.0, + "step": 3677 + }, + { + "entropy": 0.6382469162344933, + "epoch": 8.37867122897063, + "grad_norm": 0.64453125, + "learning_rate": 3.5208608087813874e-07, + "loss": 0.0196, + "mean_token_accuracy": 0.9939625561237335, + "num_tokens": 438482723.0, + "step": 3678 + }, + { + "entropy": 0.6381028145551682, + "epoch": 8.380952380952381, + "grad_norm": 0.51953125, + "learning_rate": 3.5112293410681455e-07, + "loss": 0.0164, + "mean_token_accuracy": 0.9956439658999443, + "num_tokens": 438602666.0, + "step": 3679 + }, + { + "entropy": 0.6375740766525269, + "epoch": 8.383233532934131, + "grad_norm": 0.40234375, + "learning_rate": 3.501610069966271e-07, + "loss": 0.0107, + "mean_token_accuracy": 0.9963843524456024, + "num_tokens": 438722990.0, + "step": 3680 + }, + { + "entropy": 0.6384274959564209, + "epoch": 8.385514684915883, + "grad_norm": 0.46484375, + "learning_rate": 3.492003000935487e-07, + "loss": 0.0119, + "mean_token_accuracy": 0.9959043487906456, + "num_tokens": 438842565.0, + "step": 3681 + }, + { + "entropy": 0.6318017542362213, + "epoch": 8.387795836897633, + "grad_norm": 0.5859375, + "learning_rate": 3.482408139428564e-07, + "loss": 0.0155, + "mean_token_accuracy": 0.9941346868872643, + "num_tokens": 438961929.0, + "step": 3682 + }, + { + "entropy": 0.6355911493301392, + "epoch": 8.390076988879384, + "grad_norm": 0.458984375, + "learning_rate": 3.4728254908913683e-07, + "loss": 0.0162, + "mean_token_accuracy": 0.9950530081987381, + "num_tokens": 439081259.0, + "step": 3683 + }, + { + "entropy": 0.6313094645738602, + "epoch": 8.392358140861134, + "grad_norm": 0.52734375, + "learning_rate": 3.463255060762827e-07, + "loss": 0.0151, + "mean_token_accuracy": 0.9950520694255829, + "num_tokens": 439199757.0, + "step": 3684 + }, + { + "entropy": 0.6323173642158508, + "epoch": 8.394639292842886, + "grad_norm": 0.45703125, + "learning_rate": 3.4536968544749333e-07, + "loss": 0.0163, + "mean_token_accuracy": 0.994041197001934, + "num_tokens": 439318975.0, + "step": 3685 + }, + { + "entropy": 0.6418001130223274, + "epoch": 8.396920444824637, + "grad_norm": 0.443359375, + "learning_rate": 3.4441508774527345e-07, + "loss": 0.0126, + "mean_token_accuracy": 0.9966474622488022, + "num_tokens": 439437985.0, + "step": 3686 + }, + { + "entropy": 0.6368720605969429, + "epoch": 8.399201596806387, + "grad_norm": 0.455078125, + "learning_rate": 3.434617135114349e-07, + "loss": 0.0156, + "mean_token_accuracy": 0.9951367825269699, + "num_tokens": 439557258.0, + "step": 3687 + }, + { + "entropy": 0.6327372640371323, + "epoch": 8.401482748788139, + "grad_norm": 0.6328125, + "learning_rate": 3.425095632870937e-07, + "loss": 0.0234, + "mean_token_accuracy": 0.9933167695999146, + "num_tokens": 439676305.0, + "step": 3688 + }, + { + "entropy": 0.6373439729213715, + "epoch": 8.403763900769889, + "grad_norm": 0.6875, + "learning_rate": 3.4155863761267256e-07, + "loss": 0.0232, + "mean_token_accuracy": 0.991635873913765, + "num_tokens": 439795256.0, + "step": 3689 + }, + { + "entropy": 0.6341472789645195, + "epoch": 8.40604505275164, + "grad_norm": 0.435546875, + "learning_rate": 3.406089370278981e-07, + "loss": 0.014, + "mean_token_accuracy": 0.994786337018013, + "num_tokens": 439914541.0, + "step": 3690 + }, + { + "entropy": 0.6332595869898796, + "epoch": 8.40832620473339, + "grad_norm": 0.53125, + "learning_rate": 3.396604620718025e-07, + "loss": 0.013, + "mean_token_accuracy": 0.9957346692681313, + "num_tokens": 440033831.0, + "step": 3691 + }, + { + "entropy": 0.6337523013353348, + "epoch": 8.410607356715142, + "grad_norm": 0.59375, + "learning_rate": 3.387132132827223e-07, + "loss": 0.02, + "mean_token_accuracy": 0.9929488822817802, + "num_tokens": 440152853.0, + "step": 3692 + }, + { + "entropy": 0.6355877220630646, + "epoch": 8.412888508696891, + "grad_norm": 0.498046875, + "learning_rate": 3.377671911982963e-07, + "loss": 0.0148, + "mean_token_accuracy": 0.9944931194186211, + "num_tokens": 440271490.0, + "step": 3693 + }, + { + "entropy": 0.630707137286663, + "epoch": 8.415169660678643, + "grad_norm": 0.490234375, + "learning_rate": 3.3682239635546927e-07, + "loss": 0.0172, + "mean_token_accuracy": 0.9954030513763428, + "num_tokens": 440390790.0, + "step": 3694 + }, + { + "entropy": 0.6333915516734123, + "epoch": 8.417450812660393, + "grad_norm": 0.671875, + "learning_rate": 3.35878829290488e-07, + "loss": 0.0209, + "mean_token_accuracy": 0.9936603456735611, + "num_tokens": 440509844.0, + "step": 3695 + }, + { + "entropy": 0.6324962079524994, + "epoch": 8.419731964642144, + "grad_norm": 0.5546875, + "learning_rate": 3.3493649053890325e-07, + "loss": 0.019, + "mean_token_accuracy": 0.9930814504623413, + "num_tokens": 440628912.0, + "step": 3696 + }, + { + "entropy": 0.6416174694895744, + "epoch": 8.422013116623894, + "grad_norm": 0.54296875, + "learning_rate": 3.339953806355692e-07, + "loss": 0.0167, + "mean_token_accuracy": 0.9940676018595695, + "num_tokens": 440749191.0, + "step": 3697 + }, + { + "entropy": 0.6350235491991043, + "epoch": 8.424294268605646, + "grad_norm": 0.70703125, + "learning_rate": 3.330555001146399e-07, + "loss": 0.0247, + "mean_token_accuracy": 0.9929917678236961, + "num_tokens": 440868425.0, + "step": 3698 + }, + { + "entropy": 0.6383695676922798, + "epoch": 8.426575420587398, + "grad_norm": 0.546875, + "learning_rate": 3.3211684950957416e-07, + "loss": 0.017, + "mean_token_accuracy": 0.9945909678936005, + "num_tokens": 440987832.0, + "step": 3699 + }, + { + "entropy": 0.6384435966610909, + "epoch": 8.428856572569147, + "grad_norm": 0.431640625, + "learning_rate": 3.311794293531323e-07, + "loss": 0.0123, + "mean_token_accuracy": 0.9962062835693359, + "num_tokens": 441107215.0, + "step": 3700 + }, + { + "entropy": 0.6311969980597496, + "epoch": 8.431137724550899, + "grad_norm": 0.53125, + "learning_rate": 3.3024324017737555e-07, + "loss": 0.0195, + "mean_token_accuracy": 0.994271345436573, + "num_tokens": 441227452.0, + "step": 3701 + }, + { + "entropy": 0.6334764510393143, + "epoch": 8.433418876532649, + "grad_norm": 0.482421875, + "learning_rate": 3.2930828251366703e-07, + "loss": 0.0141, + "mean_token_accuracy": 0.9957271590828896, + "num_tokens": 441346921.0, + "step": 3702 + }, + { + "entropy": 0.6361623406410217, + "epoch": 8.4357000285144, + "grad_norm": 0.498046875, + "learning_rate": 3.283745568926708e-07, + "loss": 0.0151, + "mean_token_accuracy": 0.9949776753783226, + "num_tokens": 441466529.0, + "step": 3703 + }, + { + "entropy": 0.631890133023262, + "epoch": 8.43798118049615, + "grad_norm": 0.53125, + "learning_rate": 3.274420638443507e-07, + "loss": 0.0207, + "mean_token_accuracy": 0.9947633519768715, + "num_tokens": 441586223.0, + "step": 3704 + }, + { + "entropy": 0.6361866891384125, + "epoch": 8.440262332477902, + "grad_norm": 0.443359375, + "learning_rate": 3.2651080389797253e-07, + "loss": 0.0113, + "mean_token_accuracy": 0.9962419420480728, + "num_tokens": 441705590.0, + "step": 3705 + }, + { + "entropy": 0.6388387009501457, + "epoch": 8.442543484459652, + "grad_norm": 0.5078125, + "learning_rate": 3.255807775821015e-07, + "loss": 0.0134, + "mean_token_accuracy": 0.9954176172614098, + "num_tokens": 441825064.0, + "step": 3706 + }, + { + "entropy": 0.6302207484841347, + "epoch": 8.444824636441403, + "grad_norm": 0.62890625, + "learning_rate": 3.246519854246022e-07, + "loss": 0.0206, + "mean_token_accuracy": 0.9933697134256363, + "num_tokens": 441944705.0, + "step": 3707 + }, + { + "entropy": 0.6334255710244179, + "epoch": 8.447105788423153, + "grad_norm": 0.5546875, + "learning_rate": 3.2372442795263885e-07, + "loss": 0.0171, + "mean_token_accuracy": 0.9954428896307945, + "num_tokens": 442063925.0, + "step": 3708 + }, + { + "entropy": 0.6381438300013542, + "epoch": 8.449386940404905, + "grad_norm": 0.62890625, + "learning_rate": 3.227981056926763e-07, + "loss": 0.0217, + "mean_token_accuracy": 0.9937597140669823, + "num_tokens": 442183545.0, + "step": 3709 + }, + { + "entropy": 0.6306346356868744, + "epoch": 8.451668092386655, + "grad_norm": 0.5, + "learning_rate": 3.218730191704758e-07, + "loss": 0.0189, + "mean_token_accuracy": 0.9932510629296303, + "num_tokens": 442302837.0, + "step": 3710 + }, + { + "entropy": 0.6392187774181366, + "epoch": 8.453949244368406, + "grad_norm": 0.4453125, + "learning_rate": 3.209491689110994e-07, + "loss": 0.0093, + "mean_token_accuracy": 0.997318834066391, + "num_tokens": 442422288.0, + "step": 3711 + }, + { + "entropy": 0.6379440575838089, + "epoch": 8.456230396350158, + "grad_norm": 0.462890625, + "learning_rate": 3.2002655543890646e-07, + "loss": 0.0186, + "mean_token_accuracy": 0.9938284903764725, + "num_tokens": 442541899.0, + "step": 3712 + }, + { + "entropy": 0.6373124942183495, + "epoch": 8.458511548331908, + "grad_norm": 0.578125, + "learning_rate": 3.1910517927755516e-07, + "loss": 0.0203, + "mean_token_accuracy": 0.9931608140468597, + "num_tokens": 442661692.0, + "step": 3713 + }, + { + "entropy": 0.6320207864046097, + "epoch": 8.46079270031366, + "grad_norm": 0.478515625, + "learning_rate": 3.181850409499995e-07, + "loss": 0.0134, + "mean_token_accuracy": 0.9956968650221825, + "num_tokens": 442781146.0, + "step": 3714 + }, + { + "entropy": 0.6374308988451958, + "epoch": 8.463073852295409, + "grad_norm": 0.5234375, + "learning_rate": 3.1726614097849326e-07, + "loss": 0.015, + "mean_token_accuracy": 0.9955965057015419, + "num_tokens": 442900519.0, + "step": 3715 + }, + { + "entropy": 0.6315358355641365, + "epoch": 8.46535500427716, + "grad_norm": 0.6875, + "learning_rate": 3.163484798845862e-07, + "loss": 0.0211, + "mean_token_accuracy": 0.9935419708490372, + "num_tokens": 443019496.0, + "step": 3716 + }, + { + "entropy": 0.6328833699226379, + "epoch": 8.46763615625891, + "grad_norm": 0.443359375, + "learning_rate": 3.1543205818912484e-07, + "loss": 0.013, + "mean_token_accuracy": 0.9961632564663887, + "num_tokens": 443138626.0, + "step": 3717 + }, + { + "entropy": 0.6366747245192528, + "epoch": 8.469917308240662, + "grad_norm": 0.56640625, + "learning_rate": 3.145168764122525e-07, + "loss": 0.0183, + "mean_token_accuracy": 0.9945467486977577, + "num_tokens": 443257428.0, + "step": 3718 + }, + { + "entropy": 0.6387536525726318, + "epoch": 8.472198460222412, + "grad_norm": 0.50390625, + "learning_rate": 3.1360293507340934e-07, + "loss": 0.0147, + "mean_token_accuracy": 0.9951237440109253, + "num_tokens": 443376541.0, + "step": 3719 + }, + { + "entropy": 0.6313810423016548, + "epoch": 8.474479612204163, + "grad_norm": 0.49609375, + "learning_rate": 3.1269023469132937e-07, + "loss": 0.0157, + "mean_token_accuracy": 0.9944380298256874, + "num_tokens": 443496430.0, + "step": 3720 + }, + { + "entropy": 0.6370832100510597, + "epoch": 8.476760764185913, + "grad_norm": 0.474609375, + "learning_rate": 3.117787757840449e-07, + "loss": 0.0145, + "mean_token_accuracy": 0.9950980991125107, + "num_tokens": 443615743.0, + "step": 3721 + }, + { + "entropy": 0.6324382200837135, + "epoch": 8.479041916167665, + "grad_norm": 0.72265625, + "learning_rate": 3.10868558868882e-07, + "loss": 0.0194, + "mean_token_accuracy": 0.9947016090154648, + "num_tokens": 443735826.0, + "step": 3722 + }, + { + "entropy": 0.6387528255581856, + "epoch": 8.481323068149415, + "grad_norm": 0.546875, + "learning_rate": 3.0995958446246197e-07, + "loss": 0.0183, + "mean_token_accuracy": 0.9951045662164688, + "num_tokens": 443855514.0, + "step": 3723 + }, + { + "entropy": 0.6318894326686859, + "epoch": 8.483604220131166, + "grad_norm": 0.52734375, + "learning_rate": 3.090518530807021e-07, + "loss": 0.0188, + "mean_token_accuracy": 0.9942224398255348, + "num_tokens": 443974354.0, + "step": 3724 + }, + { + "entropy": 0.63023691624403, + "epoch": 8.485885372112918, + "grad_norm": 0.4765625, + "learning_rate": 3.0814536523881224e-07, + "loss": 0.0104, + "mean_token_accuracy": 0.9968261271715164, + "num_tokens": 444093697.0, + "step": 3725 + }, + { + "entropy": 0.6331131234765053, + "epoch": 8.488166524094668, + "grad_norm": 0.58984375, + "learning_rate": 3.072401214512974e-07, + "loss": 0.0178, + "mean_token_accuracy": 0.9938473626971245, + "num_tokens": 444212536.0, + "step": 3726 + }, + { + "entropy": 0.6379842236638069, + "epoch": 8.49044767607642, + "grad_norm": 0.8125, + "learning_rate": 3.063361222319569e-07, + "loss": 0.0326, + "mean_token_accuracy": 0.9916841015219688, + "num_tokens": 444331827.0, + "step": 3727 + }, + { + "entropy": 0.6364071145653725, + "epoch": 8.49272882805817, + "grad_norm": 0.45703125, + "learning_rate": 3.054333680938837e-07, + "loss": 0.0116, + "mean_token_accuracy": 0.9939709529280663, + "num_tokens": 444451010.0, + "step": 3728 + }, + { + "entropy": 0.6381096690893173, + "epoch": 8.49500998003992, + "grad_norm": 0.53515625, + "learning_rate": 3.045318595494623e-07, + "loss": 0.0198, + "mean_token_accuracy": 0.9931627586483955, + "num_tokens": 444570418.0, + "step": 3729 + }, + { + "entropy": 0.6331044510006905, + "epoch": 8.49729113202167, + "grad_norm": 0.42578125, + "learning_rate": 3.036315971103723e-07, + "loss": 0.0119, + "mean_token_accuracy": 0.9951529279351234, + "num_tokens": 444689173.0, + "step": 3730 + }, + { + "entropy": 0.6379807740449905, + "epoch": 8.499572284003422, + "grad_norm": 0.546875, + "learning_rate": 3.0273258128758585e-07, + "loss": 0.0157, + "mean_token_accuracy": 0.9962579309940338, + "num_tokens": 444808659.0, + "step": 3731 + }, + { + "entropy": 0.6306969597935677, + "epoch": 8.501853435985172, + "grad_norm": 0.48828125, + "learning_rate": 3.018348125913659e-07, + "loss": 0.0142, + "mean_token_accuracy": 0.9950896799564362, + "num_tokens": 444927823.0, + "step": 3732 + }, + { + "entropy": 0.6366394907236099, + "epoch": 8.504134587966924, + "grad_norm": 0.4921875, + "learning_rate": 3.009382915312689e-07, + "loss": 0.0159, + "mean_token_accuracy": 0.9941250458359718, + "num_tokens": 445047515.0, + "step": 3733 + }, + { + "entropy": 0.6343629583716393, + "epoch": 8.506415739948674, + "grad_norm": 0.66015625, + "learning_rate": 3.000430186161432e-07, + "loss": 0.019, + "mean_token_accuracy": 0.9947511032223701, + "num_tokens": 445166478.0, + "step": 3734 + }, + { + "entropy": 0.6330859288573265, + "epoch": 8.508696891930425, + "grad_norm": 0.46484375, + "learning_rate": 2.991489943541287e-07, + "loss": 0.0103, + "mean_token_accuracy": 0.9958532899618149, + "num_tokens": 445285946.0, + "step": 3735 + }, + { + "entropy": 0.6351182758808136, + "epoch": 8.510978043912175, + "grad_norm": 0.48046875, + "learning_rate": 2.982562192526556e-07, + "loss": 0.0097, + "mean_token_accuracy": 0.996795155107975, + "num_tokens": 445405465.0, + "step": 3736 + }, + { + "entropy": 0.6363307982683182, + "epoch": 8.513259195893927, + "grad_norm": 0.60546875, + "learning_rate": 2.97364693818446e-07, + "loss": 0.0193, + "mean_token_accuracy": 0.9943244233727455, + "num_tokens": 445524882.0, + "step": 3737 + }, + { + "entropy": 0.6378805041313171, + "epoch": 8.515540347875678, + "grad_norm": 0.55078125, + "learning_rate": 2.9647441855751274e-07, + "loss": 0.0181, + "mean_token_accuracy": 0.9951576590538025, + "num_tokens": 445644670.0, + "step": 3738 + }, + { + "entropy": 0.63292595744133, + "epoch": 8.517821499857428, + "grad_norm": 0.4609375, + "learning_rate": 2.9558539397515905e-07, + "loss": 0.0108, + "mean_token_accuracy": 0.9962939992547035, + "num_tokens": 445764164.0, + "step": 3739 + }, + { + "entropy": 0.6398011073470116, + "epoch": 8.52010265183918, + "grad_norm": 0.5234375, + "learning_rate": 2.94697620575978e-07, + "loss": 0.0179, + "mean_token_accuracy": 0.9942022562026978, + "num_tokens": 445882978.0, + "step": 3740 + }, + { + "epoch": 8.52010265183918, + "eval_entropy": 0.6344493518764075, + "eval_loss": 0.020523052662611008, + "eval_mean_token_accuracy": 0.9935979834074303, + "eval_num_tokens": 445882978.0, + "eval_runtime": 177.4676, + "eval_samples_per_second": 47.248, + "eval_steps_per_second": 1.482, + "step": 3740 + }, + { + "entropy": 0.6324914321303368, + "epoch": 8.52238380382093, + "grad_norm": 0.47265625, + "learning_rate": 2.938110988638521e-07, + "loss": 0.0141, + "mean_token_accuracy": 0.9951005205512047, + "num_tokens": 446001685.0, + "step": 3741 + }, + { + "entropy": 0.6330416277050972, + "epoch": 8.524664955802681, + "grad_norm": 0.51953125, + "learning_rate": 2.9292582934195427e-07, + "loss": 0.0101, + "mean_token_accuracy": 0.9965989589691162, + "num_tokens": 446120943.0, + "step": 3742 + }, + { + "entropy": 0.6385501697659492, + "epoch": 8.52694610778443, + "grad_norm": 0.5, + "learning_rate": 2.9204181251274665e-07, + "loss": 0.0158, + "mean_token_accuracy": 0.996173582971096, + "num_tokens": 446239696.0, + "step": 3743 + }, + { + "entropy": 0.6377552673220634, + "epoch": 8.529227259766182, + "grad_norm": 0.546875, + "learning_rate": 2.9115904887798005e-07, + "loss": 0.0128, + "mean_token_accuracy": 0.997244082391262, + "num_tokens": 446358753.0, + "step": 3744 + }, + { + "entropy": 0.6374934688210487, + "epoch": 8.531508411747932, + "grad_norm": 0.4765625, + "learning_rate": 2.9027753893869387e-07, + "loss": 0.0157, + "mean_token_accuracy": 0.9945925772190094, + "num_tokens": 446478302.0, + "step": 3745 + }, + { + "entropy": 0.6376848816871643, + "epoch": 8.533789563729684, + "grad_norm": 0.5859375, + "learning_rate": 2.893972831952166e-07, + "loss": 0.018, + "mean_token_accuracy": 0.9931379482150078, + "num_tokens": 446597890.0, + "step": 3746 + }, + { + "entropy": 0.6318284720182419, + "epoch": 8.536070715711434, + "grad_norm": 0.474609375, + "learning_rate": 2.8851828214716383e-07, + "loss": 0.0156, + "mean_token_accuracy": 0.9954425022006035, + "num_tokens": 446717181.0, + "step": 3747 + }, + { + "entropy": 0.6333819627761841, + "epoch": 8.538351867693185, + "grad_norm": 0.50390625, + "learning_rate": 2.876405362934395e-07, + "loss": 0.0137, + "mean_token_accuracy": 0.9952364191412926, + "num_tokens": 446836488.0, + "step": 3748 + }, + { + "entropy": 0.6345789283514023, + "epoch": 8.540633019674935, + "grad_norm": 0.62109375, + "learning_rate": 2.8676404613223573e-07, + "loss": 0.0197, + "mean_token_accuracy": 0.9930208921432495, + "num_tokens": 446955487.0, + "step": 3749 + }, + { + "entropy": 0.6320394948124886, + "epoch": 8.542914171656687, + "grad_norm": 0.5390625, + "learning_rate": 2.858888121610315e-07, + "loss": 0.0145, + "mean_token_accuracy": 0.9949485659599304, + "num_tokens": 447074098.0, + "step": 3750 + }, + { + "entropy": 0.6333305016160011, + "epoch": 8.545195323638437, + "grad_norm": 0.58203125, + "learning_rate": 2.8501483487659217e-07, + "loss": 0.0169, + "mean_token_accuracy": 0.9958731606602669, + "num_tokens": 447192892.0, + "step": 3751 + }, + { + "entropy": 0.6354012787342072, + "epoch": 8.547476475620188, + "grad_norm": 0.4296875, + "learning_rate": 2.841421147749709e-07, + "loss": 0.0111, + "mean_token_accuracy": 0.9967767670750618, + "num_tokens": 447312446.0, + "step": 3752 + }, + { + "entropy": 0.6370061635971069, + "epoch": 8.54975762760194, + "grad_norm": 0.5, + "learning_rate": 2.832706523515061e-07, + "loss": 0.0189, + "mean_token_accuracy": 0.9943125918507576, + "num_tokens": 447431279.0, + "step": 3753 + }, + { + "entropy": 0.6313389763236046, + "epoch": 8.55203877958369, + "grad_norm": 0.5234375, + "learning_rate": 2.824004481008233e-07, + "loss": 0.0175, + "mean_token_accuracy": 0.994380384683609, + "num_tokens": 447551939.0, + "step": 3754 + }, + { + "entropy": 0.6361707895994186, + "epoch": 8.554319931565441, + "grad_norm": 0.52734375, + "learning_rate": 2.815315025168339e-07, + "loss": 0.0216, + "mean_token_accuracy": 0.9927258938550949, + "num_tokens": 447671467.0, + "step": 3755 + }, + { + "entropy": 0.6338387057185173, + "epoch": 8.556601083547191, + "grad_norm": 0.484375, + "learning_rate": 2.8066381609273497e-07, + "loss": 0.016, + "mean_token_accuracy": 0.9955186098814011, + "num_tokens": 447790542.0, + "step": 3756 + }, + { + "entropy": 0.6334778144955635, + "epoch": 8.558882235528943, + "grad_norm": 0.6015625, + "learning_rate": 2.7979738932100734e-07, + "loss": 0.0177, + "mean_token_accuracy": 0.9950881749391556, + "num_tokens": 447909948.0, + "step": 3757 + }, + { + "entropy": 0.6343951150774956, + "epoch": 8.561163387510693, + "grad_norm": 0.63671875, + "learning_rate": 2.7893222269341906e-07, + "loss": 0.019, + "mean_token_accuracy": 0.9934731870889664, + "num_tokens": 448029294.0, + "step": 3758 + }, + { + "entropy": 0.6344007849693298, + "epoch": 8.563444539492444, + "grad_norm": 0.62890625, + "learning_rate": 2.7806831670102176e-07, + "loss": 0.0152, + "mean_token_accuracy": 0.995569221675396, + "num_tokens": 448148501.0, + "step": 3759 + }, + { + "entropy": 0.6315223351120949, + "epoch": 8.565725691474194, + "grad_norm": 0.4453125, + "learning_rate": 2.7720567183415175e-07, + "loss": 0.0131, + "mean_token_accuracy": 0.9964464828372002, + "num_tokens": 448267580.0, + "step": 3760 + }, + { + "entropy": 0.6321784555912018, + "epoch": 8.568006843455946, + "grad_norm": 0.57421875, + "learning_rate": 2.7634428858242995e-07, + "loss": 0.0139, + "mean_token_accuracy": 0.9957806542515755, + "num_tokens": 448386319.0, + "step": 3761 + }, + { + "entropy": 0.6295964419841766, + "epoch": 8.570287995437695, + "grad_norm": 0.447265625, + "learning_rate": 2.754841674347608e-07, + "loss": 0.013, + "mean_token_accuracy": 0.9944565370678902, + "num_tokens": 448505396.0, + "step": 3762 + }, + { + "entropy": 0.6382258534431458, + "epoch": 8.572569147419447, + "grad_norm": 0.421875, + "learning_rate": 2.7462530887933216e-07, + "loss": 0.0113, + "mean_token_accuracy": 0.9960286244750023, + "num_tokens": 448624715.0, + "step": 3763 + }, + { + "entropy": 0.6373923793435097, + "epoch": 8.574850299401197, + "grad_norm": 0.5625, + "learning_rate": 2.737677134036154e-07, + "loss": 0.0182, + "mean_token_accuracy": 0.9937182366847992, + "num_tokens": 448743851.0, + "step": 3764 + }, + { + "entropy": 0.6341640204191208, + "epoch": 8.577131451382948, + "grad_norm": 0.46484375, + "learning_rate": 2.729113814943654e-07, + "loss": 0.008, + "mean_token_accuracy": 0.9974091202020645, + "num_tokens": 448863111.0, + "step": 3765 + }, + { + "entropy": 0.6387627497315407, + "epoch": 8.579412603364698, + "grad_norm": 0.609375, + "learning_rate": 2.7205631363761976e-07, + "loss": 0.0202, + "mean_token_accuracy": 0.9929206147789955, + "num_tokens": 448982772.0, + "step": 3766 + }, + { + "entropy": 0.637218564748764, + "epoch": 8.58169375534645, + "grad_norm": 0.3828125, + "learning_rate": 2.7120251031869884e-07, + "loss": 0.0102, + "mean_token_accuracy": 0.9965970292687416, + "num_tokens": 449101971.0, + "step": 3767 + }, + { + "entropy": 0.6393006965517998, + "epoch": 8.583974907328201, + "grad_norm": 0.412109375, + "learning_rate": 2.7034997202220384e-07, + "loss": 0.009, + "mean_token_accuracy": 0.9964673668146133, + "num_tokens": 449220568.0, + "step": 3768 + }, + { + "entropy": 0.6340059041976929, + "epoch": 8.586256059309951, + "grad_norm": 0.6875, + "learning_rate": 2.6949869923202e-07, + "loss": 0.0247, + "mean_token_accuracy": 0.9919053614139557, + "num_tokens": 449340320.0, + "step": 3769 + }, + { + "entropy": 0.6369449868798256, + "epoch": 8.588537211291703, + "grad_norm": 0.55078125, + "learning_rate": 2.686486924313128e-07, + "loss": 0.0102, + "mean_token_accuracy": 0.9962605461478233, + "num_tokens": 449460288.0, + "step": 3770 + }, + { + "entropy": 0.6303629577159882, + "epoch": 8.590818363273453, + "grad_norm": 0.478515625, + "learning_rate": 2.6779995210253015e-07, + "loss": 0.0151, + "mean_token_accuracy": 0.9949217960238457, + "num_tokens": 449579581.0, + "step": 3771 + }, + { + "entropy": 0.6351359635591507, + "epoch": 8.593099515255204, + "grad_norm": 0.53125, + "learning_rate": 2.6695247872740027e-07, + "loss": 0.0169, + "mean_token_accuracy": 0.9958750978112221, + "num_tokens": 449698822.0, + "step": 3772 + }, + { + "entropy": 0.6304238364100456, + "epoch": 8.595380667236954, + "grad_norm": 0.4765625, + "learning_rate": 2.6610627278693265e-07, + "loss": 0.0172, + "mean_token_accuracy": 0.9958297684788704, + "num_tokens": 449817988.0, + "step": 3773 + }, + { + "entropy": 0.6397280320525169, + "epoch": 8.597661819218706, + "grad_norm": 0.50390625, + "learning_rate": 2.6526133476141804e-07, + "loss": 0.018, + "mean_token_accuracy": 0.994235672056675, + "num_tokens": 449936840.0, + "step": 3774 + }, + { + "entropy": 0.6357916221022606, + "epoch": 8.599942971200456, + "grad_norm": 0.51171875, + "learning_rate": 2.644176651304259e-07, + "loss": 0.0159, + "mean_token_accuracy": 0.9953537136316299, + "num_tokens": 450056130.0, + "step": 3775 + }, + { + "entropy": 0.6377042606472969, + "epoch": 8.602224123182207, + "grad_norm": 0.578125, + "learning_rate": 2.6357526437280764e-07, + "loss": 0.0212, + "mean_token_accuracy": 0.9949027448892593, + "num_tokens": 450175481.0, + "step": 3776 + }, + { + "entropy": 0.6294911727309227, + "epoch": 8.604505275163957, + "grad_norm": 0.5546875, + "learning_rate": 2.6273413296669353e-07, + "loss": 0.0169, + "mean_token_accuracy": 0.9951882809400558, + "num_tokens": 450294322.0, + "step": 3777 + }, + { + "entropy": 0.6336573734879494, + "epoch": 8.606786427145709, + "grad_norm": 0.56640625, + "learning_rate": 2.618942713894937e-07, + "loss": 0.016, + "mean_token_accuracy": 0.9945861026644707, + "num_tokens": 450413656.0, + "step": 3778 + }, + { + "entropy": 0.634580098092556, + "epoch": 8.609067579127458, + "grad_norm": 0.51953125, + "learning_rate": 2.610556801178968e-07, + "loss": 0.0152, + "mean_token_accuracy": 0.9944509714841843, + "num_tokens": 450533377.0, + "step": 3779 + }, + { + "entropy": 0.6354850381612778, + "epoch": 8.61134873110921, + "grad_norm": 0.53125, + "learning_rate": 2.602183596278715e-07, + "loss": 0.0119, + "mean_token_accuracy": 0.9959636256098747, + "num_tokens": 450652725.0, + "step": 3780 + }, + { + "entropy": 0.6320644393563271, + "epoch": 8.613629883090962, + "grad_norm": 0.494140625, + "learning_rate": 2.5938231039466436e-07, + "loss": 0.0138, + "mean_token_accuracy": 0.9956261739134789, + "num_tokens": 450772065.0, + "step": 3781 + }, + { + "entropy": 0.6364989429712296, + "epoch": 8.615911035072711, + "grad_norm": 0.51953125, + "learning_rate": 2.585475328928011e-07, + "loss": 0.0146, + "mean_token_accuracy": 0.9957963675260544, + "num_tokens": 450891082.0, + "step": 3782 + }, + { + "entropy": 0.637377068400383, + "epoch": 8.618192187054463, + "grad_norm": 0.427734375, + "learning_rate": 2.577140275960857e-07, + "loss": 0.0148, + "mean_token_accuracy": 0.9955489411950111, + "num_tokens": 451009241.0, + "step": 3783 + }, + { + "entropy": 0.6280513182282448, + "epoch": 8.620473339036213, + "grad_norm": 0.515625, + "learning_rate": 2.5688179497759895e-07, + "loss": 0.0172, + "mean_token_accuracy": 0.9940948709845543, + "num_tokens": 451128384.0, + "step": 3784 + }, + { + "entropy": 0.6395316869020462, + "epoch": 8.622754491017965, + "grad_norm": 0.48828125, + "learning_rate": 2.560508355097002e-07, + "loss": 0.0116, + "mean_token_accuracy": 0.9959321171045303, + "num_tokens": 451248073.0, + "step": 3785 + }, + { + "entropy": 0.6347874626517296, + "epoch": 8.625035642999714, + "grad_norm": 0.470703125, + "learning_rate": 2.552211496640261e-07, + "loss": 0.0118, + "mean_token_accuracy": 0.9952626004815102, + "num_tokens": 451368088.0, + "step": 3786 + }, + { + "entropy": 0.6313516348600388, + "epoch": 8.627316794981466, + "grad_norm": 0.66796875, + "learning_rate": 2.543927379114902e-07, + "loss": 0.0221, + "mean_token_accuracy": 0.9935628995299339, + "num_tokens": 451487033.0, + "step": 3787 + }, + { + "entropy": 0.6323819905519485, + "epoch": 8.629597946963216, + "grad_norm": 0.65234375, + "learning_rate": 2.5356560072228335e-07, + "loss": 0.0181, + "mean_token_accuracy": 0.9947925508022308, + "num_tokens": 451606160.0, + "step": 3788 + }, + { + "entropy": 0.6410714089870453, + "epoch": 8.631879098944967, + "grad_norm": 0.421875, + "learning_rate": 2.5273973856587283e-07, + "loss": 0.0116, + "mean_token_accuracy": 0.9955586269497871, + "num_tokens": 451725348.0, + "step": 3789 + }, + { + "entropy": 0.6354704052209854, + "epoch": 8.634160250926717, + "grad_norm": 0.486328125, + "learning_rate": 2.5191515191100107e-07, + "loss": 0.0133, + "mean_token_accuracy": 0.9947703257203102, + "num_tokens": 451844942.0, + "step": 3790 + }, + { + "entropy": 0.6339346393942833, + "epoch": 8.636441402908469, + "grad_norm": 0.478515625, + "learning_rate": 2.5109184122568797e-07, + "loss": 0.0099, + "mean_token_accuracy": 0.9964713379740715, + "num_tokens": 451964096.0, + "step": 3791 + }, + { + "entropy": 0.6336182355880737, + "epoch": 8.638722554890219, + "grad_norm": 0.375, + "learning_rate": 2.502698069772294e-07, + "loss": 0.0098, + "mean_token_accuracy": 0.9965189397335052, + "num_tokens": 452083207.0, + "step": 3792 + }, + { + "entropy": 0.6346861869096756, + "epoch": 8.64100370687197, + "grad_norm": 0.47265625, + "learning_rate": 2.494490496321958e-07, + "loss": 0.0147, + "mean_token_accuracy": 0.9939879477024078, + "num_tokens": 452202728.0, + "step": 3793 + }, + { + "entropy": 0.6354878172278404, + "epoch": 8.643284858853722, + "grad_norm": 0.482421875, + "learning_rate": 2.4862956965643253e-07, + "loss": 0.0145, + "mean_token_accuracy": 0.9947989135980606, + "num_tokens": 452321624.0, + "step": 3794 + }, + { + "entropy": 0.6321206539869308, + "epoch": 8.645566010835472, + "grad_norm": 0.54296875, + "learning_rate": 2.4781136751506176e-07, + "loss": 0.0222, + "mean_token_accuracy": 0.9938552975654602, + "num_tokens": 452440658.0, + "step": 3795 + }, + { + "entropy": 0.6327709928154945, + "epoch": 8.647847162817223, + "grad_norm": 0.50390625, + "learning_rate": 2.4699444367247834e-07, + "loss": 0.0152, + "mean_token_accuracy": 0.9950667023658752, + "num_tokens": 452559939.0, + "step": 3796 + }, + { + "entropy": 0.6321667209267616, + "epoch": 8.650128314798973, + "grad_norm": 0.67578125, + "learning_rate": 2.461787985923525e-07, + "loss": 0.0201, + "mean_token_accuracy": 0.9921667575836182, + "num_tokens": 452679078.0, + "step": 3797 + }, + { + "entropy": 0.6366337314248085, + "epoch": 8.652409466780725, + "grad_norm": 0.60546875, + "learning_rate": 2.4536443273762864e-07, + "loss": 0.0204, + "mean_token_accuracy": 0.9932368099689484, + "num_tokens": 452798618.0, + "step": 3798 + }, + { + "entropy": 0.6453105807304382, + "epoch": 8.654690618762475, + "grad_norm": 0.4296875, + "learning_rate": 2.4455134657052626e-07, + "loss": 0.015, + "mean_token_accuracy": 0.9952013567090034, + "num_tokens": 452918109.0, + "step": 3799 + }, + { + "entropy": 0.6322993636131287, + "epoch": 8.656971770744226, + "grad_norm": 0.55078125, + "learning_rate": 2.437395405525356e-07, + "loss": 0.0171, + "mean_token_accuracy": 0.994901679456234, + "num_tokens": 453036564.0, + "step": 3800 + }, + { + "entropy": 0.636474646627903, + "epoch": 8.659252922725976, + "grad_norm": 0.4765625, + "learning_rate": 2.429290151444233e-07, + "loss": 0.0101, + "mean_token_accuracy": 0.9966831803321838, + "num_tokens": 453155964.0, + "step": 3801 + }, + { + "entropy": 0.6335003152489662, + "epoch": 8.661534074707728, + "grad_norm": 0.474609375, + "learning_rate": 2.421197708062273e-07, + "loss": 0.016, + "mean_token_accuracy": 0.9944836795330048, + "num_tokens": 453275705.0, + "step": 3802 + }, + { + "entropy": 0.6354513615369797, + "epoch": 8.663815226689477, + "grad_norm": 0.58203125, + "learning_rate": 2.413118079972593e-07, + "loss": 0.0153, + "mean_token_accuracy": 0.9962194710969925, + "num_tokens": 453394735.0, + "step": 3803 + }, + { + "entropy": 0.6331475153565407, + "epoch": 8.666096378671229, + "grad_norm": 0.8359375, + "learning_rate": 2.405051271761036e-07, + "loss": 0.0238, + "mean_token_accuracy": 0.9929165542125702, + "num_tokens": 453514115.0, + "step": 3804 + }, + { + "entropy": 0.6349427998065948, + "epoch": 8.668377530652979, + "grad_norm": 0.55078125, + "learning_rate": 2.396997288006167e-07, + "loss": 0.0122, + "mean_token_accuracy": 0.9950635433197021, + "num_tokens": 453633376.0, + "step": 3805 + }, + { + "entropy": 0.632256306707859, + "epoch": 8.67065868263473, + "grad_norm": 0.546875, + "learning_rate": 2.388956133279266e-07, + "loss": 0.011, + "mean_token_accuracy": 0.996588945388794, + "num_tokens": 453753112.0, + "step": 3806 + }, + { + "entropy": 0.6399961709976196, + "epoch": 8.672939834616482, + "grad_norm": 0.58984375, + "learning_rate": 2.3809278121443403e-07, + "loss": 0.0276, + "mean_token_accuracy": 0.9921036213636398, + "num_tokens": 453872782.0, + "step": 3807 + }, + { + "entropy": 0.6340872868895531, + "epoch": 8.675220986598232, + "grad_norm": 0.53125, + "learning_rate": 2.3729123291581112e-07, + "loss": 0.0183, + "mean_token_accuracy": 0.9941503629088402, + "num_tokens": 453992480.0, + "step": 3808 + }, + { + "entropy": 0.6328607946634293, + "epoch": 8.677502138579984, + "grad_norm": 0.5, + "learning_rate": 2.3649096888700095e-07, + "loss": 0.0185, + "mean_token_accuracy": 0.9933114275336266, + "num_tokens": 454112195.0, + "step": 3809 + }, + { + "entropy": 0.6447214856743813, + "epoch": 8.679783290561733, + "grad_norm": 0.51171875, + "learning_rate": 2.356919895822188e-07, + "loss": 0.0118, + "mean_token_accuracy": 0.9965731725096703, + "num_tokens": 454231610.0, + "step": 3810 + }, + { + "entropy": 0.6407330557703972, + "epoch": 8.682064442543485, + "grad_norm": 0.53515625, + "learning_rate": 2.3489429545494851e-07, + "loss": 0.0143, + "mean_token_accuracy": 0.9951351135969162, + "num_tokens": 454351128.0, + "step": 3811 + }, + { + "entropy": 0.6350977420806885, + "epoch": 8.684345594525235, + "grad_norm": 0.7109375, + "learning_rate": 2.3409788695794688e-07, + "loss": 0.0169, + "mean_token_accuracy": 0.99553082883358, + "num_tokens": 454471398.0, + "step": 3812 + }, + { + "entropy": 0.6366613581776619, + "epoch": 8.686626746506986, + "grad_norm": 0.498046875, + "learning_rate": 2.3330276454323926e-07, + "loss": 0.0152, + "mean_token_accuracy": 0.9944039881229401, + "num_tokens": 454591169.0, + "step": 3813 + }, + { + "entropy": 0.6259548664093018, + "epoch": 8.688907898488736, + "grad_norm": 0.71875, + "learning_rate": 2.3250892866212294e-07, + "loss": 0.0189, + "mean_token_accuracy": 0.9937223047018051, + "num_tokens": 454709592.0, + "step": 3814 + }, + { + "entropy": 0.6317023187875748, + "epoch": 8.691189050470488, + "grad_norm": 0.58984375, + "learning_rate": 2.3171637976516253e-07, + "loss": 0.0162, + "mean_token_accuracy": 0.9945140406489372, + "num_tokens": 454829478.0, + "step": 3815 + }, + { + "entropy": 0.6383225321769714, + "epoch": 8.693470202452238, + "grad_norm": 0.40625, + "learning_rate": 2.3092511830219405e-07, + "loss": 0.0144, + "mean_token_accuracy": 0.9968536347150803, + "num_tokens": 454949003.0, + "step": 3816 + }, + { + "entropy": 0.6351106464862823, + "epoch": 8.69575135443399, + "grad_norm": 0.45703125, + "learning_rate": 2.3013514472232295e-07, + "loss": 0.0178, + "mean_token_accuracy": 0.9949400126934052, + "num_tokens": 455069444.0, + "step": 3817 + }, + { + "entropy": 0.6340363845229149, + "epoch": 8.698032506415739, + "grad_norm": 0.6015625, + "learning_rate": 2.293464594739214e-07, + "loss": 0.0162, + "mean_token_accuracy": 0.9944451004266739, + "num_tokens": 455188855.0, + "step": 3818 + }, + { + "entropy": 0.6384692564606667, + "epoch": 8.70031365839749, + "grad_norm": 0.59765625, + "learning_rate": 2.2855906300463305e-07, + "loss": 0.0205, + "mean_token_accuracy": 0.9934186339378357, + "num_tokens": 455308311.0, + "step": 3819 + }, + { + "entropy": 0.6367793902754784, + "epoch": 8.702594810379242, + "grad_norm": 0.5, + "learning_rate": 2.2777295576136865e-07, + "loss": 0.0159, + "mean_token_accuracy": 0.9952844306826591, + "num_tokens": 455428404.0, + "step": 3820 + }, + { + "entropy": 0.6375135779380798, + "epoch": 8.704875962360992, + "grad_norm": 0.6171875, + "learning_rate": 2.2698813819030802e-07, + "loss": 0.0137, + "mean_token_accuracy": 0.994661770761013, + "num_tokens": 455547892.0, + "step": 3821 + }, + { + "entropy": 0.6351048797369003, + "epoch": 8.707157114342744, + "grad_norm": 0.546875, + "learning_rate": 2.2620461073689732e-07, + "loss": 0.0172, + "mean_token_accuracy": 0.994592472910881, + "num_tokens": 455667017.0, + "step": 3822 + }, + { + "entropy": 0.6306072399020195, + "epoch": 8.709438266324494, + "grad_norm": 0.5703125, + "learning_rate": 2.254223738458522e-07, + "loss": 0.0172, + "mean_token_accuracy": 0.9948727041482925, + "num_tokens": 455786989.0, + "step": 3823 + }, + { + "entropy": 0.6358784288167953, + "epoch": 8.711719418306245, + "grad_norm": 0.62109375, + "learning_rate": 2.2464142796115557e-07, + "loss": 0.0197, + "mean_token_accuracy": 0.9924947619438171, + "num_tokens": 455906618.0, + "step": 3824 + }, + { + "entropy": 0.6347793713212013, + "epoch": 8.714000570287995, + "grad_norm": 0.63671875, + "learning_rate": 2.2386177352605677e-07, + "loss": 0.0188, + "mean_token_accuracy": 0.9937383458018303, + "num_tokens": 456025330.0, + "step": 3825 + }, + { + "entropy": 0.6328953579068184, + "epoch": 8.716281722269747, + "grad_norm": 0.4609375, + "learning_rate": 2.2308341098307318e-07, + "loss": 0.0124, + "mean_token_accuracy": 0.9960956647992134, + "num_tokens": 456144559.0, + "step": 3826 + }, + { + "entropy": 0.6341451555490494, + "epoch": 8.718562874251496, + "grad_norm": 0.58984375, + "learning_rate": 2.2230634077398755e-07, + "loss": 0.0145, + "mean_token_accuracy": 0.9947997704148293, + "num_tokens": 456264051.0, + "step": 3827 + }, + { + "entropy": 0.6357009559869766, + "epoch": 8.720844026233248, + "grad_norm": 0.478515625, + "learning_rate": 2.2153056333985014e-07, + "loss": 0.0131, + "mean_token_accuracy": 0.9942482113838196, + "num_tokens": 456383409.0, + "step": 3828 + }, + { + "entropy": 0.640854649245739, + "epoch": 8.723125178214998, + "grad_norm": 0.55859375, + "learning_rate": 2.2075607912097758e-07, + "loss": 0.0179, + "mean_token_accuracy": 0.9929599165916443, + "num_tokens": 456503086.0, + "step": 3829 + }, + { + "entropy": 0.630342535674572, + "epoch": 8.72540633019675, + "grad_norm": 0.455078125, + "learning_rate": 2.1998288855695189e-07, + "loss": 0.0147, + "mean_token_accuracy": 0.9942848011851311, + "num_tokens": 456622376.0, + "step": 3830 + }, + { + "entropy": 0.6374860033392906, + "epoch": 8.7276874821785, + "grad_norm": 0.5546875, + "learning_rate": 2.1921099208662173e-07, + "loss": 0.0174, + "mean_token_accuracy": 0.9951121434569359, + "num_tokens": 456741890.0, + "step": 3831 + }, + { + "entropy": 0.6320973634719849, + "epoch": 8.729968634160251, + "grad_norm": 0.703125, + "learning_rate": 2.184403901480997e-07, + "loss": 0.0215, + "mean_token_accuracy": 0.9948190003633499, + "num_tokens": 456860974.0, + "step": 3832 + }, + { + "entropy": 0.6315572261810303, + "epoch": 8.732249786142003, + "grad_norm": 0.494140625, + "learning_rate": 2.176710831787651e-07, + "loss": 0.0159, + "mean_token_accuracy": 0.9959522858262062, + "num_tokens": 456979784.0, + "step": 3833 + }, + { + "entropy": 0.6363257169723511, + "epoch": 8.734530938123752, + "grad_norm": 0.51953125, + "learning_rate": 2.1690307161526148e-07, + "loss": 0.0164, + "mean_token_accuracy": 0.9939150437712669, + "num_tokens": 457099658.0, + "step": 3834 + }, + { + "entropy": 0.6343640014529228, + "epoch": 8.736812090105504, + "grad_norm": 0.54296875, + "learning_rate": 2.1613635589349756e-07, + "loss": 0.0144, + "mean_token_accuracy": 0.9939402341842651, + "num_tokens": 457219546.0, + "step": 3835 + }, + { + "entropy": 0.6361004486680031, + "epoch": 8.739093242087254, + "grad_norm": 0.482421875, + "learning_rate": 2.153709364486467e-07, + "loss": 0.0174, + "mean_token_accuracy": 0.9953198209404945, + "num_tokens": 457339668.0, + "step": 3836 + }, + { + "entropy": 0.6348271369934082, + "epoch": 8.741374394069005, + "grad_norm": 0.486328125, + "learning_rate": 2.1460681371514552e-07, + "loss": 0.0178, + "mean_token_accuracy": 0.9935082048177719, + "num_tokens": 457459214.0, + "step": 3837 + }, + { + "entropy": 0.6354571133852005, + "epoch": 8.743655546050755, + "grad_norm": 0.66015625, + "learning_rate": 2.13843988126696e-07, + "loss": 0.0178, + "mean_token_accuracy": 0.9938977956771851, + "num_tokens": 457578444.0, + "step": 3838 + }, + { + "entropy": 0.6369418129324913, + "epoch": 8.745936698032507, + "grad_norm": 0.53515625, + "learning_rate": 2.130824601162626e-07, + "loss": 0.0143, + "mean_token_accuracy": 0.9942202717065811, + "num_tokens": 457697927.0, + "step": 3839 + }, + { + "entropy": 0.6341206580400467, + "epoch": 8.748217850014257, + "grad_norm": 0.5625, + "learning_rate": 2.1232223011607406e-07, + "loss": 0.0211, + "mean_token_accuracy": 0.9928543269634247, + "num_tokens": 457817519.0, + "step": 3840 + }, + { + "entropy": 0.6377106830477715, + "epoch": 8.750499001996008, + "grad_norm": 0.55859375, + "learning_rate": 2.1156329855762243e-07, + "loss": 0.0163, + "mean_token_accuracy": 0.9965719729661942, + "num_tokens": 457937160.0, + "step": 3841 + }, + { + "entropy": 0.634471096098423, + "epoch": 8.752780153977758, + "grad_norm": 0.43359375, + "learning_rate": 2.1080566587166286e-07, + "loss": 0.0185, + "mean_token_accuracy": 0.9940520152449608, + "num_tokens": 458056411.0, + "step": 3842 + }, + { + "entropy": 0.6398299857974052, + "epoch": 8.75506130595951, + "grad_norm": 0.51953125, + "learning_rate": 2.1004933248821247e-07, + "loss": 0.0157, + "mean_token_accuracy": 0.9945663139224052, + "num_tokens": 458176860.0, + "step": 3843 + }, + { + "entropy": 0.6308516040444374, + "epoch": 8.75734245794126, + "grad_norm": 0.6953125, + "learning_rate": 2.0929429883655151e-07, + "loss": 0.0246, + "mean_token_accuracy": 0.9920361787080765, + "num_tokens": 458296658.0, + "step": 3844 + }, + { + "entropy": 0.6382728219032288, + "epoch": 8.759623609923011, + "grad_norm": 0.46875, + "learning_rate": 2.08540565345223e-07, + "loss": 0.0151, + "mean_token_accuracy": 0.9956934824585915, + "num_tokens": 458416492.0, + "step": 3845 + }, + { + "entropy": 0.6321053355932236, + "epoch": 8.761904761904763, + "grad_norm": 0.5703125, + "learning_rate": 2.0778813244203111e-07, + "loss": 0.0188, + "mean_token_accuracy": 0.9935645535588264, + "num_tokens": 458535286.0, + "step": 3846 + }, + { + "entropy": 0.6361094638705254, + "epoch": 8.764185913886513, + "grad_norm": 0.5703125, + "learning_rate": 2.0703700055404285e-07, + "loss": 0.0152, + "mean_token_accuracy": 0.9948973804712296, + "num_tokens": 458653631.0, + "step": 3847 + }, + { + "entropy": 0.6372981667518616, + "epoch": 8.766467065868264, + "grad_norm": 0.61328125, + "learning_rate": 2.0628717010758526e-07, + "loss": 0.0216, + "mean_token_accuracy": 0.9940069168806076, + "num_tokens": 458773572.0, + "step": 3848 + }, + { + "entropy": 0.6349422633647919, + "epoch": 8.768748217850014, + "grad_norm": 0.51953125, + "learning_rate": 2.0553864152824815e-07, + "loss": 0.0209, + "mean_token_accuracy": 0.9930048063397408, + "num_tokens": 458894042.0, + "step": 3849 + }, + { + "entropy": 0.632927767932415, + "epoch": 8.771029369831766, + "grad_norm": 0.49609375, + "learning_rate": 2.0479141524088169e-07, + "loss": 0.0143, + "mean_token_accuracy": 0.9941286593675613, + "num_tokens": 459013525.0, + "step": 3850 + }, + { + "entropy": 0.6352263018488884, + "epoch": 8.773310521813515, + "grad_norm": 0.58203125, + "learning_rate": 2.040454916695972e-07, + "loss": 0.0162, + "mean_token_accuracy": 0.9918675050139427, + "num_tokens": 459132983.0, + "step": 3851 + }, + { + "entropy": 0.6341735124588013, + "epoch": 8.775591673795267, + "grad_norm": 0.53125, + "learning_rate": 2.0330087123776655e-07, + "loss": 0.0149, + "mean_token_accuracy": 0.99581079185009, + "num_tokens": 459253048.0, + "step": 3852 + }, + { + "entropy": 0.6358790546655655, + "epoch": 8.777872825777017, + "grad_norm": 0.498046875, + "learning_rate": 2.0255755436802248e-07, + "loss": 0.0142, + "mean_token_accuracy": 0.9940047934651375, + "num_tokens": 459372695.0, + "step": 3853 + }, + { + "entropy": 0.6342456638813019, + "epoch": 8.780153977758768, + "grad_norm": 0.4296875, + "learning_rate": 2.0181554148225618e-07, + "loss": 0.0137, + "mean_token_accuracy": 0.9962322115898132, + "num_tokens": 459491584.0, + "step": 3854 + }, + { + "entropy": 0.6340317130088806, + "epoch": 8.782435129740518, + "grad_norm": 0.39453125, + "learning_rate": 2.0107483300162018e-07, + "loss": 0.0108, + "mean_token_accuracy": 0.99652498960495, + "num_tokens": 459611216.0, + "step": 3855 + }, + { + "entropy": 0.6403715685009956, + "epoch": 8.78471628172227, + "grad_norm": 0.609375, + "learning_rate": 2.0033542934652679e-07, + "loss": 0.0151, + "mean_token_accuracy": 0.9945246949791908, + "num_tokens": 459731044.0, + "step": 3856 + }, + { + "entropy": 0.6367682740092278, + "epoch": 8.78699743370402, + "grad_norm": 0.546875, + "learning_rate": 1.9959733093664696e-07, + "loss": 0.0149, + "mean_token_accuracy": 0.9936319217085838, + "num_tokens": 459850970.0, + "step": 3857 + }, + { + "entropy": 0.63430967181921, + "epoch": 8.789278585685771, + "grad_norm": 0.51171875, + "learning_rate": 1.9886053819091116e-07, + "loss": 0.0139, + "mean_token_accuracy": 0.995650440454483, + "num_tokens": 459970001.0, + "step": 3858 + }, + { + "entropy": 0.6381070837378502, + "epoch": 8.791559737667523, + "grad_norm": 0.458984375, + "learning_rate": 1.981250515275085e-07, + "loss": 0.0106, + "mean_token_accuracy": 0.996192216873169, + "num_tokens": 460089980.0, + "step": 3859 + }, + { + "entropy": 0.6353754475712776, + "epoch": 8.793840889649273, + "grad_norm": 0.5546875, + "learning_rate": 1.973908713638878e-07, + "loss": 0.014, + "mean_token_accuracy": 0.9953928589820862, + "num_tokens": 460209332.0, + "step": 3860 + }, + { + "entropy": 0.6301795616745949, + "epoch": 8.796122041631024, + "grad_norm": 0.671875, + "learning_rate": 1.9665799811675407e-07, + "loss": 0.016, + "mean_token_accuracy": 0.9950879663228989, + "num_tokens": 460328753.0, + "step": 3861 + }, + { + "entropy": 0.6372567415237427, + "epoch": 8.798403193612774, + "grad_norm": 0.5859375, + "learning_rate": 1.959264322020732e-07, + "loss": 0.0227, + "mean_token_accuracy": 0.9939282238483429, + "num_tokens": 460448220.0, + "step": 3862 + }, + { + "entropy": 0.6387064233422279, + "epoch": 8.800684345594526, + "grad_norm": 0.5234375, + "learning_rate": 1.9519617403506747e-07, + "loss": 0.0149, + "mean_token_accuracy": 0.9956307038664818, + "num_tokens": 460567419.0, + "step": 3863 + }, + { + "entropy": 0.6334012001752853, + "epoch": 8.802965497576276, + "grad_norm": 0.44140625, + "learning_rate": 1.9446722403021757e-07, + "loss": 0.0108, + "mean_token_accuracy": 0.9959229752421379, + "num_tokens": 460686413.0, + "step": 3864 + }, + { + "entropy": 0.6327636018395424, + "epoch": 8.805246649558027, + "grad_norm": 0.41015625, + "learning_rate": 1.9373958260126113e-07, + "loss": 0.0149, + "mean_token_accuracy": 0.9947085902094841, + "num_tokens": 460805859.0, + "step": 3865 + }, + { + "entropy": 0.6344382613897324, + "epoch": 8.807527801539777, + "grad_norm": 0.48828125, + "learning_rate": 1.9301325016119338e-07, + "loss": 0.0159, + "mean_token_accuracy": 0.9939682185649872, + "num_tokens": 460925319.0, + "step": 3866 + }, + { + "entropy": 0.6346740499138832, + "epoch": 8.809808953521529, + "grad_norm": 0.5078125, + "learning_rate": 1.9228822712226675e-07, + "loss": 0.0139, + "mean_token_accuracy": 0.9953140169382095, + "num_tokens": 461044711.0, + "step": 3867 + }, + { + "entropy": 0.6328175067901611, + "epoch": 8.812090105503279, + "grad_norm": 0.53125, + "learning_rate": 1.915645138959904e-07, + "loss": 0.0153, + "mean_token_accuracy": 0.9939972460269928, + "num_tokens": 461164643.0, + "step": 3868 + }, + { + "entropy": 0.6376869678497314, + "epoch": 8.81437125748503, + "grad_norm": 0.44921875, + "learning_rate": 1.908421108931302e-07, + "loss": 0.0148, + "mean_token_accuracy": 0.9957305565476418, + "num_tokens": 461284017.0, + "step": 3869 + }, + { + "entropy": 0.6341729983687401, + "epoch": 8.81665240946678, + "grad_norm": 0.44140625, + "learning_rate": 1.9012101852370763e-07, + "loss": 0.0151, + "mean_token_accuracy": 0.994876854121685, + "num_tokens": 461403542.0, + "step": 3870 + }, + { + "entropy": 0.6361062824726105, + "epoch": 8.818933561448532, + "grad_norm": 0.609375, + "learning_rate": 1.894012371970008e-07, + "loss": 0.0227, + "mean_token_accuracy": 0.9921394735574722, + "num_tokens": 461523153.0, + "step": 3871 + }, + { + "entropy": 0.6362082064151764, + "epoch": 8.821214713430283, + "grad_norm": 0.494140625, + "learning_rate": 1.8868276732154384e-07, + "loss": 0.017, + "mean_token_accuracy": 0.9943319857120514, + "num_tokens": 461642564.0, + "step": 3872 + }, + { + "entropy": 0.6365108788013458, + "epoch": 8.823495865412033, + "grad_norm": 0.408203125, + "learning_rate": 1.879656093051266e-07, + "loss": 0.0123, + "mean_token_accuracy": 0.99557676166296, + "num_tokens": 461762033.0, + "step": 3873 + }, + { + "entropy": 0.630879744887352, + "epoch": 8.825777017393785, + "grad_norm": 0.5, + "learning_rate": 1.872497635547943e-07, + "loss": 0.018, + "mean_token_accuracy": 0.9936963990330696, + "num_tokens": 461881326.0, + "step": 3874 + }, + { + "entropy": 0.6360142529010773, + "epoch": 8.828058169375534, + "grad_norm": 0.4375, + "learning_rate": 1.8653523047684642e-07, + "loss": 0.0108, + "mean_token_accuracy": 0.9960737228393555, + "num_tokens": 462000863.0, + "step": 3875 + }, + { + "entropy": 0.6351206228137016, + "epoch": 8.830339321357286, + "grad_norm": 0.5078125, + "learning_rate": 1.858220104768385e-07, + "loss": 0.0171, + "mean_token_accuracy": 0.9951196238398552, + "num_tokens": 462119986.0, + "step": 3876 + }, + { + "entropy": 0.6357057467103004, + "epoch": 8.832620473339036, + "grad_norm": 0.8359375, + "learning_rate": 1.8511010395958067e-07, + "loss": 0.0263, + "mean_token_accuracy": 0.9933116063475609, + "num_tokens": 462239854.0, + "step": 3877 + }, + { + "entropy": 0.6338930949568748, + "epoch": 8.834901625320787, + "grad_norm": 0.66796875, + "learning_rate": 1.843995113291372e-07, + "loss": 0.0196, + "mean_token_accuracy": 0.9938095510005951, + "num_tokens": 462359360.0, + "step": 3878 + }, + { + "entropy": 0.6387783363461494, + "epoch": 8.837182777302537, + "grad_norm": 0.671875, + "learning_rate": 1.836902329888268e-07, + "loss": 0.0233, + "mean_token_accuracy": 0.9939451441168785, + "num_tokens": 462478471.0, + "step": 3879 + }, + { + "entropy": 0.6340103596448898, + "epoch": 8.839463929284289, + "grad_norm": 0.474609375, + "learning_rate": 1.829822693412217e-07, + "loss": 0.0167, + "mean_token_accuracy": 0.9948710426688194, + "num_tokens": 462598095.0, + "step": 3880 + }, + { + "entropy": 0.6296518221497536, + "epoch": 8.841745081266039, + "grad_norm": 0.4921875, + "learning_rate": 1.8227562078814903e-07, + "loss": 0.0132, + "mean_token_accuracy": 0.9956203550100327, + "num_tokens": 462718221.0, + "step": 3881 + }, + { + "entropy": 0.6419806852936745, + "epoch": 8.84402623324779, + "grad_norm": 0.6171875, + "learning_rate": 1.815702877306888e-07, + "loss": 0.0171, + "mean_token_accuracy": 0.994601421058178, + "num_tokens": 462837803.0, + "step": 3882 + }, + { + "entropy": 0.6381335407495499, + "epoch": 8.84630738522954, + "grad_norm": 0.546875, + "learning_rate": 1.8086627056917382e-07, + "loss": 0.0188, + "mean_token_accuracy": 0.9953000843524933, + "num_tokens": 462956707.0, + "step": 3883 + }, + { + "entropy": 0.6293988451361656, + "epoch": 8.848588537211292, + "grad_norm": 0.423828125, + "learning_rate": 1.8016356970319116e-07, + "loss": 0.0169, + "mean_token_accuracy": 0.9950558394193649, + "num_tokens": 463076101.0, + "step": 3884 + }, + { + "entropy": 0.6380920708179474, + "epoch": 8.850869689193043, + "grad_norm": 0.64453125, + "learning_rate": 1.7946218553158062e-07, + "loss": 0.0145, + "mean_token_accuracy": 0.9944487810134888, + "num_tokens": 463196172.0, + "step": 3885 + }, + { + "entropy": 0.6355432420969009, + "epoch": 8.853150841174793, + "grad_norm": 0.6015625, + "learning_rate": 1.7876211845243325e-07, + "loss": 0.0212, + "mean_token_accuracy": 0.9931813552975655, + "num_tokens": 463316891.0, + "step": 3886 + }, + { + "entropy": 0.6303383335471153, + "epoch": 8.855431993156545, + "grad_norm": 0.490234375, + "learning_rate": 1.780633688630942e-07, + "loss": 0.0103, + "mean_token_accuracy": 0.9960558637976646, + "num_tokens": 463435578.0, + "step": 3887 + }, + { + "entropy": 0.6367069631814957, + "epoch": 8.857713145138295, + "grad_norm": 0.50390625, + "learning_rate": 1.773659371601605e-07, + "loss": 0.0193, + "mean_token_accuracy": 0.9922304749488831, + "num_tokens": 463555464.0, + "step": 3888 + }, + { + "entropy": 0.6285217702388763, + "epoch": 8.859994297120046, + "grad_norm": 0.4921875, + "learning_rate": 1.7666982373948038e-07, + "loss": 0.0163, + "mean_token_accuracy": 0.9938005059957504, + "num_tokens": 463674748.0, + "step": 3889 + }, + { + "entropy": 0.6383931934833527, + "epoch": 8.862275449101796, + "grad_norm": 0.46484375, + "learning_rate": 1.7597502899615538e-07, + "loss": 0.0132, + "mean_token_accuracy": 0.9950364306569099, + "num_tokens": 463794421.0, + "step": 3890 + }, + { + "entropy": 0.6335692778229713, + "epoch": 8.864556601083548, + "grad_norm": 0.451171875, + "learning_rate": 1.752815533245364e-07, + "loss": 0.0108, + "mean_token_accuracy": 0.9959416314959526, + "num_tokens": 463913773.0, + "step": 3891 + }, + { + "entropy": 0.6328288242220879, + "epoch": 8.866837753065298, + "grad_norm": 0.7109375, + "learning_rate": 1.745893971182272e-07, + "loss": 0.021, + "mean_token_accuracy": 0.9929630011320114, + "num_tokens": 464032535.0, + "step": 3892 + }, + { + "entropy": 0.6345797851681709, + "epoch": 8.86911890504705, + "grad_norm": 0.63671875, + "learning_rate": 1.7389856077008245e-07, + "loss": 0.019, + "mean_token_accuracy": 0.9933710768818855, + "num_tokens": 464152256.0, + "step": 3893 + }, + { + "entropy": 0.6367725431919098, + "epoch": 8.871400057028799, + "grad_norm": 0.443359375, + "learning_rate": 1.7320904467220762e-07, + "loss": 0.0126, + "mean_token_accuracy": 0.9952173754572868, + "num_tokens": 464271829.0, + "step": 3894 + }, + { + "entropy": 0.6360621824860573, + "epoch": 8.87368120901055, + "grad_norm": 0.61328125, + "learning_rate": 1.725208492159583e-07, + "loss": 0.0211, + "mean_token_accuracy": 0.9940311312675476, + "num_tokens": 464391234.0, + "step": 3895 + }, + { + "entropy": 0.6382220089435577, + "epoch": 8.8759623609923, + "grad_norm": 0.5, + "learning_rate": 1.7183397479194175e-07, + "loss": 0.0149, + "mean_token_accuracy": 0.9950458258390427, + "num_tokens": 464510637.0, + "step": 3896 + }, + { + "entropy": 0.6379696875810623, + "epoch": 8.878243512974052, + "grad_norm": 0.51171875, + "learning_rate": 1.711484217900139e-07, + "loss": 0.0185, + "mean_token_accuracy": 0.9937357306480408, + "num_tokens": 464630039.0, + "step": 3897 + }, + { + "entropy": 0.6357573866844177, + "epoch": 8.880524664955804, + "grad_norm": 0.484375, + "learning_rate": 1.7046419059928154e-07, + "loss": 0.0109, + "mean_token_accuracy": 0.9963376894593239, + "num_tokens": 464749496.0, + "step": 3898 + }, + { + "entropy": 0.6357030496001244, + "epoch": 8.882805816937553, + "grad_norm": 0.447265625, + "learning_rate": 1.6978128160810098e-07, + "loss": 0.0129, + "mean_token_accuracy": 0.9960111975669861, + "num_tokens": 464868936.0, + "step": 3899 + }, + { + "entropy": 0.634676955640316, + "epoch": 8.885086968919305, + "grad_norm": 0.6015625, + "learning_rate": 1.6909969520407854e-07, + "loss": 0.0243, + "mean_token_accuracy": 0.9928058758378029, + "num_tokens": 464988037.0, + "step": 3900 + }, + { + "entropy": 0.6323513016104698, + "epoch": 8.887368120901055, + "grad_norm": 0.462890625, + "learning_rate": 1.6841943177406976e-07, + "loss": 0.0146, + "mean_token_accuracy": 0.9958186149597168, + "num_tokens": 465107097.0, + "step": 3901 + }, + { + "entropy": 0.6417568027973175, + "epoch": 8.889649272882806, + "grad_norm": 0.451171875, + "learning_rate": 1.6774049170417806e-07, + "loss": 0.0122, + "mean_token_accuracy": 0.9958525747060776, + "num_tokens": 465226485.0, + "step": 3902 + }, + { + "entropy": 0.6361431404948235, + "epoch": 8.891930424864556, + "grad_norm": 0.50390625, + "learning_rate": 1.6706287537975763e-07, + "loss": 0.0144, + "mean_token_accuracy": 0.9953998029232025, + "num_tokens": 465345917.0, + "step": 3903 + }, + { + "entropy": 0.6365928277373314, + "epoch": 8.894211576846308, + "grad_norm": 0.47265625, + "learning_rate": 1.6638658318540973e-07, + "loss": 0.0129, + "mean_token_accuracy": 0.9951719343662262, + "num_tokens": 465465159.0, + "step": 3904 + }, + { + "entropy": 0.6328705474734306, + "epoch": 8.896492728828058, + "grad_norm": 0.5234375, + "learning_rate": 1.657116155049851e-07, + "loss": 0.0209, + "mean_token_accuracy": 0.9923318848013878, + "num_tokens": 465585047.0, + "step": 3905 + }, + { + "entropy": 0.638294480741024, + "epoch": 8.89877388080981, + "grad_norm": 0.47265625, + "learning_rate": 1.6503797272158284e-07, + "loss": 0.0167, + "mean_token_accuracy": 0.9935402944684029, + "num_tokens": 465704468.0, + "step": 3906 + }, + { + "entropy": 0.6338247135281563, + "epoch": 8.90105503279156, + "grad_norm": 0.49609375, + "learning_rate": 1.643656552175485e-07, + "loss": 0.0166, + "mean_token_accuracy": 0.9951290637254715, + "num_tokens": 465823692.0, + "step": 3907 + }, + { + "entropy": 0.6384601965546608, + "epoch": 8.90333618477331, + "grad_norm": 0.498046875, + "learning_rate": 1.6369466337447708e-07, + "loss": 0.0164, + "mean_token_accuracy": 0.9945313185453415, + "num_tokens": 465943455.0, + "step": 3908 + }, + { + "entropy": 0.6376626715064049, + "epoch": 8.90561733675506, + "grad_norm": 0.60546875, + "learning_rate": 1.6302499757321066e-07, + "loss": 0.0238, + "mean_token_accuracy": 0.991908423602581, + "num_tokens": 466062869.0, + "step": 3909 + }, + { + "entropy": 0.6371306777000427, + "epoch": 8.907898488736812, + "grad_norm": 0.515625, + "learning_rate": 1.623566581938385e-07, + "loss": 0.0105, + "mean_token_accuracy": 0.9951636865735054, + "num_tokens": 466182071.0, + "step": 3910 + }, + { + "entropy": 0.6292389780282974, + "epoch": 8.910179640718562, + "grad_norm": 0.52734375, + "learning_rate": 1.6168964561569716e-07, + "loss": 0.0167, + "mean_token_accuracy": 0.9941458478569984, + "num_tokens": 466301368.0, + "step": 3911 + }, + { + "entropy": 0.6355702951550484, + "epoch": 8.912460792700314, + "grad_norm": 0.54296875, + "learning_rate": 1.6102396021737077e-07, + "loss": 0.0222, + "mean_token_accuracy": 0.993693083524704, + "num_tokens": 466420819.0, + "step": 3912 + }, + { + "entropy": 0.6350984647870064, + "epoch": 8.914741944682065, + "grad_norm": 0.400390625, + "learning_rate": 1.6035960237668818e-07, + "loss": 0.0129, + "mean_token_accuracy": 0.9957837611436844, + "num_tokens": 466540262.0, + "step": 3913 + }, + { + "entropy": 0.6279176771640778, + "epoch": 8.917023096663815, + "grad_norm": 0.66015625, + "learning_rate": 1.5969657247072695e-07, + "loss": 0.0225, + "mean_token_accuracy": 0.9938788488507271, + "num_tokens": 466659185.0, + "step": 3914 + }, + { + "entropy": 0.6317555457353592, + "epoch": 8.919304248645567, + "grad_norm": 0.455078125, + "learning_rate": 1.5903487087580994e-07, + "loss": 0.0101, + "mean_token_accuracy": 0.9960716515779495, + "num_tokens": 466778152.0, + "step": 3915 + }, + { + "entropy": 0.6359038949012756, + "epoch": 8.921585400627317, + "grad_norm": 0.52734375, + "learning_rate": 1.5837449796750588e-07, + "loss": 0.0192, + "mean_token_accuracy": 0.9942805990576744, + "num_tokens": 466897658.0, + "step": 3916 + }, + { + "entropy": 0.6328011602163315, + "epoch": 8.923866552609068, + "grad_norm": 0.52734375, + "learning_rate": 1.577154541206305e-07, + "loss": 0.0164, + "mean_token_accuracy": 0.9927475526928902, + "num_tokens": 467017002.0, + "step": 3917 + }, + { + "entropy": 0.6370216459035873, + "epoch": 8.926147704590818, + "grad_norm": 0.51953125, + "learning_rate": 1.5705773970924349e-07, + "loss": 0.0166, + "mean_token_accuracy": 0.9942125156521797, + "num_tokens": 467136775.0, + "step": 3918 + }, + { + "entropy": 0.6354293823242188, + "epoch": 8.92842885657257, + "grad_norm": 0.62109375, + "learning_rate": 1.5640135510665094e-07, + "loss": 0.0165, + "mean_token_accuracy": 0.9958135485649109, + "num_tokens": 467255788.0, + "step": 3919 + }, + { + "entropy": 0.633288599550724, + "epoch": 8.93071000855432, + "grad_norm": 0.474609375, + "learning_rate": 1.5574630068540458e-07, + "loss": 0.0125, + "mean_token_accuracy": 0.9967261105775833, + "num_tokens": 467375180.0, + "step": 3920 + }, + { + "entropy": 0.6306182816624641, + "epoch": 8.932991160536071, + "grad_norm": 0.578125, + "learning_rate": 1.5509257681730034e-07, + "loss": 0.0174, + "mean_token_accuracy": 0.99400644749403, + "num_tokens": 467494672.0, + "step": 3921 + }, + { + "entropy": 0.6372653916478157, + "epoch": 8.93527231251782, + "grad_norm": 0.400390625, + "learning_rate": 1.5444018387337946e-07, + "loss": 0.0135, + "mean_token_accuracy": 0.9965631738305092, + "num_tokens": 467613681.0, + "step": 3922 + }, + { + "entropy": 0.6423621624708176, + "epoch": 8.937553464499572, + "grad_norm": 0.498046875, + "learning_rate": 1.537891222239271e-07, + "loss": 0.0133, + "mean_token_accuracy": 0.9963992089033127, + "num_tokens": 467733354.0, + "step": 3923 + }, + { + "entropy": 0.6326280534267426, + "epoch": 8.939834616481322, + "grad_norm": 0.443359375, + "learning_rate": 1.5313939223847384e-07, + "loss": 0.0162, + "mean_token_accuracy": 0.9952918961644173, + "num_tokens": 467852568.0, + "step": 3924 + }, + { + "entropy": 0.6323113590478897, + "epoch": 8.942115768463074, + "grad_norm": 0.58203125, + "learning_rate": 1.5249099428579383e-07, + "loss": 0.0218, + "mean_token_accuracy": 0.9930441379547119, + "num_tokens": 467973170.0, + "step": 3925 + }, + { + "entropy": 0.6311705037951469, + "epoch": 8.944396920444824, + "grad_norm": 0.51953125, + "learning_rate": 1.5184392873390463e-07, + "loss": 0.017, + "mean_token_accuracy": 0.993541844189167, + "num_tokens": 468092359.0, + "step": 3926 + }, + { + "entropy": 0.6315928772091866, + "epoch": 8.946678072426575, + "grad_norm": 0.47265625, + "learning_rate": 1.5119819595006857e-07, + "loss": 0.0149, + "mean_token_accuracy": 0.9952520355582237, + "num_tokens": 468211465.0, + "step": 3927 + }, + { + "entropy": 0.6374095231294632, + "epoch": 8.948959224408327, + "grad_norm": 0.51953125, + "learning_rate": 1.5055379630079163e-07, + "loss": 0.0149, + "mean_token_accuracy": 0.9966191127896309, + "num_tokens": 468330427.0, + "step": 3928 + }, + { + "entropy": 0.6345637887716293, + "epoch": 8.951240376390077, + "grad_norm": 0.48828125, + "learning_rate": 1.4991073015182184e-07, + "loss": 0.0152, + "mean_token_accuracy": 0.9960469305515289, + "num_tokens": 468449555.0, + "step": 3929 + }, + { + "entropy": 0.6353870034217834, + "epoch": 8.953521528371828, + "grad_norm": 0.58984375, + "learning_rate": 1.4926899786815107e-07, + "loss": 0.0225, + "mean_token_accuracy": 0.994409941136837, + "num_tokens": 468569303.0, + "step": 3930 + }, + { + "entropy": 0.6344339698553085, + "epoch": 8.955802680353578, + "grad_norm": 0.462890625, + "learning_rate": 1.4862859981401468e-07, + "loss": 0.0139, + "mean_token_accuracy": 0.9960893094539642, + "num_tokens": 468688564.0, + "step": 3931 + }, + { + "entropy": 0.6378034427762032, + "epoch": 8.95808383233533, + "grad_norm": 0.50390625, + "learning_rate": 1.4798953635288994e-07, + "loss": 0.0188, + "mean_token_accuracy": 0.9950406551361084, + "num_tokens": 468808466.0, + "step": 3932 + }, + { + "entropy": 0.6367941275238991, + "epoch": 8.96036498431708, + "grad_norm": 0.5390625, + "learning_rate": 1.4735180784749754e-07, + "loss": 0.0167, + "mean_token_accuracy": 0.9957256466150284, + "num_tokens": 468927775.0, + "step": 3933 + }, + { + "entropy": 0.639420360326767, + "epoch": 8.962646136298831, + "grad_norm": 0.392578125, + "learning_rate": 1.4671541465979877e-07, + "loss": 0.0097, + "mean_token_accuracy": 0.996525302529335, + "num_tokens": 469046924.0, + "step": 3934 + }, + { + "entropy": 0.6311897337436676, + "epoch": 8.964927288280581, + "grad_norm": 0.546875, + "learning_rate": 1.460803571509989e-07, + "loss": 0.0164, + "mean_token_accuracy": 0.9948275312781334, + "num_tokens": 469166477.0, + "step": 3935 + }, + { + "entropy": 0.6321447938680649, + "epoch": 8.967208440262333, + "grad_norm": 0.494140625, + "learning_rate": 1.4544663568154427e-07, + "loss": 0.0152, + "mean_token_accuracy": 0.995464913547039, + "num_tokens": 469285374.0, + "step": 3936 + }, + { + "entropy": 0.6343131139874458, + "epoch": 8.969489592244082, + "grad_norm": 0.6171875, + "learning_rate": 1.448142506111225e-07, + "loss": 0.0187, + "mean_token_accuracy": 0.9937012419104576, + "num_tokens": 469405030.0, + "step": 3937 + }, + { + "entropy": 0.6328693479299545, + "epoch": 8.971770744225834, + "grad_norm": 0.482421875, + "learning_rate": 1.441832022986636e-07, + "loss": 0.0135, + "mean_token_accuracy": 0.9951421171426773, + "num_tokens": 469524120.0, + "step": 3938 + }, + { + "entropy": 0.6343910470604897, + "epoch": 8.974051896207584, + "grad_norm": 0.470703125, + "learning_rate": 1.4355349110233868e-07, + "loss": 0.0121, + "mean_token_accuracy": 0.9945838823914528, + "num_tokens": 469643238.0, + "step": 3939 + }, + { + "entropy": 0.6360427811741829, + "epoch": 8.976333048189336, + "grad_norm": 0.73046875, + "learning_rate": 1.42925117379559e-07, + "loss": 0.0253, + "mean_token_accuracy": 0.9925220608711243, + "num_tokens": 469761882.0, + "step": 3940 + }, + { + "entropy": 0.6371672004461288, + "epoch": 8.978614200171087, + "grad_norm": 0.423828125, + "learning_rate": 1.4229808148697732e-07, + "loss": 0.0146, + "mean_token_accuracy": 0.9964242577552795, + "num_tokens": 469881511.0, + "step": 3941 + }, + { + "entropy": 0.6307489052414894, + "epoch": 8.980895352152837, + "grad_norm": 0.451171875, + "learning_rate": 1.416723837804876e-07, + "loss": 0.0133, + "mean_token_accuracy": 0.9959947764873505, + "num_tokens": 470001052.0, + "step": 3942 + }, + { + "entropy": 0.6422105729579926, + "epoch": 8.983176504134589, + "grad_norm": 0.447265625, + "learning_rate": 1.410480246152235e-07, + "loss": 0.018, + "mean_token_accuracy": 0.9949899092316628, + "num_tokens": 470120671.0, + "step": 3943 + }, + { + "entropy": 0.6375622376799583, + "epoch": 8.985457656116338, + "grad_norm": 0.4921875, + "learning_rate": 1.4042500434555961e-07, + "loss": 0.018, + "mean_token_accuracy": 0.9944124519824982, + "num_tokens": 470240150.0, + "step": 3944 + }, + { + "entropy": 0.6432486325502396, + "epoch": 8.98773880809809, + "grad_norm": 0.58984375, + "learning_rate": 1.398033233251095e-07, + "loss": 0.0183, + "mean_token_accuracy": 0.9927793890237808, + "num_tokens": 470359555.0, + "step": 3945 + }, + { + "entropy": 0.6377815455198288, + "epoch": 8.99001996007984, + "grad_norm": 0.5703125, + "learning_rate": 1.3918298190672806e-07, + "loss": 0.0207, + "mean_token_accuracy": 0.9945133998990059, + "num_tokens": 470478733.0, + "step": 3946 + }, + { + "entropy": 0.6361651122570038, + "epoch": 8.992301112061591, + "grad_norm": 0.494140625, + "learning_rate": 1.3856398044250846e-07, + "loss": 0.0126, + "mean_token_accuracy": 0.9957083240151405, + "num_tokens": 470598008.0, + "step": 3947 + }, + { + "entropy": 0.6311035826802254, + "epoch": 8.994582264043341, + "grad_norm": 0.462890625, + "learning_rate": 1.3794631928378434e-07, + "loss": 0.0172, + "mean_token_accuracy": 0.994395948946476, + "num_tokens": 470717810.0, + "step": 3948 + }, + { + "entropy": 0.6331800743937492, + "epoch": 8.996863416025093, + "grad_norm": 0.61328125, + "learning_rate": 1.3732999878112856e-07, + "loss": 0.0173, + "mean_token_accuracy": 0.9938096106052399, + "num_tokens": 470837618.0, + "step": 3949 + }, + { + "entropy": 0.6366922929883003, + "epoch": 8.999144568006843, + "grad_norm": 0.5234375, + "learning_rate": 1.3671501928435193e-07, + "loss": 0.016, + "mean_token_accuracy": 0.9943871423602104, + "num_tokens": 470957463.0, + "step": 3950 + }, + { + "entropy": 0.6324210365613302, + "epoch": 9.0, + "grad_norm": 0.435546875, + "learning_rate": 1.361013811425052e-07, + "loss": 0.0061, + "mean_token_accuracy": 0.9968595306078593, + "num_tokens": 471001302.0, + "step": 3951 + }, + { + "entropy": 0.6359720975160599, + "epoch": 9.002281151981752, + "grad_norm": 0.486328125, + "learning_rate": 1.3548908470387783e-07, + "loss": 0.0129, + "mean_token_accuracy": 0.9969010800123215, + "num_tokens": 471120456.0, + "step": 3952 + }, + { + "entropy": 0.6363326981663704, + "epoch": 9.004562303963501, + "grad_norm": 0.52734375, + "learning_rate": 1.348781303159974e-07, + "loss": 0.013, + "mean_token_accuracy": 0.9955528452992439, + "num_tokens": 471240314.0, + "step": 3953 + }, + { + "entropy": 0.6346710175275803, + "epoch": 9.006843455945253, + "grad_norm": 0.380859375, + "learning_rate": 1.3426851832562982e-07, + "loss": 0.0119, + "mean_token_accuracy": 0.9960390627384186, + "num_tokens": 471360188.0, + "step": 3954 + }, + { + "entropy": 0.6331351846456528, + "epoch": 9.009124607927003, + "grad_norm": 0.5859375, + "learning_rate": 1.3366024907877917e-07, + "loss": 0.0172, + "mean_token_accuracy": 0.9935377240180969, + "num_tokens": 471480450.0, + "step": 3955 + }, + { + "entropy": 0.6364398822188377, + "epoch": 9.011405759908754, + "grad_norm": 0.435546875, + "learning_rate": 1.3305332292068706e-07, + "loss": 0.0106, + "mean_token_accuracy": 0.9965092018246651, + "num_tokens": 471599456.0, + "step": 3956 + }, + { + "entropy": 0.6373737305402756, + "epoch": 9.013686911890504, + "grad_norm": 0.53515625, + "learning_rate": 1.3244774019583296e-07, + "loss": 0.0182, + "mean_token_accuracy": 0.9950042888522148, + "num_tokens": 471718733.0, + "step": 3957 + }, + { + "entropy": 0.6317529007792473, + "epoch": 9.015968063872256, + "grad_norm": 0.435546875, + "learning_rate": 1.318435012479341e-07, + "loss": 0.0087, + "mean_token_accuracy": 0.9968730807304382, + "num_tokens": 471837096.0, + "step": 3958 + }, + { + "entropy": 0.6369073614478111, + "epoch": 9.018249215854006, + "grad_norm": 0.53125, + "learning_rate": 1.3124060641994507e-07, + "loss": 0.021, + "mean_token_accuracy": 0.9933946430683136, + "num_tokens": 471957719.0, + "step": 3959 + }, + { + "entropy": 0.6317324116826057, + "epoch": 9.020530367835757, + "grad_norm": 0.55078125, + "learning_rate": 1.306390560540577e-07, + "loss": 0.0163, + "mean_token_accuracy": 0.9946657940745354, + "num_tokens": 472077432.0, + "step": 3960 + }, + { + "epoch": 9.020530367835757, + "eval_entropy": 0.6363623844806686, + "eval_loss": 0.020557913929224014, + "eval_mean_token_accuracy": 0.9936402587382965, + "eval_num_tokens": 472077432.0, + "eval_runtime": 177.4386, + "eval_samples_per_second": 47.256, + "eval_steps_per_second": 1.482, + "step": 3960 + }, + { + "entropy": 0.6331671848893166, + "epoch": 9.022811519817507, + "grad_norm": 0.5, + "learning_rate": 1.300388504916991e-07, + "loss": 0.0162, + "mean_token_accuracy": 0.9952245280146599, + "num_tokens": 472196438.0, + "step": 3961 + }, + { + "entropy": 0.6382924243807793, + "epoch": 9.025092671799259, + "grad_norm": 0.400390625, + "learning_rate": 1.2943999007353518e-07, + "loss": 0.0109, + "mean_token_accuracy": 0.9963441714644432, + "num_tokens": 472315686.0, + "step": 3962 + }, + { + "entropy": 0.6404210478067398, + "epoch": 9.027373823781009, + "grad_norm": 0.5078125, + "learning_rate": 1.2884247513946761e-07, + "loss": 0.019, + "mean_token_accuracy": 0.9934428930282593, + "num_tokens": 472434950.0, + "step": 3963 + }, + { + "entropy": 0.6355669125914574, + "epoch": 9.02965497576276, + "grad_norm": 0.51953125, + "learning_rate": 1.2824630602863402e-07, + "loss": 0.0141, + "mean_token_accuracy": 0.9952492713928223, + "num_tokens": 472554013.0, + "step": 3964 + }, + { + "entropy": 0.6375175714492798, + "epoch": 9.031936127744512, + "grad_norm": 0.404296875, + "learning_rate": 1.2765148307940927e-07, + "loss": 0.0121, + "mean_token_accuracy": 0.9954687878489494, + "num_tokens": 472673538.0, + "step": 3965 + }, + { + "entropy": 0.6359315514564514, + "epoch": 9.034217279726262, + "grad_norm": 0.52734375, + "learning_rate": 1.270580066294022e-07, + "loss": 0.0178, + "mean_token_accuracy": 0.9940960705280304, + "num_tokens": 472792868.0, + "step": 3966 + }, + { + "entropy": 0.6322316750884056, + "epoch": 9.036498431708013, + "grad_norm": 0.37109375, + "learning_rate": 1.264658770154592e-07, + "loss": 0.0144, + "mean_token_accuracy": 0.9970100671052933, + "num_tokens": 472911674.0, + "step": 3967 + }, + { + "entropy": 0.633782722055912, + "epoch": 9.038779583689763, + "grad_norm": 0.45703125, + "learning_rate": 1.258750945736617e-07, + "loss": 0.0169, + "mean_token_accuracy": 0.9952845722436905, + "num_tokens": 473030659.0, + "step": 3968 + }, + { + "entropy": 0.6363416239619255, + "epoch": 9.041060735671515, + "grad_norm": 0.494140625, + "learning_rate": 1.252856596393262e-07, + "loss": 0.0156, + "mean_token_accuracy": 0.9938852488994598, + "num_tokens": 473150066.0, + "step": 3969 + }, + { + "entropy": 0.6393429711461067, + "epoch": 9.043341887653265, + "grad_norm": 0.53125, + "learning_rate": 1.2469757254700454e-07, + "loss": 0.0183, + "mean_token_accuracy": 0.9935580566525459, + "num_tokens": 473270215.0, + "step": 3970 + }, + { + "entropy": 0.6346340775489807, + "epoch": 9.045623039635016, + "grad_norm": 0.4296875, + "learning_rate": 1.2411083363048386e-07, + "loss": 0.0167, + "mean_token_accuracy": 0.9956382364034653, + "num_tokens": 473389434.0, + "step": 3971 + }, + { + "entropy": 0.6325456276535988, + "epoch": 9.047904191616766, + "grad_norm": 0.65625, + "learning_rate": 1.2352544322278558e-07, + "loss": 0.0223, + "mean_token_accuracy": 0.9946341216564178, + "num_tokens": 473508163.0, + "step": 3972 + }, + { + "entropy": 0.6353575736284256, + "epoch": 9.050185343598518, + "grad_norm": 0.478515625, + "learning_rate": 1.2294140165616613e-07, + "loss": 0.0158, + "mean_token_accuracy": 0.9954056367278099, + "num_tokens": 473627584.0, + "step": 3973 + }, + { + "entropy": 0.6358340159058571, + "epoch": 9.052466495580267, + "grad_norm": 0.48828125, + "learning_rate": 1.223587092621162e-07, + "loss": 0.0077, + "mean_token_accuracy": 0.9968997314572334, + "num_tokens": 473746550.0, + "step": 3974 + }, + { + "entropy": 0.6389370188117027, + "epoch": 9.054747647562019, + "grad_norm": 0.65625, + "learning_rate": 1.2177736637136063e-07, + "loss": 0.0175, + "mean_token_accuracy": 0.9927782490849495, + "num_tokens": 473865970.0, + "step": 3975 + }, + { + "entropy": 0.6317823380231857, + "epoch": 9.057028799543769, + "grad_norm": 0.67578125, + "learning_rate": 1.2119737331385885e-07, + "loss": 0.0156, + "mean_token_accuracy": 0.9938980638980865, + "num_tokens": 473985279.0, + "step": 3976 + }, + { + "entropy": 0.6370793357491493, + "epoch": 9.05930995152552, + "grad_norm": 0.5625, + "learning_rate": 1.2061873041880335e-07, + "loss": 0.0145, + "mean_token_accuracy": 0.9952749907970428, + "num_tokens": 474103973.0, + "step": 3977 + }, + { + "entropy": 0.6369862109422684, + "epoch": 9.061591103507272, + "grad_norm": 0.609375, + "learning_rate": 1.200414380146206e-07, + "loss": 0.0194, + "mean_token_accuracy": 0.9934828355908394, + "num_tokens": 474223908.0, + "step": 3978 + }, + { + "entropy": 0.6386688500642776, + "epoch": 9.063872255489022, + "grad_norm": 0.609375, + "learning_rate": 1.1946549642897043e-07, + "loss": 0.022, + "mean_token_accuracy": 0.9934226721525192, + "num_tokens": 474342919.0, + "step": 3979 + }, + { + "entropy": 0.6365095749497414, + "epoch": 9.066153407470773, + "grad_norm": 0.53125, + "learning_rate": 1.1889090598874692e-07, + "loss": 0.0179, + "mean_token_accuracy": 0.9943461120128632, + "num_tokens": 474463081.0, + "step": 3980 + }, + { + "entropy": 0.6337442398071289, + "epoch": 9.068434559452523, + "grad_norm": 0.474609375, + "learning_rate": 1.1831766702007613e-07, + "loss": 0.0161, + "mean_token_accuracy": 0.9953043609857559, + "num_tokens": 474581968.0, + "step": 3981 + }, + { + "entropy": 0.6288758739829063, + "epoch": 9.070715711434275, + "grad_norm": 0.5, + "learning_rate": 1.1774577984831725e-07, + "loss": 0.0139, + "mean_token_accuracy": 0.9953257292509079, + "num_tokens": 474701061.0, + "step": 3982 + }, + { + "entropy": 0.63780527561903, + "epoch": 9.072996863416025, + "grad_norm": 0.53125, + "learning_rate": 1.1717524479806231e-07, + "loss": 0.0198, + "mean_token_accuracy": 0.9934739544987679, + "num_tokens": 474820076.0, + "step": 3983 + }, + { + "entropy": 0.6366401389241219, + "epoch": 9.075278015397776, + "grad_norm": 0.5390625, + "learning_rate": 1.1660606219313642e-07, + "loss": 0.0183, + "mean_token_accuracy": 0.9962786436080933, + "num_tokens": 474939596.0, + "step": 3984 + }, + { + "entropy": 0.6387142091989517, + "epoch": 9.077559167379526, + "grad_norm": 0.431640625, + "learning_rate": 1.1603823235659644e-07, + "loss": 0.0133, + "mean_token_accuracy": 0.9962259009480476, + "num_tokens": 475058818.0, + "step": 3985 + }, + { + "entropy": 0.6308257505297661, + "epoch": 9.079840319361278, + "grad_norm": 0.70703125, + "learning_rate": 1.1547175561073154e-07, + "loss": 0.0219, + "mean_token_accuracy": 0.9941318929195404, + "num_tokens": 475178182.0, + "step": 3986 + }, + { + "entropy": 0.6359536051750183, + "epoch": 9.082121471343028, + "grad_norm": 0.4140625, + "learning_rate": 1.1490663227706311e-07, + "loss": 0.0142, + "mean_token_accuracy": 0.9947295039892197, + "num_tokens": 475297182.0, + "step": 3987 + }, + { + "entropy": 0.633519820868969, + "epoch": 9.08440262332478, + "grad_norm": 0.427734375, + "learning_rate": 1.1434286267634432e-07, + "loss": 0.0131, + "mean_token_accuracy": 0.9956306144595146, + "num_tokens": 475416912.0, + "step": 3988 + }, + { + "entropy": 0.6353664547204971, + "epoch": 9.086683775306529, + "grad_norm": 0.53515625, + "learning_rate": 1.1378044712855946e-07, + "loss": 0.0146, + "mean_token_accuracy": 0.9959964379668236, + "num_tokens": 475535730.0, + "step": 3989 + }, + { + "entropy": 0.6347709596157074, + "epoch": 9.08896492728828, + "grad_norm": 0.515625, + "learning_rate": 1.1321938595292542e-07, + "loss": 0.0165, + "mean_token_accuracy": 0.9942987114191055, + "num_tokens": 475655264.0, + "step": 3990 + }, + { + "entropy": 0.6388886421918869, + "epoch": 9.091246079270032, + "grad_norm": 0.498046875, + "learning_rate": 1.1265967946788913e-07, + "loss": 0.0133, + "mean_token_accuracy": 0.9953125715255737, + "num_tokens": 475774775.0, + "step": 3991 + }, + { + "entropy": 0.6358689069747925, + "epoch": 9.093527231251782, + "grad_norm": 0.453125, + "learning_rate": 1.1210132799112954e-07, + "loss": 0.0151, + "mean_token_accuracy": 0.9950957223773003, + "num_tokens": 475894439.0, + "step": 3992 + }, + { + "entropy": 0.6381855756044388, + "epoch": 9.095808383233534, + "grad_norm": 0.60546875, + "learning_rate": 1.1154433183955593e-07, + "loss": 0.0199, + "mean_token_accuracy": 0.9939530417323112, + "num_tokens": 476014124.0, + "step": 3993 + }, + { + "entropy": 0.6408203691244125, + "epoch": 9.098089535215284, + "grad_norm": 0.59765625, + "learning_rate": 1.1098869132930846e-07, + "loss": 0.0197, + "mean_token_accuracy": 0.9939213991165161, + "num_tokens": 476133023.0, + "step": 3994 + }, + { + "entropy": 0.6367549821734428, + "epoch": 9.100370687197035, + "grad_norm": 0.53515625, + "learning_rate": 1.1043440677575818e-07, + "loss": 0.0139, + "mean_token_accuracy": 0.9955795183777809, + "num_tokens": 476251765.0, + "step": 3995 + }, + { + "entropy": 0.6383921578526497, + "epoch": 9.102651839178785, + "grad_norm": 0.53515625, + "learning_rate": 1.0988147849350623e-07, + "loss": 0.0188, + "mean_token_accuracy": 0.9939686879515648, + "num_tokens": 476370994.0, + "step": 3996 + }, + { + "entropy": 0.6295920312404633, + "epoch": 9.104932991160537, + "grad_norm": 0.44921875, + "learning_rate": 1.0932990679638406e-07, + "loss": 0.0144, + "mean_token_accuracy": 0.9948722720146179, + "num_tokens": 476489429.0, + "step": 3997 + }, + { + "entropy": 0.6392572671175003, + "epoch": 9.107214143142286, + "grad_norm": 0.46875, + "learning_rate": 1.0877969199745347e-07, + "loss": 0.0154, + "mean_token_accuracy": 0.9942628294229507, + "num_tokens": 476608888.0, + "step": 3998 + }, + { + "entropy": 0.634032316505909, + "epoch": 9.109495295124038, + "grad_norm": 0.6015625, + "learning_rate": 1.0823083440900523e-07, + "loss": 0.021, + "mean_token_accuracy": 0.9935348182916641, + "num_tokens": 476728027.0, + "step": 3999 + }, + { + "entropy": 0.6332411170005798, + "epoch": 9.111776447105788, + "grad_norm": 0.5078125, + "learning_rate": 1.0768333434256039e-07, + "loss": 0.0176, + "mean_token_accuracy": 0.9931336641311646, + "num_tokens": 476847270.0, + "step": 4000 + }, + { + "entropy": 0.6328233703970909, + "epoch": 9.11405759908754, + "grad_norm": 0.640625, + "learning_rate": 1.071371921088693e-07, + "loss": 0.0165, + "mean_token_accuracy": 0.9939233437180519, + "num_tokens": 476966869.0, + "step": 4001 + }, + { + "entropy": 0.6340752989053726, + "epoch": 9.11633875106929, + "grad_norm": 0.578125, + "learning_rate": 1.0659240801791204e-07, + "loss": 0.0188, + "mean_token_accuracy": 0.993132546544075, + "num_tokens": 477086319.0, + "step": 4002 + }, + { + "entropy": 0.6344702392816544, + "epoch": 9.11861990305104, + "grad_norm": 0.69140625, + "learning_rate": 1.0604898237889794e-07, + "loss": 0.0221, + "mean_token_accuracy": 0.9928007200360298, + "num_tokens": 477205535.0, + "step": 4003 + }, + { + "entropy": 0.6349266916513443, + "epoch": 9.12090105503279, + "grad_norm": 0.51953125, + "learning_rate": 1.0550691550026415e-07, + "loss": 0.0168, + "mean_token_accuracy": 0.9936106353998184, + "num_tokens": 477325301.0, + "step": 4004 + }, + { + "entropy": 0.6330092996358871, + "epoch": 9.123182207014542, + "grad_norm": 0.4609375, + "learning_rate": 1.0496620768967736e-07, + "loss": 0.0174, + "mean_token_accuracy": 0.9947778061032295, + "num_tokens": 477443809.0, + "step": 4005 + }, + { + "entropy": 0.6385731622576714, + "epoch": 9.125463358996294, + "grad_norm": 0.64453125, + "learning_rate": 1.0442685925403346e-07, + "loss": 0.0226, + "mean_token_accuracy": 0.9937448427081108, + "num_tokens": 477564004.0, + "step": 4006 + }, + { + "entropy": 0.6382342204451561, + "epoch": 9.127744510978044, + "grad_norm": 0.6015625, + "learning_rate": 1.0388887049945589e-07, + "loss": 0.0208, + "mean_token_accuracy": 0.993801511824131, + "num_tokens": 477683071.0, + "step": 4007 + }, + { + "entropy": 0.6406633406877518, + "epoch": 9.130025662959795, + "grad_norm": 0.67578125, + "learning_rate": 1.0335224173129683e-07, + "loss": 0.0185, + "mean_token_accuracy": 0.9931240826845169, + "num_tokens": 477803263.0, + "step": 4008 + }, + { + "entropy": 0.6378937885165215, + "epoch": 9.132306814941545, + "grad_norm": 0.416015625, + "learning_rate": 1.0281697325413593e-07, + "loss": 0.0142, + "mean_token_accuracy": 0.9965659901499748, + "num_tokens": 477922802.0, + "step": 4009 + }, + { + "entropy": 0.6360496431589127, + "epoch": 9.134587966923297, + "grad_norm": 0.48046875, + "learning_rate": 1.0228306537178185e-07, + "loss": 0.0133, + "mean_token_accuracy": 0.995887815952301, + "num_tokens": 478041883.0, + "step": 4010 + }, + { + "entropy": 0.6361121162772179, + "epoch": 9.136869118905047, + "grad_norm": 0.5390625, + "learning_rate": 1.0175051838727023e-07, + "loss": 0.0163, + "mean_token_accuracy": 0.9952628761529922, + "num_tokens": 478160917.0, + "step": 4011 + }, + { + "entropy": 0.6447338834404945, + "epoch": 9.139150270886798, + "grad_norm": 0.494140625, + "learning_rate": 1.0121933260286432e-07, + "loss": 0.0122, + "mean_token_accuracy": 0.9957680106163025, + "num_tokens": 478280764.0, + "step": 4012 + }, + { + "entropy": 0.6352125331759453, + "epoch": 9.141431422868548, + "grad_norm": 0.5078125, + "learning_rate": 1.0068950832005487e-07, + "loss": 0.0149, + "mean_token_accuracy": 0.9957380890846252, + "num_tokens": 478400386.0, + "step": 4013 + }, + { + "entropy": 0.6329338997602463, + "epoch": 9.1437125748503, + "grad_norm": 0.54296875, + "learning_rate": 1.0016104583956021e-07, + "loss": 0.0188, + "mean_token_accuracy": 0.9959228485822678, + "num_tokens": 478519793.0, + "step": 4014 + }, + { + "entropy": 0.6395345628261566, + "epoch": 9.14599372683205, + "grad_norm": 0.5234375, + "learning_rate": 9.963394546132488e-08, + "loss": 0.0145, + "mean_token_accuracy": 0.995309054851532, + "num_tokens": 478639063.0, + "step": 4015 + }, + { + "entropy": 0.6333770602941513, + "epoch": 9.148274878813801, + "grad_norm": 0.38671875, + "learning_rate": 9.91082074845215e-08, + "loss": 0.009, + "mean_token_accuracy": 0.9968807771801949, + "num_tokens": 478759175.0, + "step": 4016 + }, + { + "entropy": 0.6416776925325394, + "epoch": 9.150556030795551, + "grad_norm": 0.431640625, + "learning_rate": 9.85838322075483e-08, + "loss": 0.009, + "mean_token_accuracy": 0.9966820925474167, + "num_tokens": 478878662.0, + "step": 4017 + }, + { + "entropy": 0.6346591860055923, + "epoch": 9.152837182777303, + "grad_norm": 0.58984375, + "learning_rate": 9.806081992803084e-08, + "loss": 0.0162, + "mean_token_accuracy": 0.9946621656417847, + "num_tokens": 478999101.0, + "step": 4018 + }, + { + "entropy": 0.6330467909574509, + "epoch": 9.155118334759054, + "grad_norm": 0.57421875, + "learning_rate": 9.753917094282112e-08, + "loss": 0.0201, + "mean_token_accuracy": 0.9937231838703156, + "num_tokens": 479117685.0, + "step": 4019 + }, + { + "entropy": 0.6330735236406326, + "epoch": 9.157399486740804, + "grad_norm": 0.47265625, + "learning_rate": 9.701888554799643e-08, + "loss": 0.0165, + "mean_token_accuracy": 0.9959881082177162, + "num_tokens": 479237545.0, + "step": 4020 + }, + { + "entropy": 0.6379288509488106, + "epoch": 9.159680638722556, + "grad_norm": 0.58984375, + "learning_rate": 9.649996403886086e-08, + "loss": 0.02, + "mean_token_accuracy": 0.9925960525870323, + "num_tokens": 479356817.0, + "step": 4021 + }, + { + "entropy": 0.6320216357707977, + "epoch": 9.161961790704305, + "grad_norm": 0.6171875, + "learning_rate": 9.598240670994435e-08, + "loss": 0.0188, + "mean_token_accuracy": 0.9932383447885513, + "num_tokens": 479475951.0, + "step": 4022 + }, + { + "entropy": 0.6352956220507622, + "epoch": 9.164242942686057, + "grad_norm": 0.484375, + "learning_rate": 9.546621385500249e-08, + "loss": 0.016, + "mean_token_accuracy": 0.9938913732767105, + "num_tokens": 479595246.0, + "step": 4023 + }, + { + "entropy": 0.642274908721447, + "epoch": 9.166524094667807, + "grad_norm": 0.578125, + "learning_rate": 9.495138576701673e-08, + "loss": 0.0211, + "mean_token_accuracy": 0.9928874298930168, + "num_tokens": 479714675.0, + "step": 4024 + }, + { + "entropy": 0.6378964632749557, + "epoch": 9.168805246649558, + "grad_norm": 0.53515625, + "learning_rate": 9.443792273819252e-08, + "loss": 0.0137, + "mean_token_accuracy": 0.9956344664096832, + "num_tokens": 479833614.0, + "step": 4025 + }, + { + "entropy": 0.6286690682172775, + "epoch": 9.171086398631308, + "grad_norm": 0.421875, + "learning_rate": 9.392582505996256e-08, + "loss": 0.017, + "mean_token_accuracy": 0.9951606318354607, + "num_tokens": 479952168.0, + "step": 4026 + }, + { + "entropy": 0.6332228854298592, + "epoch": 9.17336755061306, + "grad_norm": 0.6640625, + "learning_rate": 9.341509302298295e-08, + "loss": 0.0196, + "mean_token_accuracy": 0.9941750690340996, + "num_tokens": 480071392.0, + "step": 4027 + }, + { + "entropy": 0.6344810426235199, + "epoch": 9.17564870259481, + "grad_norm": 0.515625, + "learning_rate": 9.290572691713573e-08, + "loss": 0.0177, + "mean_token_accuracy": 0.9946499690413475, + "num_tokens": 480190601.0, + "step": 4028 + }, + { + "entropy": 0.6353466287255287, + "epoch": 9.177929854576561, + "grad_norm": 0.5546875, + "learning_rate": 9.23977270315271e-08, + "loss": 0.0205, + "mean_token_accuracy": 0.9954348206520081, + "num_tokens": 480310557.0, + "step": 4029 + }, + { + "entropy": 0.6379326581954956, + "epoch": 9.180211006558311, + "grad_norm": 0.56640625, + "learning_rate": 9.18910936544884e-08, + "loss": 0.02, + "mean_token_accuracy": 0.9935408234596252, + "num_tokens": 480429921.0, + "step": 4030 + }, + { + "entropy": 0.6379791274666786, + "epoch": 9.182492158540063, + "grad_norm": 0.59765625, + "learning_rate": 9.138582707357429e-08, + "loss": 0.0208, + "mean_token_accuracy": 0.9938461631536484, + "num_tokens": 480549294.0, + "step": 4031 + }, + { + "entropy": 0.6339767053723335, + "epoch": 9.184773310521814, + "grad_norm": 0.5546875, + "learning_rate": 9.088192757556457e-08, + "loss": 0.0183, + "mean_token_accuracy": 0.9933406040072441, + "num_tokens": 480668130.0, + "step": 4032 + }, + { + "entropy": 0.634619727730751, + "epoch": 9.187054462503564, + "grad_norm": 0.48046875, + "learning_rate": 9.037939544646324e-08, + "loss": 0.0187, + "mean_token_accuracy": 0.993624173104763, + "num_tokens": 480787831.0, + "step": 4033 + }, + { + "entropy": 0.6383912563323975, + "epoch": 9.189335614485316, + "grad_norm": 0.451171875, + "learning_rate": 8.987823097149739e-08, + "loss": 0.0141, + "mean_token_accuracy": 0.9959636628627777, + "num_tokens": 480906677.0, + "step": 4034 + }, + { + "entropy": 0.6368100941181183, + "epoch": 9.191616766467066, + "grad_norm": 0.419921875, + "learning_rate": 8.93784344351184e-08, + "loss": 0.01, + "mean_token_accuracy": 0.9975840449333191, + "num_tokens": 481025702.0, + "step": 4035 + }, + { + "entropy": 0.6352212876081467, + "epoch": 9.193897918448817, + "grad_norm": 0.51953125, + "learning_rate": 8.888000612100128e-08, + "loss": 0.017, + "mean_token_accuracy": 0.995277114212513, + "num_tokens": 481146016.0, + "step": 4036 + }, + { + "entropy": 0.6321435198187828, + "epoch": 9.196179070430567, + "grad_norm": 0.5859375, + "learning_rate": 8.838294631204391e-08, + "loss": 0.0166, + "mean_token_accuracy": 0.9969378560781479, + "num_tokens": 481265474.0, + "step": 4037 + }, + { + "entropy": 0.6370986998081207, + "epoch": 9.198460222412319, + "grad_norm": 0.60546875, + "learning_rate": 8.788725529036812e-08, + "loss": 0.0211, + "mean_token_accuracy": 0.9935951456427574, + "num_tokens": 481385783.0, + "step": 4038 + }, + { + "entropy": 0.6317119896411896, + "epoch": 9.200741374394068, + "grad_norm": 0.49609375, + "learning_rate": 8.739293333731886e-08, + "loss": 0.0157, + "mean_token_accuracy": 0.9936124682426453, + "num_tokens": 481504208.0, + "step": 4039 + }, + { + "entropy": 0.636138953268528, + "epoch": 9.20302252637582, + "grad_norm": 0.58203125, + "learning_rate": 8.689998073346361e-08, + "loss": 0.0179, + "mean_token_accuracy": 0.9948786571621895, + "num_tokens": 481623290.0, + "step": 4040 + }, + { + "entropy": 0.6362879052758217, + "epoch": 9.20530367835757, + "grad_norm": 0.54296875, + "learning_rate": 8.640839775859222e-08, + "loss": 0.0179, + "mean_token_accuracy": 0.9941195622086525, + "num_tokens": 481742618.0, + "step": 4041 + }, + { + "entropy": 0.6324362084269524, + "epoch": 9.207584830339322, + "grad_norm": 0.5234375, + "learning_rate": 8.591818469171815e-08, + "loss": 0.0147, + "mean_token_accuracy": 0.9955467730760574, + "num_tokens": 481861968.0, + "step": 4042 + }, + { + "entropy": 0.6333646550774574, + "epoch": 9.209865982321071, + "grad_norm": 0.4296875, + "learning_rate": 8.542934181107687e-08, + "loss": 0.0152, + "mean_token_accuracy": 0.9951947629451752, + "num_tokens": 481981125.0, + "step": 4043 + }, + { + "entropy": 0.6347843408584595, + "epoch": 9.212147134302823, + "grad_norm": 0.609375, + "learning_rate": 8.494186939412591e-08, + "loss": 0.0178, + "mean_token_accuracy": 0.9947155863046646, + "num_tokens": 482101065.0, + "step": 4044 + }, + { + "entropy": 0.6345132365822792, + "epoch": 9.214428286284575, + "grad_norm": 0.5625, + "learning_rate": 8.44557677175456e-08, + "loss": 0.0177, + "mean_token_accuracy": 0.9943040609359741, + "num_tokens": 482220288.0, + "step": 4045 + }, + { + "entropy": 0.6367553919553757, + "epoch": 9.216709438266324, + "grad_norm": 0.46875, + "learning_rate": 8.397103705723774e-08, + "loss": 0.0131, + "mean_token_accuracy": 0.9961254373192787, + "num_tokens": 482339689.0, + "step": 4046 + }, + { + "entropy": 0.6374166235327721, + "epoch": 9.218990590248076, + "grad_norm": 0.4921875, + "learning_rate": 8.348767768832561e-08, + "loss": 0.0136, + "mean_token_accuracy": 0.9951080828905106, + "num_tokens": 482458699.0, + "step": 4047 + }, + { + "entropy": 0.6359874308109283, + "epoch": 9.221271742229826, + "grad_norm": 0.52734375, + "learning_rate": 8.300568988515529e-08, + "loss": 0.0193, + "mean_token_accuracy": 0.992134764790535, + "num_tokens": 482577810.0, + "step": 4048 + }, + { + "entropy": 0.6388935223221779, + "epoch": 9.223552894211577, + "grad_norm": 0.490234375, + "learning_rate": 8.25250739212935e-08, + "loss": 0.0164, + "mean_token_accuracy": 0.995386965572834, + "num_tokens": 482697125.0, + "step": 4049 + }, + { + "entropy": 0.6341124475002289, + "epoch": 9.225834046193327, + "grad_norm": 0.5234375, + "learning_rate": 8.204583006952843e-08, + "loss": 0.0139, + "mean_token_accuracy": 0.9966626390814781, + "num_tokens": 482816134.0, + "step": 4050 + }, + { + "entropy": 0.6297047063708305, + "epoch": 9.228115198175079, + "grad_norm": 0.423828125, + "learning_rate": 8.156795860187028e-08, + "loss": 0.0145, + "mean_token_accuracy": 0.9941071346402168, + "num_tokens": 482935765.0, + "step": 4051 + }, + { + "entropy": 0.6333122029900551, + "epoch": 9.230396350156829, + "grad_norm": 0.482421875, + "learning_rate": 8.109145978954874e-08, + "loss": 0.0163, + "mean_token_accuracy": 0.9947684407234192, + "num_tokens": 483054789.0, + "step": 4052 + }, + { + "entropy": 0.6389750391244888, + "epoch": 9.23267750213858, + "grad_norm": 0.625, + "learning_rate": 8.061633390301582e-08, + "loss": 0.0179, + "mean_token_accuracy": 0.9932487681508064, + "num_tokens": 483174774.0, + "step": 4053 + }, + { + "entropy": 0.6358097940683365, + "epoch": 9.23495865412033, + "grad_norm": 0.5625, + "learning_rate": 8.014258121194385e-08, + "loss": 0.0152, + "mean_token_accuracy": 0.9947173893451691, + "num_tokens": 483293949.0, + "step": 4054 + }, + { + "entropy": 0.6354188993573189, + "epoch": 9.237239806102082, + "grad_norm": 0.376953125, + "learning_rate": 7.967020198522579e-08, + "loss": 0.0111, + "mean_token_accuracy": 0.9952755495905876, + "num_tokens": 483413768.0, + "step": 4055 + }, + { + "entropy": 0.6336700320243835, + "epoch": 9.239520958083832, + "grad_norm": 0.498046875, + "learning_rate": 7.91991964909744e-08, + "loss": 0.0146, + "mean_token_accuracy": 0.9953607395291328, + "num_tokens": 483533772.0, + "step": 4056 + }, + { + "entropy": 0.6324954256415367, + "epoch": 9.241802110065583, + "grad_norm": 0.5546875, + "learning_rate": 7.872956499652418e-08, + "loss": 0.021, + "mean_token_accuracy": 0.993195541203022, + "num_tokens": 483652841.0, + "step": 4057 + }, + { + "entropy": 0.6351828947663307, + "epoch": 9.244083262047335, + "grad_norm": 0.474609375, + "learning_rate": 7.826130776842828e-08, + "loss": 0.0173, + "mean_token_accuracy": 0.9954340308904648, + "num_tokens": 483771803.0, + "step": 4058 + }, + { + "entropy": 0.6376127526164055, + "epoch": 9.246364414029085, + "grad_norm": 0.41015625, + "learning_rate": 7.779442507246021e-08, + "loss": 0.0143, + "mean_token_accuracy": 0.9956338182091713, + "num_tokens": 483891386.0, + "step": 4059 + }, + { + "entropy": 0.6340015828609467, + "epoch": 9.248645566010836, + "grad_norm": 0.484375, + "learning_rate": 7.73289171736144e-08, + "loss": 0.0138, + "mean_token_accuracy": 0.9947069585323334, + "num_tokens": 484009837.0, + "step": 4060 + }, + { + "entropy": 0.6324451789259911, + "epoch": 9.250926717992586, + "grad_norm": 0.53125, + "learning_rate": 7.686478433610339e-08, + "loss": 0.0142, + "mean_token_accuracy": 0.9955054372549057, + "num_tokens": 484129978.0, + "step": 4061 + }, + { + "entropy": 0.6338633224368095, + "epoch": 9.253207869974338, + "grad_norm": 0.609375, + "learning_rate": 7.64020268233609e-08, + "loss": 0.0221, + "mean_token_accuracy": 0.9929016083478928, + "num_tokens": 484249530.0, + "step": 4062 + }, + { + "entropy": 0.6317703202366829, + "epoch": 9.255489021956087, + "grad_norm": 0.65625, + "learning_rate": 7.594064489803821e-08, + "loss": 0.0213, + "mean_token_accuracy": 0.9944830313324928, + "num_tokens": 484369468.0, + "step": 4063 + }, + { + "entropy": 0.6342199742794037, + "epoch": 9.257770173937839, + "grad_norm": 0.5546875, + "learning_rate": 7.548063882200724e-08, + "loss": 0.0155, + "mean_token_accuracy": 0.9943109378218651, + "num_tokens": 484488499.0, + "step": 4064 + }, + { + "entropy": 0.6337149143218994, + "epoch": 9.260051325919589, + "grad_norm": 0.44921875, + "learning_rate": 7.502200885635858e-08, + "loss": 0.0107, + "mean_token_accuracy": 0.9969168230891228, + "num_tokens": 484607785.0, + "step": 4065 + }, + { + "entropy": 0.6350704580545425, + "epoch": 9.26233247790134, + "grad_norm": 0.494140625, + "learning_rate": 7.45647552614015e-08, + "loss": 0.0177, + "mean_token_accuracy": 0.9945379570126534, + "num_tokens": 484727162.0, + "step": 4066 + }, + { + "entropy": 0.6345521658658981, + "epoch": 9.26461362988309, + "grad_norm": 0.51953125, + "learning_rate": 7.410887829666479e-08, + "loss": 0.0163, + "mean_token_accuracy": 0.9933284521102905, + "num_tokens": 484846163.0, + "step": 4067 + }, + { + "entropy": 0.6369775608181953, + "epoch": 9.266894781864842, + "grad_norm": 0.52734375, + "learning_rate": 7.365437822089482e-08, + "loss": 0.0205, + "mean_token_accuracy": 0.9948882013559341, + "num_tokens": 484966063.0, + "step": 4068 + }, + { + "entropy": 0.6363441571593285, + "epoch": 9.269175933846592, + "grad_norm": 0.5859375, + "learning_rate": 7.320125529205746e-08, + "loss": 0.0137, + "mean_token_accuracy": 0.9948942810297012, + "num_tokens": 485085900.0, + "step": 4069 + }, + { + "entropy": 0.6359632983803749, + "epoch": 9.271457085828343, + "grad_norm": 0.55859375, + "learning_rate": 7.274950976733642e-08, + "loss": 0.0166, + "mean_token_accuracy": 0.9953204616904259, + "num_tokens": 485205526.0, + "step": 4070 + }, + { + "entropy": 0.6405183523893356, + "epoch": 9.273738237810093, + "grad_norm": 0.65625, + "learning_rate": 7.22991419031338e-08, + "loss": 0.0221, + "mean_token_accuracy": 0.9926489740610123, + "num_tokens": 485324915.0, + "step": 4071 + }, + { + "entropy": 0.6318849623203278, + "epoch": 9.276019389791845, + "grad_norm": 0.470703125, + "learning_rate": 7.185015195506961e-08, + "loss": 0.0132, + "mean_token_accuracy": 0.9962703585624695, + "num_tokens": 485444025.0, + "step": 4072 + }, + { + "entropy": 0.6358897089958191, + "epoch": 9.278300541773596, + "grad_norm": 0.51171875, + "learning_rate": 7.140254017798221e-08, + "loss": 0.019, + "mean_token_accuracy": 0.9929203540086746, + "num_tokens": 485563392.0, + "step": 4073 + }, + { + "entropy": 0.6415648311376572, + "epoch": 9.280581693755346, + "grad_norm": 0.4296875, + "learning_rate": 7.095630682592669e-08, + "loss": 0.0133, + "mean_token_accuracy": 0.99516561627388, + "num_tokens": 485682743.0, + "step": 4074 + }, + { + "entropy": 0.6388050988316536, + "epoch": 9.282862845737098, + "grad_norm": 0.61328125, + "learning_rate": 7.051145215217715e-08, + "loss": 0.0164, + "mean_token_accuracy": 0.9952031075954437, + "num_tokens": 485801776.0, + "step": 4075 + }, + { + "entropy": 0.6311292797327042, + "epoch": 9.285143997718848, + "grad_norm": 0.6796875, + "learning_rate": 7.006797640922436e-08, + "loss": 0.0219, + "mean_token_accuracy": 0.992256797850132, + "num_tokens": 485920482.0, + "step": 4076 + }, + { + "entropy": 0.6353208348155022, + "epoch": 9.2874251497006, + "grad_norm": 0.5078125, + "learning_rate": 6.962587984877617e-08, + "loss": 0.0147, + "mean_token_accuracy": 0.9945407062768936, + "num_tokens": 486039309.0, + "step": 4077 + }, + { + "entropy": 0.6354212015867233, + "epoch": 9.28970630168235, + "grad_norm": 0.56640625, + "learning_rate": 6.918516272175879e-08, + "loss": 0.0179, + "mean_token_accuracy": 0.9929946213960648, + "num_tokens": 486158885.0, + "step": 4078 + }, + { + "entropy": 0.6364796757698059, + "epoch": 9.2919874536641, + "grad_norm": 0.5625, + "learning_rate": 6.874582527831409e-08, + "loss": 0.0193, + "mean_token_accuracy": 0.9942623302340508, + "num_tokens": 486279106.0, + "step": 4079 + }, + { + "entropy": 0.6348674297332764, + "epoch": 9.29426860564585, + "grad_norm": 0.609375, + "learning_rate": 6.830786776780174e-08, + "loss": 0.0204, + "mean_token_accuracy": 0.9937059953808784, + "num_tokens": 486398211.0, + "step": 4080 + }, + { + "entropy": 0.6357633024454117, + "epoch": 9.296549757627602, + "grad_norm": 0.640625, + "learning_rate": 6.78712904387982e-08, + "loss": 0.0228, + "mean_token_accuracy": 0.9910479038953781, + "num_tokens": 486517505.0, + "step": 4081 + }, + { + "entropy": 0.632320910692215, + "epoch": 9.298830909609352, + "grad_norm": 0.4375, + "learning_rate": 6.74360935390958e-08, + "loss": 0.013, + "mean_token_accuracy": 0.9936673492193222, + "num_tokens": 486636786.0, + "step": 4082 + }, + { + "entropy": 0.6402900964021683, + "epoch": 9.301112061591104, + "grad_norm": 0.392578125, + "learning_rate": 6.700227731570475e-08, + "loss": 0.0105, + "mean_token_accuracy": 0.9963009133934975, + "num_tokens": 486755870.0, + "step": 4083 + }, + { + "entropy": 0.6340215429663658, + "epoch": 9.303393213572853, + "grad_norm": 0.4140625, + "learning_rate": 6.656984201485001e-08, + "loss": 0.0131, + "mean_token_accuracy": 0.9964675232768059, + "num_tokens": 486875056.0, + "step": 4084 + }, + { + "entropy": 0.6358983665704727, + "epoch": 9.305674365554605, + "grad_norm": 0.5234375, + "learning_rate": 6.613878788197359e-08, + "loss": 0.0115, + "mean_token_accuracy": 0.9953816533088684, + "num_tokens": 486993779.0, + "step": 4085 + }, + { + "entropy": 0.6360890418291092, + "epoch": 9.307955517536357, + "grad_norm": 0.431640625, + "learning_rate": 6.570911516173368e-08, + "loss": 0.0113, + "mean_token_accuracy": 0.9963933378458023, + "num_tokens": 487113312.0, + "step": 4086 + }, + { + "entropy": 0.6357756704092026, + "epoch": 9.310236669518106, + "grad_norm": 0.40234375, + "learning_rate": 6.528082409800434e-08, + "loss": 0.0143, + "mean_token_accuracy": 0.9959370419383049, + "num_tokens": 487232114.0, + "step": 4087 + }, + { + "entropy": 0.6350175961852074, + "epoch": 9.312517821499858, + "grad_norm": 0.474609375, + "learning_rate": 6.485391493387505e-08, + "loss": 0.0154, + "mean_token_accuracy": 0.9948367848992348, + "num_tokens": 487351535.0, + "step": 4088 + }, + { + "entropy": 0.6347580328583717, + "epoch": 9.314798973481608, + "grad_norm": 0.6015625, + "learning_rate": 6.442838791165168e-08, + "loss": 0.0149, + "mean_token_accuracy": 0.9946373105049133, + "num_tokens": 487470426.0, + "step": 4089 + }, + { + "entropy": 0.6363870352506638, + "epoch": 9.31708012546336, + "grad_norm": 0.4921875, + "learning_rate": 6.400424327285437e-08, + "loss": 0.0144, + "mean_token_accuracy": 0.995031975209713, + "num_tokens": 487589833.0, + "step": 4090 + }, + { + "entropy": 0.6366482675075531, + "epoch": 9.31936127744511, + "grad_norm": 0.494140625, + "learning_rate": 6.358148125822e-08, + "loss": 0.0154, + "mean_token_accuracy": 0.99464051425457, + "num_tokens": 487709677.0, + "step": 4091 + }, + { + "entropy": 0.6365924999117851, + "epoch": 9.321642429426861, + "grad_norm": 0.49609375, + "learning_rate": 6.316010210769997e-08, + "loss": 0.0141, + "mean_token_accuracy": 0.995802991092205, + "num_tokens": 487829184.0, + "step": 4092 + }, + { + "entropy": 0.6348769441246986, + "epoch": 9.32392358140861, + "grad_norm": 0.52734375, + "learning_rate": 6.274010606046071e-08, + "loss": 0.0157, + "mean_token_accuracy": 0.9963616132736206, + "num_tokens": 487948753.0, + "step": 4093 + }, + { + "entropy": 0.6332232654094696, + "epoch": 9.326204733390362, + "grad_norm": 0.50390625, + "learning_rate": 6.232149335488463e-08, + "loss": 0.0151, + "mean_token_accuracy": 0.9961426556110382, + "num_tokens": 488067593.0, + "step": 4094 + }, + { + "entropy": 0.6359502300620079, + "epoch": 9.328485885372112, + "grad_norm": 0.439453125, + "learning_rate": 6.190426422856749e-08, + "loss": 0.0116, + "mean_token_accuracy": 0.9954705759882927, + "num_tokens": 488186352.0, + "step": 4095 + }, + { + "entropy": 0.6300693824887276, + "epoch": 9.330767037353864, + "grad_norm": 0.58984375, + "learning_rate": 6.148841891832069e-08, + "loss": 0.0157, + "mean_token_accuracy": 0.9940545111894608, + "num_tokens": 488305568.0, + "step": 4096 + }, + { + "entropy": 0.6307547986507416, + "epoch": 9.333048189335614, + "grad_norm": 0.408203125, + "learning_rate": 6.107395766016988e-08, + "loss": 0.0137, + "mean_token_accuracy": 0.9958236515522003, + "num_tokens": 488424320.0, + "step": 4097 + }, + { + "entropy": 0.6389123126864433, + "epoch": 9.335329341317365, + "grad_norm": 0.65234375, + "learning_rate": 6.066088068935577e-08, + "loss": 0.0176, + "mean_token_accuracy": 0.994137704372406, + "num_tokens": 488543984.0, + "step": 4098 + }, + { + "entropy": 0.639007605612278, + "epoch": 9.337610493299117, + "grad_norm": 0.51953125, + "learning_rate": 6.024918824033221e-08, + "loss": 0.0191, + "mean_token_accuracy": 0.994052343070507, + "num_tokens": 488662368.0, + "step": 4099 + }, + { + "entropy": 0.6293163001537323, + "epoch": 9.339891645280867, + "grad_norm": 0.515625, + "learning_rate": 5.983888054676867e-08, + "loss": 0.0181, + "mean_token_accuracy": 0.9939788654446602, + "num_tokens": 488781941.0, + "step": 4100 + }, + { + "entropy": 0.6320939213037491, + "epoch": 9.342172797262618, + "grad_norm": 0.44921875, + "learning_rate": 5.9429957841546926e-08, + "loss": 0.013, + "mean_token_accuracy": 0.9943601563572884, + "num_tokens": 488900846.0, + "step": 4101 + }, + { + "entropy": 0.6358425915241241, + "epoch": 9.344453949244368, + "grad_norm": 0.421875, + "learning_rate": 5.902242035676409e-08, + "loss": 0.0145, + "mean_token_accuracy": 0.9956803768873215, + "num_tokens": 489020146.0, + "step": 4102 + }, + { + "entropy": 0.6352656707167625, + "epoch": 9.34673510122612, + "grad_norm": 0.65625, + "learning_rate": 5.8616268323730685e-08, + "loss": 0.0152, + "mean_token_accuracy": 0.9949975833296776, + "num_tokens": 489139605.0, + "step": 4103 + }, + { + "entropy": 0.6348226442933083, + "epoch": 9.34901625320787, + "grad_norm": 0.72265625, + "learning_rate": 5.821150197297038e-08, + "loss": 0.0201, + "mean_token_accuracy": 0.9950218573212624, + "num_tokens": 489258558.0, + "step": 4104 + }, + { + "entropy": 0.6321392208337784, + "epoch": 9.351297405189621, + "grad_norm": 0.62109375, + "learning_rate": 5.780812153422161e-08, + "loss": 0.0184, + "mean_token_accuracy": 0.9942390024662018, + "num_tokens": 489377674.0, + "step": 4105 + }, + { + "entropy": 0.6366815268993378, + "epoch": 9.353578557171371, + "grad_norm": 0.51953125, + "learning_rate": 5.7406127236434016e-08, + "loss": 0.0176, + "mean_token_accuracy": 0.9943306893110275, + "num_tokens": 489496583.0, + "step": 4106 + }, + { + "entropy": 0.6379704400897026, + "epoch": 9.355859709153123, + "grad_norm": 0.48046875, + "learning_rate": 5.700551930777287e-08, + "loss": 0.0177, + "mean_token_accuracy": 0.9945609793066978, + "num_tokens": 489616176.0, + "step": 4107 + }, + { + "entropy": 0.6394259631633759, + "epoch": 9.358140861134872, + "grad_norm": 0.384765625, + "learning_rate": 5.66062979756149e-08, + "loss": 0.0115, + "mean_token_accuracy": 0.9972492307424545, + "num_tokens": 489735986.0, + "step": 4108 + }, + { + "entropy": 0.6324135810136795, + "epoch": 9.360422013116624, + "grad_norm": 0.5234375, + "learning_rate": 5.620846346655079e-08, + "loss": 0.0137, + "mean_token_accuracy": 0.9949335157871246, + "num_tokens": 489855693.0, + "step": 4109 + }, + { + "entropy": 0.6393437013030052, + "epoch": 9.362703165098374, + "grad_norm": 0.609375, + "learning_rate": 5.5812016006383805e-08, + "loss": 0.0229, + "mean_token_accuracy": 0.9925966337323189, + "num_tokens": 489975414.0, + "step": 4110 + }, + { + "entropy": 0.6372890248894691, + "epoch": 9.364984317080125, + "grad_norm": 0.5390625, + "learning_rate": 5.5416955820129515e-08, + "loss": 0.0183, + "mean_token_accuracy": 0.9944883212447166, + "num_tokens": 490095054.0, + "step": 4111 + }, + { + "entropy": 0.6379915997385979, + "epoch": 9.367265469061877, + "grad_norm": 0.55859375, + "learning_rate": 5.50232831320166e-08, + "loss": 0.0201, + "mean_token_accuracy": 0.9941944181919098, + "num_tokens": 490213982.0, + "step": 4112 + }, + { + "entropy": 0.6369621232151985, + "epoch": 9.369546621043627, + "grad_norm": 0.53515625, + "learning_rate": 5.463099816548578e-08, + "loss": 0.0149, + "mean_token_accuracy": 0.9947495609521866, + "num_tokens": 490332944.0, + "step": 4113 + }, + { + "entropy": 0.6325101405382156, + "epoch": 9.371827773025379, + "grad_norm": 0.65234375, + "learning_rate": 5.424010114319117e-08, + "loss": 0.0232, + "mean_token_accuracy": 0.992453321814537, + "num_tokens": 490451933.0, + "step": 4114 + }, + { + "entropy": 0.6334957778453827, + "epoch": 9.374108925007128, + "grad_norm": 0.55078125, + "learning_rate": 5.385059228699779e-08, + "loss": 0.0215, + "mean_token_accuracy": 0.9917930364608765, + "num_tokens": 490570708.0, + "step": 4115 + }, + { + "entropy": 0.6334489285945892, + "epoch": 9.37639007698888, + "grad_norm": 0.5078125, + "learning_rate": 5.346247181798325e-08, + "loss": 0.0181, + "mean_token_accuracy": 0.9933737069368362, + "num_tokens": 490690522.0, + "step": 4116 + }, + { + "entropy": 0.6354789510369301, + "epoch": 9.37867122897063, + "grad_norm": 0.453125, + "learning_rate": 5.307573995643772e-08, + "loss": 0.0153, + "mean_token_accuracy": 0.995659813284874, + "num_tokens": 490809816.0, + "step": 4117 + }, + { + "entropy": 0.6349670365452766, + "epoch": 9.380952380952381, + "grad_norm": 0.5390625, + "learning_rate": 5.2690396921862284e-08, + "loss": 0.0149, + "mean_token_accuracy": 0.9946278184652328, + "num_tokens": 490929330.0, + "step": 4118 + }, + { + "entropy": 0.6363025531172752, + "epoch": 9.383233532934131, + "grad_norm": 0.5390625, + "learning_rate": 5.230644293297088e-08, + "loss": 0.0167, + "mean_token_accuracy": 0.9946830123662949, + "num_tokens": 491049062.0, + "step": 4119 + }, + { + "entropy": 0.6318720430135727, + "epoch": 9.385514684915883, + "grad_norm": 0.56640625, + "learning_rate": 5.192387820768752e-08, + "loss": 0.0185, + "mean_token_accuracy": 0.9941776692867279, + "num_tokens": 491168149.0, + "step": 4120 + }, + { + "entropy": 0.6317180022597313, + "epoch": 9.387795836897633, + "grad_norm": 0.671875, + "learning_rate": 5.154270296314878e-08, + "loss": 0.023, + "mean_token_accuracy": 0.9925040826201439, + "num_tokens": 491287683.0, + "step": 4121 + }, + { + "entropy": 0.631727509200573, + "epoch": 9.390076988879384, + "grad_norm": 0.4453125, + "learning_rate": 5.116291741570301e-08, + "loss": 0.0145, + "mean_token_accuracy": 0.9957842156291008, + "num_tokens": 491408010.0, + "step": 4122 + }, + { + "entropy": 0.6367385014891624, + "epoch": 9.392358140861134, + "grad_norm": 0.66015625, + "learning_rate": 5.078452178090831e-08, + "loss": 0.0205, + "mean_token_accuracy": 0.9930264949798584, + "num_tokens": 491527281.0, + "step": 4123 + }, + { + "entropy": 0.6377266496419907, + "epoch": 9.394639292842886, + "grad_norm": 0.50390625, + "learning_rate": 5.040751627353513e-08, + "loss": 0.0141, + "mean_token_accuracy": 0.9957645982503891, + "num_tokens": 491646834.0, + "step": 4124 + }, + { + "entropy": 0.6314424574375153, + "epoch": 9.396920444824637, + "grad_norm": 0.486328125, + "learning_rate": 5.003190110756451e-08, + "loss": 0.0129, + "mean_token_accuracy": 0.995598092675209, + "num_tokens": 491766304.0, + "step": 4125 + }, + { + "entropy": 0.6339164301753044, + "epoch": 9.399201596806387, + "grad_norm": 0.466796875, + "learning_rate": 4.965767649618869e-08, + "loss": 0.0115, + "mean_token_accuracy": 0.9955377578735352, + "num_tokens": 491884917.0, + "step": 4126 + }, + { + "entropy": 0.6280913576483727, + "epoch": 9.401482748788139, + "grad_norm": 0.4453125, + "learning_rate": 4.928484265180972e-08, + "loss": 0.0169, + "mean_token_accuracy": 0.994895912706852, + "num_tokens": 492003614.0, + "step": 4127 + }, + { + "entropy": 0.6340465843677521, + "epoch": 9.403763900769889, + "grad_norm": 0.484375, + "learning_rate": 4.8913399786041097e-08, + "loss": 0.0133, + "mean_token_accuracy": 0.9950336515903473, + "num_tokens": 492122444.0, + "step": 4128 + }, + { + "entropy": 0.6324601098895073, + "epoch": 9.40604505275164, + "grad_norm": 0.474609375, + "learning_rate": 4.854334810970668e-08, + "loss": 0.014, + "mean_token_accuracy": 0.994332067668438, + "num_tokens": 492241538.0, + "step": 4129 + }, + { + "entropy": 0.6328210532665253, + "epoch": 9.40832620473339, + "grad_norm": 0.50390625, + "learning_rate": 4.817468783284096e-08, + "loss": 0.0195, + "mean_token_accuracy": 0.9932608902454376, + "num_tokens": 492361188.0, + "step": 4130 + }, + { + "entropy": 0.637243241071701, + "epoch": 9.410607356715142, + "grad_norm": 0.345703125, + "learning_rate": 4.7807419164687673e-08, + "loss": 0.0094, + "mean_token_accuracy": 0.9960004016757011, + "num_tokens": 492480034.0, + "step": 4131 + }, + { + "entropy": 0.6335903629660606, + "epoch": 9.412888508696891, + "grad_norm": 0.6640625, + "learning_rate": 4.7441542313702293e-08, + "loss": 0.0214, + "mean_token_accuracy": 0.9931656047701836, + "num_tokens": 492599250.0, + "step": 4132 + }, + { + "entropy": 0.6318375244736671, + "epoch": 9.415169660678643, + "grad_norm": 0.55078125, + "learning_rate": 4.707705748754898e-08, + "loss": 0.0114, + "mean_token_accuracy": 0.9955375492572784, + "num_tokens": 492718878.0, + "step": 4133 + }, + { + "entropy": 0.6395155787467957, + "epoch": 9.417450812660393, + "grad_norm": 0.55859375, + "learning_rate": 4.671396489310198e-08, + "loss": 0.0115, + "mean_token_accuracy": 0.9969375059008598, + "num_tokens": 492838474.0, + "step": 4134 + }, + { + "entropy": 0.630906842648983, + "epoch": 9.419731964642144, + "grad_norm": 0.455078125, + "learning_rate": 4.635226473644616e-08, + "loss": 0.0125, + "mean_token_accuracy": 0.9958387240767479, + "num_tokens": 492957284.0, + "step": 4135 + }, + { + "entropy": 0.6347004771232605, + "epoch": 9.422013116623894, + "grad_norm": 0.484375, + "learning_rate": 4.599195722287536e-08, + "loss": 0.0179, + "mean_token_accuracy": 0.9934825748205185, + "num_tokens": 493077056.0, + "step": 4136 + }, + { + "entropy": 0.6345586031675339, + "epoch": 9.424294268605646, + "grad_norm": 0.578125, + "learning_rate": 4.5633042556893493e-08, + "loss": 0.0181, + "mean_token_accuracy": 0.9940597787499428, + "num_tokens": 493196598.0, + "step": 4137 + }, + { + "entropy": 0.6397690549492836, + "epoch": 9.426575420587398, + "grad_norm": 0.515625, + "learning_rate": 4.527552094221288e-08, + "loss": 0.0123, + "mean_token_accuracy": 0.9960629642009735, + "num_tokens": 493316082.0, + "step": 4138 + }, + { + "entropy": 0.63877784460783, + "epoch": 9.428856572569147, + "grad_norm": 0.376953125, + "learning_rate": 4.4919392581756204e-08, + "loss": 0.0117, + "mean_token_accuracy": 0.9972184598445892, + "num_tokens": 493435455.0, + "step": 4139 + }, + { + "entropy": 0.6404017806053162, + "epoch": 9.431137724550899, + "grad_norm": 0.4609375, + "learning_rate": 4.456465767765539e-08, + "loss": 0.0121, + "mean_token_accuracy": 0.9956845864653587, + "num_tokens": 493555273.0, + "step": 4140 + }, + { + "entropy": 0.6392020061612129, + "epoch": 9.433418876532649, + "grad_norm": 0.53125, + "learning_rate": 4.421131643125104e-08, + "loss": 0.017, + "mean_token_accuracy": 0.9942917674779892, + "num_tokens": 493674779.0, + "step": 4141 + }, + { + "entropy": 0.633618175983429, + "epoch": 9.4357000285144, + "grad_norm": 0.49609375, + "learning_rate": 4.3859369043092183e-08, + "loss": 0.0131, + "mean_token_accuracy": 0.9943791553378105, + "num_tokens": 493794502.0, + "step": 4142 + }, + { + "entropy": 0.6407156586647034, + "epoch": 9.43798118049615, + "grad_norm": 0.703125, + "learning_rate": 4.350881571293819e-08, + "loss": 0.0225, + "mean_token_accuracy": 0.9941362366080284, + "num_tokens": 493914319.0, + "step": 4143 + }, + { + "entropy": 0.6391750648617744, + "epoch": 9.440262332477902, + "grad_norm": 0.5859375, + "learning_rate": 4.315965663975602e-08, + "loss": 0.017, + "mean_token_accuracy": 0.9951021894812584, + "num_tokens": 494034062.0, + "step": 4144 + }, + { + "entropy": 0.6332917809486389, + "epoch": 9.442543484459652, + "grad_norm": 0.5234375, + "learning_rate": 4.281189202172131e-08, + "loss": 0.0088, + "mean_token_accuracy": 0.997545063495636, + "num_tokens": 494153094.0, + "step": 4145 + }, + { + "entropy": 0.6374918669462204, + "epoch": 9.444824636441403, + "grad_norm": 0.3828125, + "learning_rate": 4.246552205621896e-08, + "loss": 0.0134, + "mean_token_accuracy": 0.9955617561936378, + "num_tokens": 494272328.0, + "step": 4146 + }, + { + "entropy": 0.6340372934937477, + "epoch": 9.447105788423153, + "grad_norm": 0.66015625, + "learning_rate": 4.212054693984169e-08, + "loss": 0.019, + "mean_token_accuracy": 0.9945412278175354, + "num_tokens": 494391147.0, + "step": 4147 + }, + { + "entropy": 0.6333491057157516, + "epoch": 9.449386940404905, + "grad_norm": 0.53515625, + "learning_rate": 4.177696686839094e-08, + "loss": 0.0187, + "mean_token_accuracy": 0.9941490814089775, + "num_tokens": 494510253.0, + "step": 4148 + }, + { + "entropy": 0.6333981677889824, + "epoch": 9.451668092386655, + "grad_norm": 0.47265625, + "learning_rate": 4.143478203687573e-08, + "loss": 0.0166, + "mean_token_accuracy": 0.9946000650525093, + "num_tokens": 494629466.0, + "step": 4149 + }, + { + "entropy": 0.6298128962516785, + "epoch": 9.453949244368406, + "grad_norm": 0.482421875, + "learning_rate": 4.1093992639514026e-08, + "loss": 0.0128, + "mean_token_accuracy": 0.9954721108078957, + "num_tokens": 494748555.0, + "step": 4150 + }, + { + "entropy": 0.6379109472036362, + "epoch": 9.456230396350158, + "grad_norm": 0.4609375, + "learning_rate": 4.0754598869730824e-08, + "loss": 0.0204, + "mean_token_accuracy": 0.9947513043880463, + "num_tokens": 494868172.0, + "step": 4151 + }, + { + "entropy": 0.6384910866618156, + "epoch": 9.458511548331908, + "grad_norm": 0.69140625, + "learning_rate": 4.041660092015981e-08, + "loss": 0.0211, + "mean_token_accuracy": 0.9936100766062737, + "num_tokens": 494988199.0, + "step": 4152 + }, + { + "entropy": 0.642090804874897, + "epoch": 9.46079270031366, + "grad_norm": 0.70703125, + "learning_rate": 4.007999898264225e-08, + "loss": 0.0224, + "mean_token_accuracy": 0.9936446994543076, + "num_tokens": 495108144.0, + "step": 4153 + }, + { + "entropy": 0.6367986500263214, + "epoch": 9.463073852295409, + "grad_norm": 0.388671875, + "learning_rate": 3.9744793248226446e-08, + "loss": 0.0143, + "mean_token_accuracy": 0.9952849969267845, + "num_tokens": 495227419.0, + "step": 4154 + }, + { + "entropy": 0.6335557326674461, + "epoch": 9.46535500427716, + "grad_norm": 0.376953125, + "learning_rate": 3.9410983907169076e-08, + "loss": 0.0152, + "mean_token_accuracy": 0.9962492063641548, + "num_tokens": 495346448.0, + "step": 4155 + }, + { + "entropy": 0.6345428377389908, + "epoch": 9.46763615625891, + "grad_norm": 0.55859375, + "learning_rate": 3.90785711489336e-08, + "loss": 0.0148, + "mean_token_accuracy": 0.9955236166715622, + "num_tokens": 495465293.0, + "step": 4156 + }, + { + "entropy": 0.6413858830928802, + "epoch": 9.469917308240662, + "grad_norm": 0.427734375, + "learning_rate": 3.874755516219103e-08, + "loss": 0.0124, + "mean_token_accuracy": 0.9957537725567818, + "num_tokens": 495584985.0, + "step": 4157 + }, + { + "entropy": 0.6309913024306297, + "epoch": 9.472198460222412, + "grad_norm": 0.349609375, + "learning_rate": 3.8417936134820255e-08, + "loss": 0.0115, + "mean_token_accuracy": 0.9960332065820694, + "num_tokens": 495704348.0, + "step": 4158 + }, + { + "entropy": 0.6367775350809097, + "epoch": 9.474479612204163, + "grad_norm": 0.54296875, + "learning_rate": 3.808971425390606e-08, + "loss": 0.0191, + "mean_token_accuracy": 0.992771565914154, + "num_tokens": 495822948.0, + "step": 4159 + }, + { + "entropy": 0.6333020180463791, + "epoch": 9.476760764185913, + "grad_norm": 0.56640625, + "learning_rate": 3.7762889705740824e-08, + "loss": 0.0201, + "mean_token_accuracy": 0.9930561110377312, + "num_tokens": 495942389.0, + "step": 4160 + }, + { + "entropy": 0.6325799748301506, + "epoch": 9.479041916167665, + "grad_norm": 0.546875, + "learning_rate": 3.743746267582421e-08, + "loss": 0.0204, + "mean_token_accuracy": 0.9929074123501778, + "num_tokens": 496062757.0, + "step": 4161 + }, + { + "entropy": 0.6330079734325409, + "epoch": 9.481323068149415, + "grad_norm": 0.65625, + "learning_rate": 3.711343334886236e-08, + "loss": 0.0124, + "mean_token_accuracy": 0.9958575740456581, + "num_tokens": 496181373.0, + "step": 4162 + }, + { + "entropy": 0.6391487941145897, + "epoch": 9.483604220131166, + "grad_norm": 0.4375, + "learning_rate": 3.679080190876788e-08, + "loss": 0.0135, + "mean_token_accuracy": 0.9942786693572998, + "num_tokens": 496301143.0, + "step": 4163 + }, + { + "entropy": 0.6372695341706276, + "epoch": 9.485885372112918, + "grad_norm": 0.5, + "learning_rate": 3.646956853865985e-08, + "loss": 0.0121, + "mean_token_accuracy": 0.9959971085190773, + "num_tokens": 496420308.0, + "step": 4164 + }, + { + "entropy": 0.6339521110057831, + "epoch": 9.488166524094668, + "grad_norm": 0.392578125, + "learning_rate": 3.614973342086464e-08, + "loss": 0.0143, + "mean_token_accuracy": 0.9954963177442551, + "num_tokens": 496539482.0, + "step": 4165 + }, + { + "entropy": 0.6329493299126625, + "epoch": 9.49044767607642, + "grad_norm": 0.56640625, + "learning_rate": 3.583129673691427e-08, + "loss": 0.0185, + "mean_token_accuracy": 0.9949780702590942, + "num_tokens": 496658772.0, + "step": 4166 + }, + { + "entropy": 0.6361378356814384, + "epoch": 9.49272882805817, + "grad_norm": 0.4375, + "learning_rate": 3.551425866754693e-08, + "loss": 0.0104, + "mean_token_accuracy": 0.9958807528018951, + "num_tokens": 496778205.0, + "step": 4167 + }, + { + "entropy": 0.6315587759017944, + "epoch": 9.49500998003992, + "grad_norm": 0.486328125, + "learning_rate": 3.519861939270786e-08, + "loss": 0.0174, + "mean_token_accuracy": 0.9945416525006294, + "num_tokens": 496897728.0, + "step": 4168 + }, + { + "entropy": 0.6369871273636818, + "epoch": 9.49729113202167, + "grad_norm": 0.54296875, + "learning_rate": 3.4884379091547905e-08, + "loss": 0.0187, + "mean_token_accuracy": 0.9940879344940186, + "num_tokens": 497016772.0, + "step": 4169 + }, + { + "entropy": 0.6365024447441101, + "epoch": 9.499572284003422, + "grad_norm": 0.515625, + "learning_rate": 3.457153794242302e-08, + "loss": 0.0187, + "mean_token_accuracy": 0.9940440505743027, + "num_tokens": 497135875.0, + "step": 4170 + }, + { + "entropy": 0.635115846991539, + "epoch": 9.501853435985172, + "grad_norm": 0.66796875, + "learning_rate": 3.4260096122896435e-08, + "loss": 0.0217, + "mean_token_accuracy": 0.9927052706480026, + "num_tokens": 497255817.0, + "step": 4171 + }, + { + "entropy": 0.6373689845204353, + "epoch": 9.504134587966924, + "grad_norm": 0.41796875, + "learning_rate": 3.3950053809736204e-08, + "loss": 0.0148, + "mean_token_accuracy": 0.9946744590997696, + "num_tokens": 497375022.0, + "step": 4172 + }, + { + "entropy": 0.6352171376347542, + "epoch": 9.506415739948674, + "grad_norm": 0.49609375, + "learning_rate": 3.364141117891656e-08, + "loss": 0.0166, + "mean_token_accuracy": 0.9942670315504074, + "num_tokens": 497495207.0, + "step": 4173 + }, + { + "entropy": 0.6331456750631332, + "epoch": 9.508696891930425, + "grad_norm": 0.419921875, + "learning_rate": 3.333416840561709e-08, + "loss": 0.0128, + "mean_token_accuracy": 0.996303990483284, + "num_tokens": 497614093.0, + "step": 4174 + }, + { + "entropy": 0.636615127325058, + "epoch": 9.510978043912175, + "grad_norm": 0.439453125, + "learning_rate": 3.302832566422276e-08, + "loss": 0.0129, + "mean_token_accuracy": 0.9947681352496147, + "num_tokens": 497734229.0, + "step": 4175 + }, + { + "entropy": 0.631718099117279, + "epoch": 9.513259195893927, + "grad_norm": 0.5234375, + "learning_rate": 3.272388312832414e-08, + "loss": 0.0156, + "mean_token_accuracy": 0.994045153260231, + "num_tokens": 497853295.0, + "step": 4176 + }, + { + "entropy": 0.6346657946705818, + "epoch": 9.515540347875678, + "grad_norm": 0.51171875, + "learning_rate": 3.242084097071663e-08, + "loss": 0.0161, + "mean_token_accuracy": 0.9944434314966202, + "num_tokens": 497971877.0, + "step": 4177 + }, + { + "entropy": 0.6361216902732849, + "epoch": 9.517821499857428, + "grad_norm": 0.62109375, + "learning_rate": 3.211919936340152e-08, + "loss": 0.0247, + "mean_token_accuracy": 0.9918049201369286, + "num_tokens": 498091307.0, + "step": 4178 + }, + { + "entropy": 0.6345694959163666, + "epoch": 9.52010265183918, + "grad_norm": 0.5546875, + "learning_rate": 3.1818958477584375e-08, + "loss": 0.0181, + "mean_token_accuracy": 0.9943441897630692, + "num_tokens": 498210555.0, + "step": 4179 + }, + { + "entropy": 0.6399570629000664, + "epoch": 9.52238380382093, + "grad_norm": 0.48828125, + "learning_rate": 3.152011848367664e-08, + "loss": 0.0166, + "mean_token_accuracy": 0.994474284350872, + "num_tokens": 498330447.0, + "step": 4180 + }, + { + "epoch": 9.52238380382093, + "eval_entropy": 0.636155339928181, + "eval_loss": 0.02054397389292717, + "eval_mean_token_accuracy": 0.993616149226069, + "eval_num_tokens": 498330447.0, + "eval_runtime": 177.4811, + "eval_samples_per_second": 47.244, + "eval_steps_per_second": 1.482, + "step": 4180 + }, + { + "entropy": 0.6337756589055061, + "epoch": 9.524664955802681, + "grad_norm": 0.578125, + "learning_rate": 3.1222679551293486e-08, + "loss": 0.019, + "mean_token_accuracy": 0.9929038733243942, + "num_tokens": 498449475.0, + "step": 4181 + }, + { + "entropy": 0.6393431797623634, + "epoch": 9.52694610778443, + "grad_norm": 0.376953125, + "learning_rate": 3.0926641849255976e-08, + "loss": 0.0105, + "mean_token_accuracy": 0.9968982338905334, + "num_tokens": 498569006.0, + "step": 4182 + }, + { + "entropy": 0.6370956599712372, + "epoch": 9.529227259766182, + "grad_norm": 0.51171875, + "learning_rate": 3.063200554558915e-08, + "loss": 0.0183, + "mean_token_accuracy": 0.9935068637132645, + "num_tokens": 498688110.0, + "step": 4183 + }, + { + "entropy": 0.6345847994089127, + "epoch": 9.531508411747932, + "grad_norm": 0.5625, + "learning_rate": 3.033877080752312e-08, + "loss": 0.0174, + "mean_token_accuracy": 0.995329312980175, + "num_tokens": 498807666.0, + "step": 4184 + }, + { + "entropy": 0.6338488087058067, + "epoch": 9.533789563729684, + "grad_norm": 0.5546875, + "learning_rate": 3.0046937801491983e-08, + "loss": 0.0191, + "mean_token_accuracy": 0.99517372995615, + "num_tokens": 498927356.0, + "step": 4185 + }, + { + "entropy": 0.6420131176710129, + "epoch": 9.536070715711434, + "grad_norm": 0.55859375, + "learning_rate": 2.97565066931349e-08, + "loss": 0.0216, + "mean_token_accuracy": 0.9953547269105911, + "num_tokens": 499047001.0, + "step": 4186 + }, + { + "entropy": 0.6342705711722374, + "epoch": 9.538351867693185, + "grad_norm": 0.466796875, + "learning_rate": 2.9467477647294464e-08, + "loss": 0.0145, + "mean_token_accuracy": 0.9955959692597389, + "num_tokens": 499166791.0, + "step": 4187 + }, + { + "entropy": 0.6313030868768692, + "epoch": 9.540633019674935, + "grad_norm": 0.455078125, + "learning_rate": 2.917985082801833e-08, + "loss": 0.0128, + "mean_token_accuracy": 0.9964437335729599, + "num_tokens": 499286951.0, + "step": 4188 + }, + { + "entropy": 0.6367980390787125, + "epoch": 9.542914171656687, + "grad_norm": 0.6328125, + "learning_rate": 2.8893626398557583e-08, + "loss": 0.0182, + "mean_token_accuracy": 0.9953559413552284, + "num_tokens": 499406055.0, + "step": 4189 + }, + { + "entropy": 0.6319067180156708, + "epoch": 9.545195323638437, + "grad_norm": 0.4140625, + "learning_rate": 2.8608804521368382e-08, + "loss": 0.0108, + "mean_token_accuracy": 0.9966056123375893, + "num_tokens": 499525636.0, + "step": 4190 + }, + { + "entropy": 0.636522188782692, + "epoch": 9.547476475620188, + "grad_norm": 0.55859375, + "learning_rate": 2.832538535810947e-08, + "loss": 0.0184, + "mean_token_accuracy": 0.9943730309605598, + "num_tokens": 499645271.0, + "step": 4191 + }, + { + "entropy": 0.6331631317734718, + "epoch": 9.54975762760194, + "grad_norm": 0.515625, + "learning_rate": 2.804336906964439e-08, + "loss": 0.0147, + "mean_token_accuracy": 0.9953274726867676, + "num_tokens": 499764173.0, + "step": 4192 + }, + { + "entropy": 0.6317362487316132, + "epoch": 9.55203877958369, + "grad_norm": 0.515625, + "learning_rate": 2.7762755816039823e-08, + "loss": 0.0163, + "mean_token_accuracy": 0.9946000501513481, + "num_tokens": 499884057.0, + "step": 4193 + }, + { + "entropy": 0.637648306787014, + "epoch": 9.554319931565441, + "grad_norm": 0.62890625, + "learning_rate": 2.74835457565667e-08, + "loss": 0.0207, + "mean_token_accuracy": 0.9943486079573631, + "num_tokens": 500003777.0, + "step": 4194 + }, + { + "entropy": 0.6404694989323616, + "epoch": 9.556601083547191, + "grad_norm": 0.63671875, + "learning_rate": 2.7205739049699365e-08, + "loss": 0.0162, + "mean_token_accuracy": 0.9952503144741058, + "num_tokens": 500124465.0, + "step": 4195 + }, + { + "entropy": 0.6379305347800255, + "epoch": 9.558882235528943, + "grad_norm": 0.64453125, + "learning_rate": 2.6929335853115302e-08, + "loss": 0.0249, + "mean_token_accuracy": 0.9914159998297691, + "num_tokens": 500243883.0, + "step": 4196 + }, + { + "entropy": 0.6403057873249054, + "epoch": 9.561163387510693, + "grad_norm": 0.48046875, + "learning_rate": 2.6654336323695963e-08, + "loss": 0.015, + "mean_token_accuracy": 0.9938406944274902, + "num_tokens": 500363502.0, + "step": 4197 + }, + { + "entropy": 0.641230471432209, + "epoch": 9.563444539492444, + "grad_norm": 0.5390625, + "learning_rate": 2.63807406175251e-08, + "loss": 0.0185, + "mean_token_accuracy": 0.9939208477735519, + "num_tokens": 500482552.0, + "step": 4198 + }, + { + "entropy": 0.6358135789632797, + "epoch": 9.565725691474194, + "grad_norm": 0.5234375, + "learning_rate": 2.6108548889891005e-08, + "loss": 0.0198, + "mean_token_accuracy": 0.9936339408159256, + "num_tokens": 500602703.0, + "step": 4199 + }, + { + "entropy": 0.6388900652527809, + "epoch": 9.568006843455946, + "grad_norm": 0.65625, + "learning_rate": 2.5837761295284258e-08, + "loss": 0.0239, + "mean_token_accuracy": 0.9935680106282234, + "num_tokens": 500722564.0, + "step": 4200 + }, + { + "entropy": 0.6341762617230415, + "epoch": 9.570287995437695, + "grad_norm": 0.51953125, + "learning_rate": 2.5568377987398862e-08, + "loss": 0.0205, + "mean_token_accuracy": 0.9945273548364639, + "num_tokens": 500842488.0, + "step": 4201 + }, + { + "entropy": 0.6381703391671181, + "epoch": 9.572569147419447, + "grad_norm": 0.45703125, + "learning_rate": 2.5300399119131124e-08, + "loss": 0.0135, + "mean_token_accuracy": 0.9959870129823685, + "num_tokens": 500962263.0, + "step": 4202 + }, + { + "entropy": 0.6339679807424545, + "epoch": 9.574850299401197, + "grad_norm": 0.5546875, + "learning_rate": 2.5033824842581046e-08, + "loss": 0.0202, + "mean_token_accuracy": 0.9929948672652245, + "num_tokens": 501081560.0, + "step": 4203 + }, + { + "entropy": 0.6325573548674583, + "epoch": 9.577131451382948, + "grad_norm": 0.44140625, + "learning_rate": 2.476865530905065e-08, + "loss": 0.0122, + "mean_token_accuracy": 0.9950642883777618, + "num_tokens": 501199984.0, + "step": 4204 + }, + { + "entropy": 0.6338833570480347, + "epoch": 9.579412603364698, + "grad_norm": 0.69140625, + "learning_rate": 2.4504890669045654e-08, + "loss": 0.0297, + "mean_token_accuracy": 0.9924123138189316, + "num_tokens": 501318797.0, + "step": 4205 + }, + { + "entropy": 0.639415942132473, + "epoch": 9.58169375534645, + "grad_norm": 0.470703125, + "learning_rate": 2.4242531072273255e-08, + "loss": 0.0133, + "mean_token_accuracy": 0.9967901259660721, + "num_tokens": 501438576.0, + "step": 4206 + }, + { + "entropy": 0.6398534625768661, + "epoch": 9.583974907328201, + "grad_norm": 0.41015625, + "learning_rate": 2.398157666764378e-08, + "loss": 0.0141, + "mean_token_accuracy": 0.9960227608680725, + "num_tokens": 501557760.0, + "step": 4207 + }, + { + "entropy": 0.6351345926523209, + "epoch": 9.586256059309951, + "grad_norm": 0.431640625, + "learning_rate": 2.3722027603270415e-08, + "loss": 0.0146, + "mean_token_accuracy": 0.9952506572008133, + "num_tokens": 501677198.0, + "step": 4208 + }, + { + "entropy": 0.6377573683857918, + "epoch": 9.588537211291703, + "grad_norm": 0.62890625, + "learning_rate": 2.3463884026467265e-08, + "loss": 0.0151, + "mean_token_accuracy": 0.9955201372504234, + "num_tokens": 501797368.0, + "step": 4209 + }, + { + "entropy": 0.6375140622258186, + "epoch": 9.590818363273453, + "grad_norm": 0.484375, + "learning_rate": 2.320714608375241e-08, + "loss": 0.0144, + "mean_token_accuracy": 0.9952707216143608, + "num_tokens": 501916379.0, + "step": 4210 + }, + { + "entropy": 0.6349636092782021, + "epoch": 9.593099515255204, + "grad_norm": 0.890625, + "learning_rate": 2.295181392084511e-08, + "loss": 0.0325, + "mean_token_accuracy": 0.991486519575119, + "num_tokens": 502036528.0, + "step": 4211 + }, + { + "entropy": 0.6385044679045677, + "epoch": 9.595380667236954, + "grad_norm": 0.466796875, + "learning_rate": 2.269788768266695e-08, + "loss": 0.0121, + "mean_token_accuracy": 0.9964694902300835, + "num_tokens": 502156508.0, + "step": 4212 + }, + { + "entropy": 0.6400937438011169, + "epoch": 9.597661819218706, + "grad_norm": 0.443359375, + "learning_rate": 2.2445367513341533e-08, + "loss": 0.0119, + "mean_token_accuracy": 0.9948005601763725, + "num_tokens": 502275731.0, + "step": 4213 + }, + { + "entropy": 0.6347229555249214, + "epoch": 9.599942971200456, + "grad_norm": 0.5078125, + "learning_rate": 2.21942535561942e-08, + "loss": 0.0181, + "mean_token_accuracy": 0.9945418760180473, + "num_tokens": 502395252.0, + "step": 4214 + }, + { + "entropy": 0.6388291269540787, + "epoch": 9.602224123182207, + "grad_norm": 0.49609375, + "learning_rate": 2.1944545953752894e-08, + "loss": 0.0157, + "mean_token_accuracy": 0.9933938756585121, + "num_tokens": 502514651.0, + "step": 4215 + }, + { + "entropy": 0.6345308348536491, + "epoch": 9.604505275163957, + "grad_norm": 0.55078125, + "learning_rate": 2.1696244847746737e-08, + "loss": 0.0174, + "mean_token_accuracy": 0.994727335870266, + "num_tokens": 502633991.0, + "step": 4216 + }, + { + "entropy": 0.6420991271734238, + "epoch": 9.606786427145709, + "grad_norm": 0.765625, + "learning_rate": 2.1449350379106336e-08, + "loss": 0.0246, + "mean_token_accuracy": 0.9920217394828796, + "num_tokens": 502753845.0, + "step": 4217 + }, + { + "entropy": 0.6354053243994713, + "epoch": 9.609067579127458, + "grad_norm": 0.671875, + "learning_rate": 2.1203862687964595e-08, + "loss": 0.0312, + "mean_token_accuracy": 0.9924238100647926, + "num_tokens": 502872727.0, + "step": 4218 + }, + { + "entropy": 0.6365998238325119, + "epoch": 9.61134873110921, + "grad_norm": 0.73046875, + "learning_rate": 2.0959781913655053e-08, + "loss": 0.0241, + "mean_token_accuracy": 0.9917858988046646, + "num_tokens": 502992162.0, + "step": 4219 + }, + { + "entropy": 0.6337921470403671, + "epoch": 9.613629883090962, + "grad_norm": 0.61328125, + "learning_rate": 2.0717108194713566e-08, + "loss": 0.0196, + "mean_token_accuracy": 0.9940581768751144, + "num_tokens": 503111773.0, + "step": 4220 + }, + { + "entropy": 0.6389620378613472, + "epoch": 9.615911035072711, + "grad_norm": 0.53125, + "learning_rate": 2.0475841668877172e-08, + "loss": 0.0195, + "mean_token_accuracy": 0.993760734796524, + "num_tokens": 503231186.0, + "step": 4221 + }, + { + "entropy": 0.6338105872273445, + "epoch": 9.618192187054463, + "grad_norm": 0.6171875, + "learning_rate": 2.0235982473084115e-08, + "loss": 0.0233, + "mean_token_accuracy": 0.9912804439663887, + "num_tokens": 503350232.0, + "step": 4222 + }, + { + "entropy": 0.6389638409018517, + "epoch": 9.620473339036213, + "grad_norm": 0.474609375, + "learning_rate": 1.9997530743473548e-08, + "loss": 0.0176, + "mean_token_accuracy": 0.9940222352743149, + "num_tokens": 503469125.0, + "step": 4223 + }, + { + "entropy": 0.6327557638287544, + "epoch": 9.622754491017965, + "grad_norm": 0.48046875, + "learning_rate": 1.9760486615386376e-08, + "loss": 0.0157, + "mean_token_accuracy": 0.9942727982997894, + "num_tokens": 503588067.0, + "step": 4224 + }, + { + "entropy": 0.6352238282561302, + "epoch": 9.625035642999714, + "grad_norm": 0.64453125, + "learning_rate": 1.9524850223363868e-08, + "loss": 0.0211, + "mean_token_accuracy": 0.9917194172739983, + "num_tokens": 503707516.0, + "step": 4225 + }, + { + "entropy": 0.63454969227314, + "epoch": 9.627316794981466, + "grad_norm": 0.64453125, + "learning_rate": 1.9290621701149315e-08, + "loss": 0.0174, + "mean_token_accuracy": 0.9930506199598312, + "num_tokens": 503827137.0, + "step": 4226 + }, + { + "entropy": 0.6362317875027657, + "epoch": 9.629597946963216, + "grad_norm": 0.546875, + "learning_rate": 1.905780118168582e-08, + "loss": 0.0213, + "mean_token_accuracy": 0.9936222061514854, + "num_tokens": 503946916.0, + "step": 4227 + }, + { + "entropy": 0.6436940208077431, + "epoch": 9.631879098944967, + "grad_norm": 0.54296875, + "learning_rate": 1.882638879711768e-08, + "loss": 0.0157, + "mean_token_accuracy": 0.9940114393830299, + "num_tokens": 504066485.0, + "step": 4228 + }, + { + "entropy": 0.6373159065842628, + "epoch": 9.634160250926717, + "grad_norm": 0.68359375, + "learning_rate": 1.859638467879038e-08, + "loss": 0.0198, + "mean_token_accuracy": 0.9937139376997948, + "num_tokens": 504186503.0, + "step": 4229 + }, + { + "entropy": 0.6351614817976952, + "epoch": 9.636441402908469, + "grad_norm": 0.609375, + "learning_rate": 1.8367788957250054e-08, + "loss": 0.0262, + "mean_token_accuracy": 0.9937862828373909, + "num_tokens": 504306097.0, + "step": 4230 + }, + { + "entropy": 0.6373093649744987, + "epoch": 9.638722554890219, + "grad_norm": 0.51171875, + "learning_rate": 1.8140601762242916e-08, + "loss": 0.0207, + "mean_token_accuracy": 0.9934588372707367, + "num_tokens": 504425530.0, + "step": 4231 + }, + { + "entropy": 0.6376339644193649, + "epoch": 9.64100370687197, + "grad_norm": 0.828125, + "learning_rate": 1.7914823222715817e-08, + "loss": 0.0169, + "mean_token_accuracy": 0.9930949732661247, + "num_tokens": 504544477.0, + "step": 4232 + }, + { + "entropy": 0.6368639469146729, + "epoch": 9.643284858853722, + "grad_norm": 0.40625, + "learning_rate": 1.7690453466816805e-08, + "loss": 0.0114, + "mean_token_accuracy": 0.995904453098774, + "num_tokens": 504664384.0, + "step": 4233 + }, + { + "entropy": 0.6341697499155998, + "epoch": 9.645566010835472, + "grad_norm": 0.443359375, + "learning_rate": 1.7467492621893457e-08, + "loss": 0.0137, + "mean_token_accuracy": 0.9953213930130005, + "num_tokens": 504784217.0, + "step": 4234 + }, + { + "entropy": 0.6380485072731972, + "epoch": 9.647847162817223, + "grad_norm": 0.458984375, + "learning_rate": 1.724594081449399e-08, + "loss": 0.0154, + "mean_token_accuracy": 0.9953285977244377, + "num_tokens": 504903333.0, + "step": 4235 + }, + { + "entropy": 0.6272401884198189, + "epoch": 9.650128314798973, + "grad_norm": 0.53125, + "learning_rate": 1.702579817036726e-08, + "loss": 0.0171, + "mean_token_accuracy": 0.9960267767310143, + "num_tokens": 505023578.0, + "step": 4236 + }, + { + "entropy": 0.6324147284030914, + "epoch": 9.652409466780725, + "grad_norm": 0.6328125, + "learning_rate": 1.680706481446165e-08, + "loss": 0.0176, + "mean_token_accuracy": 0.9945574328303337, + "num_tokens": 505143127.0, + "step": 4237 + }, + { + "entropy": 0.6312595754861832, + "epoch": 9.654690618762475, + "grad_norm": 0.431640625, + "learning_rate": 1.6589740870926186e-08, + "loss": 0.0111, + "mean_token_accuracy": 0.9970932006835938, + "num_tokens": 505262828.0, + "step": 4238 + }, + { + "entropy": 0.6390678957104683, + "epoch": 9.656971770744226, + "grad_norm": 0.69921875, + "learning_rate": 1.6373826463109976e-08, + "loss": 0.0246, + "mean_token_accuracy": 0.9910559132695198, + "num_tokens": 505382128.0, + "step": 4239 + }, + { + "entropy": 0.6414987370371819, + "epoch": 9.659252922725976, + "grad_norm": 0.70703125, + "learning_rate": 1.6159321713561382e-08, + "loss": 0.0246, + "mean_token_accuracy": 0.993268609046936, + "num_tokens": 505502118.0, + "step": 4240 + }, + { + "entropy": 0.6333317682147026, + "epoch": 9.661534074707728, + "grad_norm": 0.431640625, + "learning_rate": 1.5946226744029402e-08, + "loss": 0.0108, + "mean_token_accuracy": 0.9950984641909599, + "num_tokens": 505621635.0, + "step": 4241 + }, + { + "entropy": 0.6327030584216118, + "epoch": 9.663815226689477, + "grad_norm": 0.482421875, + "learning_rate": 1.5734541675462567e-08, + "loss": 0.0139, + "mean_token_accuracy": 0.9944580867886543, + "num_tokens": 505740620.0, + "step": 4242 + }, + { + "entropy": 0.6357920914888382, + "epoch": 9.666096378671229, + "grad_norm": 0.51953125, + "learning_rate": 1.5524266628009212e-08, + "loss": 0.016, + "mean_token_accuracy": 0.9953301176428795, + "num_tokens": 505860142.0, + "step": 4243 + }, + { + "entropy": 0.6331880912184715, + "epoch": 9.668377530652979, + "grad_norm": 0.546875, + "learning_rate": 1.5315401721017752e-08, + "loss": 0.0133, + "mean_token_accuracy": 0.9956740811467171, + "num_tokens": 505979450.0, + "step": 4244 + }, + { + "entropy": 0.6361940801143646, + "epoch": 9.67065868263473, + "grad_norm": 0.51171875, + "learning_rate": 1.5107947073035312e-08, + "loss": 0.0157, + "mean_token_accuracy": 0.9961956292390823, + "num_tokens": 506099004.0, + "step": 4245 + }, + { + "entropy": 0.640230767428875, + "epoch": 9.672939834616482, + "grad_norm": 0.53125, + "learning_rate": 1.4901902801809642e-08, + "loss": 0.0171, + "mean_token_accuracy": 0.9949222058057785, + "num_tokens": 506218334.0, + "step": 4246 + }, + { + "entropy": 0.6355542838573456, + "epoch": 9.675220986598232, + "grad_norm": 0.546875, + "learning_rate": 1.4697269024287198e-08, + "loss": 0.018, + "mean_token_accuracy": 0.9947432801127434, + "num_tokens": 506338042.0, + "step": 4247 + }, + { + "entropy": 0.6320233717560768, + "epoch": 9.677502138579984, + "grad_norm": 0.5078125, + "learning_rate": 1.4494045856613959e-08, + "loss": 0.0113, + "mean_token_accuracy": 0.9960903078317642, + "num_tokens": 506456829.0, + "step": 4248 + }, + { + "entropy": 0.6376307308673859, + "epoch": 9.679783290561733, + "grad_norm": 0.6953125, + "learning_rate": 1.4292233414135992e-08, + "loss": 0.0182, + "mean_token_accuracy": 0.9946131855249405, + "num_tokens": 506575981.0, + "step": 4249 + }, + { + "entropy": 0.632510244846344, + "epoch": 9.682064442543485, + "grad_norm": 0.458984375, + "learning_rate": 1.4091831811397782e-08, + "loss": 0.011, + "mean_token_accuracy": 0.9961206987500191, + "num_tokens": 506694851.0, + "step": 4250 + }, + { + "entropy": 0.6346203163266182, + "epoch": 9.684345594525235, + "grad_norm": 0.5234375, + "learning_rate": 1.38928411621439e-08, + "loss": 0.0194, + "mean_token_accuracy": 0.995192863047123, + "num_tokens": 506814487.0, + "step": 4251 + }, + { + "entropy": 0.629237525165081, + "epoch": 9.686626746506986, + "grad_norm": 0.5390625, + "learning_rate": 1.3695261579316776e-08, + "loss": 0.0143, + "mean_token_accuracy": 0.9963223114609718, + "num_tokens": 506933366.0, + "step": 4252 + }, + { + "entropy": 0.6369072943925858, + "epoch": 9.688907898488736, + "grad_norm": 0.419921875, + "learning_rate": 1.3499093175059208e-08, + "loss": 0.0126, + "mean_token_accuracy": 0.9966975226998329, + "num_tokens": 507052524.0, + "step": 4253 + }, + { + "entropy": 0.6328678205609322, + "epoch": 9.691189050470488, + "grad_norm": 0.578125, + "learning_rate": 1.3304336060712685e-08, + "loss": 0.0172, + "mean_token_accuracy": 0.9959753900766373, + "num_tokens": 507171913.0, + "step": 4254 + }, + { + "entropy": 0.6362719535827637, + "epoch": 9.693470202452238, + "grad_norm": 0.53515625, + "learning_rate": 1.3110990346817676e-08, + "loss": 0.0171, + "mean_token_accuracy": 0.9936802014708519, + "num_tokens": 507290775.0, + "step": 4255 + }, + { + "entropy": 0.639990970492363, + "epoch": 9.69575135443399, + "grad_norm": 0.435546875, + "learning_rate": 1.2919056143113062e-08, + "loss": 0.0121, + "mean_token_accuracy": 0.9962001591920853, + "num_tokens": 507410235.0, + "step": 4256 + }, + { + "entropy": 0.6388968527317047, + "epoch": 9.698032506415739, + "grad_norm": 0.6171875, + "learning_rate": 1.2728533558537259e-08, + "loss": 0.012, + "mean_token_accuracy": 0.9963592439889908, + "num_tokens": 507529744.0, + "step": 4257 + }, + { + "entropy": 0.6365483403205872, + "epoch": 9.70031365839749, + "grad_norm": 0.5234375, + "learning_rate": 1.2539422701227099e-08, + "loss": 0.0167, + "mean_token_accuracy": 0.9938340559601784, + "num_tokens": 507649258.0, + "step": 4258 + }, + { + "entropy": 0.6352767273783684, + "epoch": 9.702594810379242, + "grad_norm": 0.427734375, + "learning_rate": 1.235172367851839e-08, + "loss": 0.0106, + "mean_token_accuracy": 0.9957899376749992, + "num_tokens": 507769298.0, + "step": 4259 + }, + { + "entropy": 0.6368807256221771, + "epoch": 9.704875962360992, + "grad_norm": 0.51171875, + "learning_rate": 1.2165436596945634e-08, + "loss": 0.016, + "mean_token_accuracy": 0.9949077740311623, + "num_tokens": 507888735.0, + "step": 4260 + }, + { + "entropy": 0.6302249506115913, + "epoch": 9.707157114342744, + "grad_norm": 0.482421875, + "learning_rate": 1.19805615622412e-08, + "loss": 0.0146, + "mean_token_accuracy": 0.9947085306048393, + "num_tokens": 508008653.0, + "step": 4261 + }, + { + "entropy": 0.6340556815266609, + "epoch": 9.709438266324494, + "grad_norm": 0.62109375, + "learning_rate": 1.179709867933726e-08, + "loss": 0.0216, + "mean_token_accuracy": 0.9947269558906555, + "num_tokens": 508128614.0, + "step": 4262 + }, + { + "entropy": 0.6382952034473419, + "epoch": 9.711719418306245, + "grad_norm": 0.51171875, + "learning_rate": 1.1615048052363298e-08, + "loss": 0.0129, + "mean_token_accuracy": 0.9950719252228737, + "num_tokens": 508247296.0, + "step": 4263 + }, + { + "entropy": 0.635470487177372, + "epoch": 9.714000570287995, + "grad_norm": 0.50390625, + "learning_rate": 1.1434409784648049e-08, + "loss": 0.0183, + "mean_token_accuracy": 0.9949280023574829, + "num_tokens": 508367115.0, + "step": 4264 + }, + { + "entropy": 0.6373466178774834, + "epoch": 9.716281722269747, + "grad_norm": 0.58203125, + "learning_rate": 1.125518397871811e-08, + "loss": 0.0195, + "mean_token_accuracy": 0.9938492253422737, + "num_tokens": 508486545.0, + "step": 4265 + }, + { + "entropy": 0.6336183547973633, + "epoch": 9.718562874251496, + "grad_norm": 0.609375, + "learning_rate": 1.1077370736298498e-08, + "loss": 0.0152, + "mean_token_accuracy": 0.9951714500784874, + "num_tokens": 508605634.0, + "step": 4266 + }, + { + "entropy": 0.6337263733148575, + "epoch": 9.720844026233248, + "grad_norm": 0.56640625, + "learning_rate": 1.090097015831293e-08, + "loss": 0.0246, + "mean_token_accuracy": 0.9928433001041412, + "num_tokens": 508725563.0, + "step": 4267 + }, + { + "entropy": 0.6397713124752045, + "epoch": 9.723125178214998, + "grad_norm": 0.51171875, + "learning_rate": 1.0725982344882701e-08, + "loss": 0.0123, + "mean_token_accuracy": 0.9955602511763573, + "num_tokens": 508844966.0, + "step": 4268 + }, + { + "entropy": 0.6351345777511597, + "epoch": 9.72540633019675, + "grad_norm": 0.59765625, + "learning_rate": 1.0552407395327813e-08, + "loss": 0.0244, + "mean_token_accuracy": 0.9943793341517448, + "num_tokens": 508964428.0, + "step": 4269 + }, + { + "entropy": 0.6348885372281075, + "epoch": 9.7276874821785, + "grad_norm": 0.416015625, + "learning_rate": 1.0380245408165846e-08, + "loss": 0.0124, + "mean_token_accuracy": 0.9964334890246391, + "num_tokens": 509083851.0, + "step": 4270 + }, + { + "entropy": 0.6334993988275528, + "epoch": 9.729968634160251, + "grad_norm": 0.482421875, + "learning_rate": 1.0209496481112247e-08, + "loss": 0.0194, + "mean_token_accuracy": 0.9950297847390175, + "num_tokens": 509203045.0, + "step": 4271 + }, + { + "entropy": 0.6301620900630951, + "epoch": 9.732249786142003, + "grad_norm": 0.451171875, + "learning_rate": 1.0040160711081437e-08, + "loss": 0.0148, + "mean_token_accuracy": 0.9957186505198479, + "num_tokens": 509322528.0, + "step": 4272 + }, + { + "entropy": 0.6366039291024208, + "epoch": 9.734530938123752, + "grad_norm": 0.515625, + "learning_rate": 9.87223819418487e-09, + "loss": 0.0182, + "mean_token_accuracy": 0.994105264544487, + "num_tokens": 509442781.0, + "step": 4273 + }, + { + "entropy": 0.6331815049052238, + "epoch": 9.736812090105504, + "grad_norm": 0.5, + "learning_rate": 9.705729025732135e-09, + "loss": 0.0137, + "mean_token_accuracy": 0.9948147311806679, + "num_tokens": 509562153.0, + "step": 4274 + }, + { + "entropy": 0.6348698735237122, + "epoch": 9.739093242087254, + "grad_norm": 0.498046875, + "learning_rate": 9.540633300230418e-09, + "loss": 0.0157, + "mean_token_accuracy": 0.994660884141922, + "num_tokens": 509681429.0, + "step": 4275 + }, + { + "entropy": 0.6368813663721085, + "epoch": 9.741374394069005, + "grad_norm": 0.55859375, + "learning_rate": 9.376951111385313e-09, + "loss": 0.0203, + "mean_token_accuracy": 0.9935576021671295, + "num_tokens": 509801384.0, + "step": 4276 + }, + { + "entropy": 0.637769065797329, + "epoch": 9.743655546050755, + "grad_norm": 0.73046875, + "learning_rate": 9.214682552099175e-09, + "loss": 0.0271, + "mean_token_accuracy": 0.9912835508584976, + "num_tokens": 509920783.0, + "step": 4277 + }, + { + "entropy": 0.6361338347196579, + "epoch": 9.745936698032507, + "grad_norm": 0.54296875, + "learning_rate": 9.053827714472773e-09, + "loss": 0.0108, + "mean_token_accuracy": 0.9962847828865051, + "num_tokens": 510040345.0, + "step": 4278 + }, + { + "entropy": 0.6321266368031502, + "epoch": 9.748217850014257, + "grad_norm": 0.52734375, + "learning_rate": 8.894386689804469e-09, + "loss": 0.0171, + "mean_token_accuracy": 0.9951008632779121, + "num_tokens": 510159106.0, + "step": 4279 + }, + { + "entropy": 0.6301240921020508, + "epoch": 9.750499001996008, + "grad_norm": 0.4921875, + "learning_rate": 8.73635956858937e-09, + "loss": 0.0169, + "mean_token_accuracy": 0.9943099543452263, + "num_tokens": 510277855.0, + "step": 4280 + }, + { + "entropy": 0.6307173147797585, + "epoch": 9.752780153977758, + "grad_norm": 0.55859375, + "learning_rate": 8.579746440520731e-09, + "loss": 0.0134, + "mean_token_accuracy": 0.9962006211280823, + "num_tokens": 510398289.0, + "step": 4281 + }, + { + "entropy": 0.6355291903018951, + "epoch": 9.75506130595951, + "grad_norm": 0.466796875, + "learning_rate": 8.424547394489668e-09, + "loss": 0.0178, + "mean_token_accuracy": 0.9942226931452751, + "num_tokens": 510517939.0, + "step": 4282 + }, + { + "entropy": 0.6280441209673882, + "epoch": 9.75734245794126, + "grad_norm": 0.75, + "learning_rate": 8.270762518583498e-09, + "loss": 0.0273, + "mean_token_accuracy": 0.9924489930272102, + "num_tokens": 510637268.0, + "step": 4283 + }, + { + "entropy": 0.6291334927082062, + "epoch": 9.759623609923011, + "grad_norm": 0.47265625, + "learning_rate": 8.118391900087952e-09, + "loss": 0.0166, + "mean_token_accuracy": 0.9937848523259163, + "num_tokens": 510756428.0, + "step": 4284 + }, + { + "entropy": 0.6349774524569511, + "epoch": 9.761904761904763, + "grad_norm": 0.58984375, + "learning_rate": 7.967435625485242e-09, + "loss": 0.0221, + "mean_token_accuracy": 0.9945520311594009, + "num_tokens": 510875773.0, + "step": 4285 + }, + { + "entropy": 0.6364731192588806, + "epoch": 9.764185913886513, + "grad_norm": 0.3984375, + "learning_rate": 7.81789378045572e-09, + "loss": 0.0145, + "mean_token_accuracy": 0.9953901544213295, + "num_tokens": 510995156.0, + "step": 4286 + }, + { + "entropy": 0.6353637129068375, + "epoch": 9.766467065868264, + "grad_norm": 0.6015625, + "learning_rate": 7.669766449876493e-09, + "loss": 0.0217, + "mean_token_accuracy": 0.9931512698531151, + "num_tokens": 511114134.0, + "step": 4287 + }, + { + "entropy": 0.6358574330806732, + "epoch": 9.768748217850014, + "grad_norm": 0.53125, + "learning_rate": 7.523053717821138e-09, + "loss": 0.0175, + "mean_token_accuracy": 0.9952821135520935, + "num_tokens": 511232752.0, + "step": 4288 + }, + { + "entropy": 0.6379683688282967, + "epoch": 9.771029369831766, + "grad_norm": 0.41796875, + "learning_rate": 7.377755667561659e-09, + "loss": 0.0132, + "mean_token_accuracy": 0.9950965344905853, + "num_tokens": 511352185.0, + "step": 4289 + }, + { + "entropy": 0.6363892406225204, + "epoch": 9.773310521813515, + "grad_norm": 0.498046875, + "learning_rate": 7.233872381565976e-09, + "loss": 0.0125, + "mean_token_accuracy": 0.9956540763378143, + "num_tokens": 511471498.0, + "step": 4290 + }, + { + "entropy": 0.6361528784036636, + "epoch": 9.775591673795267, + "grad_norm": 0.5078125, + "learning_rate": 7.091403941499597e-09, + "loss": 0.0131, + "mean_token_accuracy": 0.9953231960535049, + "num_tokens": 511590418.0, + "step": 4291 + }, + { + "entropy": 0.6351652815937996, + "epoch": 9.777872825777017, + "grad_norm": 0.474609375, + "learning_rate": 6.950350428225061e-09, + "loss": 0.0161, + "mean_token_accuracy": 0.9943402037024498, + "num_tokens": 511710224.0, + "step": 4292 + }, + { + "entropy": 0.6360719949007034, + "epoch": 9.780153977758768, + "grad_norm": 0.396484375, + "learning_rate": 6.810711921801105e-09, + "loss": 0.0149, + "mean_token_accuracy": 0.9953363016247749, + "num_tokens": 511829833.0, + "step": 4293 + }, + { + "entropy": 0.6341731324791908, + "epoch": 9.782435129740518, + "grad_norm": 0.625, + "learning_rate": 6.672488501484608e-09, + "loss": 0.0214, + "mean_token_accuracy": 0.9922474473714828, + "num_tokens": 511949810.0, + "step": 4294 + }, + { + "entropy": 0.6350837796926498, + "epoch": 9.78471628172227, + "grad_norm": 0.41796875, + "learning_rate": 6.535680245727816e-09, + "loss": 0.0135, + "mean_token_accuracy": 0.9947381839156151, + "num_tokens": 512069777.0, + "step": 4295 + }, + { + "entropy": 0.6373315006494522, + "epoch": 9.78699743370402, + "grad_norm": 0.498046875, + "learning_rate": 6.400287232180558e-09, + "loss": 0.0127, + "mean_token_accuracy": 0.9964157193899155, + "num_tokens": 512189796.0, + "step": 4296 + }, + { + "entropy": 0.6344960406422615, + "epoch": 9.789278585685771, + "grad_norm": 0.470703125, + "learning_rate": 6.266309537689696e-09, + "loss": 0.0153, + "mean_token_accuracy": 0.9961340054869652, + "num_tokens": 512309030.0, + "step": 4297 + }, + { + "entropy": 0.6326555609703064, + "epoch": 9.791559737667523, + "grad_norm": 0.55859375, + "learning_rate": 6.133747238298016e-09, + "loss": 0.0209, + "mean_token_accuracy": 0.9926702827215195, + "num_tokens": 512427758.0, + "step": 4298 + }, + { + "entropy": 0.6351582333445549, + "epoch": 9.793840889649273, + "grad_norm": 0.5390625, + "learning_rate": 6.002600409245607e-09, + "loss": 0.0175, + "mean_token_accuracy": 0.9949891641736031, + "num_tokens": 512548392.0, + "step": 4299 + }, + { + "entropy": 0.6370611861348152, + "epoch": 9.796122041631024, + "grad_norm": 0.57421875, + "learning_rate": 5.872869124968761e-09, + "loss": 0.0167, + "mean_token_accuracy": 0.99449672549963, + "num_tokens": 512667580.0, + "step": 4300 + }, + { + "entropy": 0.6404080912470818, + "epoch": 9.798403193612774, + "grad_norm": 0.546875, + "learning_rate": 5.7445534591002435e-09, + "loss": 0.0188, + "mean_token_accuracy": 0.9939380139112473, + "num_tokens": 512786939.0, + "step": 4301 + }, + { + "entropy": 0.6292253285646439, + "epoch": 9.800684345594526, + "grad_norm": 0.58203125, + "learning_rate": 5.617653484469576e-09, + "loss": 0.0202, + "mean_token_accuracy": 0.9946704506874084, + "num_tokens": 512905738.0, + "step": 4302 + }, + { + "entropy": 0.6361799612641335, + "epoch": 9.802965497576276, + "grad_norm": 0.455078125, + "learning_rate": 5.492169273103309e-09, + "loss": 0.0133, + "mean_token_accuracy": 0.9948587194085121, + "num_tokens": 513024735.0, + "step": 4303 + }, + { + "entropy": 0.6358808130025864, + "epoch": 9.805246649558027, + "grad_norm": 0.4140625, + "learning_rate": 5.368100896223083e-09, + "loss": 0.0124, + "mean_token_accuracy": 0.995994508266449, + "num_tokens": 513143570.0, + "step": 4304 + }, + { + "entropy": 0.6375257670879364, + "epoch": 9.807527801539777, + "grad_norm": 0.515625, + "learning_rate": 5.245448424248123e-09, + "loss": 0.0186, + "mean_token_accuracy": 0.9943975880742073, + "num_tokens": 513262430.0, + "step": 4305 + }, + { + "entropy": 0.6364579796791077, + "epoch": 9.809808953521529, + "grad_norm": 0.53515625, + "learning_rate": 5.124211926793577e-09, + "loss": 0.0124, + "mean_token_accuracy": 0.9954562783241272, + "num_tokens": 513381062.0, + "step": 4306 + }, + { + "entropy": 0.6378483772277832, + "epoch": 9.812090105503279, + "grad_norm": 0.482421875, + "learning_rate": 5.004391472670788e-09, + "loss": 0.0137, + "mean_token_accuracy": 0.9955693110823631, + "num_tokens": 513500610.0, + "step": 4307 + }, + { + "entropy": 0.6280825510621071, + "epoch": 9.81437125748503, + "grad_norm": 0.44921875, + "learning_rate": 4.885987129887859e-09, + "loss": 0.0111, + "mean_token_accuracy": 0.996827520430088, + "num_tokens": 513619500.0, + "step": 4308 + }, + { + "entropy": 0.6391116976737976, + "epoch": 9.81665240946678, + "grad_norm": 0.72265625, + "learning_rate": 4.768998965648253e-09, + "loss": 0.0199, + "mean_token_accuracy": 0.9935748875141144, + "num_tokens": 513738896.0, + "step": 4309 + }, + { + "entropy": 0.6353771686553955, + "epoch": 9.818933561448532, + "grad_norm": 0.55078125, + "learning_rate": 4.653427046352743e-09, + "loss": 0.0187, + "mean_token_accuracy": 0.9945207685232162, + "num_tokens": 513858192.0, + "step": 4310 + }, + { + "entropy": 0.6368462517857552, + "epoch": 9.821214713430283, + "grad_norm": 0.46484375, + "learning_rate": 4.53927143759747e-09, + "loss": 0.012, + "mean_token_accuracy": 0.9960479885339737, + "num_tokens": 513977719.0, + "step": 4311 + }, + { + "entropy": 0.6353224515914917, + "epoch": 9.823495865412033, + "grad_norm": 0.60546875, + "learning_rate": 4.426532204175049e-09, + "loss": 0.0143, + "mean_token_accuracy": 0.9950269386172295, + "num_tokens": 514097836.0, + "step": 4312 + }, + { + "entropy": 0.6379821300506592, + "epoch": 9.825777017393785, + "grad_norm": 0.51953125, + "learning_rate": 4.3152094100740175e-09, + "loss": 0.0124, + "mean_token_accuracy": 0.9951253980398178, + "num_tokens": 514217548.0, + "step": 4313 + }, + { + "entropy": 0.6363255754113197, + "epoch": 9.828058169375534, + "grad_norm": 0.5234375, + "learning_rate": 4.205303118479109e-09, + "loss": 0.0145, + "mean_token_accuracy": 0.9959533959627151, + "num_tokens": 514336628.0, + "step": 4314 + }, + { + "entropy": 0.6342978850007057, + "epoch": 9.830339321357286, + "grad_norm": 0.515625, + "learning_rate": 4.096813391770982e-09, + "loss": 0.0149, + "mean_token_accuracy": 0.9955312758684158, + "num_tokens": 514456648.0, + "step": 4315 + }, + { + "entropy": 0.6337519288063049, + "epoch": 9.832620473339036, + "grad_norm": 0.51953125, + "learning_rate": 3.989740291526212e-09, + "loss": 0.0104, + "mean_token_accuracy": 0.9968969896435738, + "num_tokens": 514575775.0, + "step": 4316 + }, + { + "entropy": 0.6360059902071953, + "epoch": 9.834901625320787, + "grad_norm": 0.67578125, + "learning_rate": 3.884083878517575e-09, + "loss": 0.02, + "mean_token_accuracy": 0.9951164051890373, + "num_tokens": 514695220.0, + "step": 4317 + }, + { + "entropy": 0.6342587247490883, + "epoch": 9.837182777302537, + "grad_norm": 0.63671875, + "learning_rate": 3.779844212713213e-09, + "loss": 0.0187, + "mean_token_accuracy": 0.9930863007903099, + "num_tokens": 514814489.0, + "step": 4318 + }, + { + "entropy": 0.6350710615515709, + "epoch": 9.839463929284289, + "grad_norm": 0.52734375, + "learning_rate": 3.6770213532782985e-09, + "loss": 0.0147, + "mean_token_accuracy": 0.9954249262809753, + "num_tokens": 514933723.0, + "step": 4319 + }, + { + "entropy": 0.6307850480079651, + "epoch": 9.841745081266039, + "grad_norm": 0.462890625, + "learning_rate": 3.5756153585725374e-09, + "loss": 0.0131, + "mean_token_accuracy": 0.9949779734015465, + "num_tokens": 515053197.0, + "step": 4320 + }, + { + "entropy": 0.6354513019323349, + "epoch": 9.84402623324779, + "grad_norm": 0.60546875, + "learning_rate": 3.475626286152112e-09, + "loss": 0.02, + "mean_token_accuracy": 0.9933178499341011, + "num_tokens": 515172673.0, + "step": 4321 + }, + { + "entropy": 0.6355723366141319, + "epoch": 9.84630738522954, + "grad_norm": 0.435546875, + "learning_rate": 3.3770541927691247e-09, + "loss": 0.0146, + "mean_token_accuracy": 0.9957782179117203, + "num_tokens": 515291975.0, + "step": 4322 + }, + { + "entropy": 0.6359907537698746, + "epoch": 9.848588537211292, + "grad_norm": 0.4609375, + "learning_rate": 3.2798991343707676e-09, + "loss": 0.0143, + "mean_token_accuracy": 0.9948391318321228, + "num_tokens": 515411440.0, + "step": 4323 + }, + { + "entropy": 0.6338881552219391, + "epoch": 9.850869689193043, + "grad_norm": 0.5546875, + "learning_rate": 3.1841611661007077e-09, + "loss": 0.0162, + "mean_token_accuracy": 0.9924850016832352, + "num_tokens": 515531163.0, + "step": 4324 + }, + { + "entropy": 0.6336909905076027, + "epoch": 9.853150841174793, + "grad_norm": 0.6328125, + "learning_rate": 3.089840342297701e-09, + "loss": 0.0208, + "mean_token_accuracy": 0.9941145852208138, + "num_tokens": 515650767.0, + "step": 4325 + }, + { + "entropy": 0.6357816606760025, + "epoch": 9.855431993156545, + "grad_norm": 0.484375, + "learning_rate": 2.9969367164969787e-09, + "loss": 0.0156, + "mean_token_accuracy": 0.9944195374846458, + "num_tokens": 515770451.0, + "step": 4326 + }, + { + "entropy": 0.6345168203115463, + "epoch": 9.857713145138295, + "grad_norm": 0.5390625, + "learning_rate": 2.905450341428029e-09, + "loss": 0.0162, + "mean_token_accuracy": 0.9944839552044868, + "num_tokens": 515889776.0, + "step": 4327 + }, + { + "entropy": 0.6290989518165588, + "epoch": 9.859994297120046, + "grad_norm": 0.4921875, + "learning_rate": 2.8153812690173697e-09, + "loss": 0.0118, + "mean_token_accuracy": 0.9955369159579277, + "num_tokens": 516008591.0, + "step": 4328 + }, + { + "entropy": 0.6335896104574203, + "epoch": 9.862275449101796, + "grad_norm": 0.396484375, + "learning_rate": 2.726729550386331e-09, + "loss": 0.0147, + "mean_token_accuracy": 0.9958271458745003, + "num_tokens": 516127793.0, + "step": 4329 + }, + { + "entropy": 0.6366668939590454, + "epoch": 9.864556601083548, + "grad_norm": 0.48046875, + "learning_rate": 2.6394952358518854e-09, + "loss": 0.0121, + "mean_token_accuracy": 0.9956038668751717, + "num_tokens": 516246391.0, + "step": 4330 + }, + { + "entropy": 0.6334044262766838, + "epoch": 9.866837753065298, + "grad_norm": 0.62109375, + "learning_rate": 2.553678374926649e-09, + "loss": 0.0177, + "mean_token_accuracy": 0.9940154924988747, + "num_tokens": 516366275.0, + "step": 4331 + }, + { + "entropy": 0.6344807669520378, + "epoch": 9.86911890504705, + "grad_norm": 0.5859375, + "learning_rate": 2.4692790163183268e-09, + "loss": 0.017, + "mean_token_accuracy": 0.9947279095649719, + "num_tokens": 516485708.0, + "step": 4332 + }, + { + "entropy": 0.6330255717039108, + "epoch": 9.871400057028799, + "grad_norm": 0.46484375, + "learning_rate": 2.3862972079305435e-09, + "loss": 0.0197, + "mean_token_accuracy": 0.9951571300625801, + "num_tokens": 516604920.0, + "step": 4333 + }, + { + "entropy": 0.6329288855195045, + "epoch": 9.87368120901055, + "grad_norm": 0.46484375, + "learning_rate": 2.3047329968620137e-09, + "loss": 0.0147, + "mean_token_accuracy": 0.9954222440719604, + "num_tokens": 516724195.0, + "step": 4334 + }, + { + "entropy": 0.6345260515809059, + "epoch": 9.8759623609923, + "grad_norm": 0.4375, + "learning_rate": 2.2245864294073715e-09, + "loss": 0.0129, + "mean_token_accuracy": 0.9957624971866608, + "num_tokens": 516843395.0, + "step": 4335 + }, + { + "entropy": 0.6383890211582184, + "epoch": 9.878243512974052, + "grad_norm": 0.53515625, + "learning_rate": 2.145857551056063e-09, + "loss": 0.0188, + "mean_token_accuracy": 0.9946229979395866, + "num_tokens": 516964190.0, + "step": 4336 + }, + { + "entropy": 0.6307515352964401, + "epoch": 9.880524664955804, + "grad_norm": 0.5078125, + "learning_rate": 2.0685464064928996e-09, + "loss": 0.0178, + "mean_token_accuracy": 0.9930268973112106, + "num_tokens": 517083185.0, + "step": 4337 + }, + { + "entropy": 0.6397590786218643, + "epoch": 9.882805816937553, + "grad_norm": 0.53515625, + "learning_rate": 1.992653039598613e-09, + "loss": 0.0171, + "mean_token_accuracy": 0.994856521487236, + "num_tokens": 517202681.0, + "step": 4338 + }, + { + "entropy": 0.6320808380842209, + "epoch": 9.885086968919305, + "grad_norm": 0.451171875, + "learning_rate": 1.91817749344847e-09, + "loss": 0.0143, + "mean_token_accuracy": 0.99614217877388, + "num_tokens": 517321643.0, + "step": 4339 + }, + { + "entropy": 0.6383469253778458, + "epoch": 9.887368120901055, + "grad_norm": 0.419921875, + "learning_rate": 1.8451198103133783e-09, + "loss": 0.0139, + "mean_token_accuracy": 0.9952416568994522, + "num_tokens": 517441103.0, + "step": 4340 + }, + { + "entropy": 0.6381924524903297, + "epoch": 9.889649272882806, + "grad_norm": 0.51953125, + "learning_rate": 1.7734800316596135e-09, + "loss": 0.0165, + "mean_token_accuracy": 0.9937506318092346, + "num_tokens": 517560537.0, + "step": 4341 + }, + { + "entropy": 0.6400131583213806, + "epoch": 9.891930424864556, + "grad_norm": 0.490234375, + "learning_rate": 1.703258198148261e-09, + "loss": 0.0121, + "mean_token_accuracy": 0.9956729784607887, + "num_tokens": 517679489.0, + "step": 4342 + }, + { + "entropy": 0.6362211257219315, + "epoch": 9.894211576846308, + "grad_norm": 0.52734375, + "learning_rate": 1.6344543496360499e-09, + "loss": 0.0188, + "mean_token_accuracy": 0.9943748936057091, + "num_tokens": 517799137.0, + "step": 4343 + }, + { + "entropy": 0.6368946060538292, + "epoch": 9.896492728828058, + "grad_norm": 0.41796875, + "learning_rate": 1.567068525175075e-09, + "loss": 0.0127, + "mean_token_accuracy": 0.9958680719137192, + "num_tokens": 517918420.0, + "step": 4344 + }, + { + "entropy": 0.632478728890419, + "epoch": 9.89877388080981, + "grad_norm": 0.703125, + "learning_rate": 1.5011007630114093e-09, + "loss": 0.0211, + "mean_token_accuracy": 0.9943599998950958, + "num_tokens": 518037521.0, + "step": 4345 + }, + { + "entropy": 0.6375446319580078, + "epoch": 9.90105503279156, + "grad_norm": 0.50390625, + "learning_rate": 1.4365511005878796e-09, + "loss": 0.0177, + "mean_token_accuracy": 0.9938899204134941, + "num_tokens": 518157863.0, + "step": 4346 + }, + { + "entropy": 0.6362556591629982, + "epoch": 9.90333618477331, + "grad_norm": 0.5546875, + "learning_rate": 1.3734195745412904e-09, + "loss": 0.0144, + "mean_token_accuracy": 0.9952046126127243, + "num_tokens": 518277541.0, + "step": 4347 + }, + { + "entropy": 0.6348711177706718, + "epoch": 9.90561733675506, + "grad_norm": 0.52734375, + "learning_rate": 1.3117062207038123e-09, + "loss": 0.0192, + "mean_token_accuracy": 0.9962498173117638, + "num_tokens": 518396925.0, + "step": 4348 + }, + { + "entropy": 0.6340797245502472, + "epoch": 9.907898488736812, + "grad_norm": 0.50390625, + "learning_rate": 1.2514110741029816e-09, + "loss": 0.0157, + "mean_token_accuracy": 0.9949052259325981, + "num_tokens": 518516444.0, + "step": 4349 + }, + { + "entropy": 0.6391857489943504, + "epoch": 9.910179640718562, + "grad_norm": 0.5078125, + "learning_rate": 1.1925341689608682e-09, + "loss": 0.0136, + "mean_token_accuracy": 0.995218500494957, + "num_tokens": 518636421.0, + "step": 4350 + }, + { + "entropy": 0.6353739574551582, + "epoch": 9.912460792700314, + "grad_norm": 0.57421875, + "learning_rate": 1.135075538695185e-09, + "loss": 0.0121, + "mean_token_accuracy": 0.9954201653599739, + "num_tokens": 518755073.0, + "step": 4351 + }, + { + "entropy": 0.6317922249436378, + "epoch": 9.914741944682065, + "grad_norm": 0.59375, + "learning_rate": 1.0790352159179007e-09, + "loss": 0.014, + "mean_token_accuracy": 0.9962163493037224, + "num_tokens": 518874238.0, + "step": 4352 + }, + { + "entropy": 0.6307963952422142, + "epoch": 9.917023096663815, + "grad_norm": 0.5625, + "learning_rate": 1.024413232436905e-09, + "loss": 0.0198, + "mean_token_accuracy": 0.9955975860357285, + "num_tokens": 518993430.0, + "step": 4353 + }, + { + "entropy": 0.6354452073574066, + "epoch": 9.919304248645567, + "grad_norm": 0.60546875, + "learning_rate": 9.71209619254343e-10, + "loss": 0.0179, + "mean_token_accuracy": 0.9950416013598442, + "num_tokens": 519112087.0, + "step": 4354 + }, + { + "entropy": 0.6343529522418976, + "epoch": 9.921585400627317, + "grad_norm": 0.5625, + "learning_rate": 9.194244065674484e-10, + "loss": 0.0251, + "mean_token_accuracy": 0.9922114685177803, + "num_tokens": 519231314.0, + "step": 4355 + }, + { + "entropy": 0.6397521868348122, + "epoch": 9.923866552609068, + "grad_norm": 0.443359375, + "learning_rate": 8.690576237688208e-10, + "loss": 0.0122, + "mean_token_accuracy": 0.996030755341053, + "num_tokens": 519351366.0, + "step": 4356 + }, + { + "entropy": 0.6356499716639519, + "epoch": 9.926147704590818, + "grad_norm": 0.4609375, + "learning_rate": 8.201092994453153e-10, + "loss": 0.0128, + "mean_token_accuracy": 0.9962072670459747, + "num_tokens": 519470279.0, + "step": 4357 + }, + { + "entropy": 0.6328211799263954, + "epoch": 9.92842885657257, + "grad_norm": 0.4453125, + "learning_rate": 7.725794613791527e-10, + "loss": 0.0142, + "mean_token_accuracy": 0.9962450638413429, + "num_tokens": 519589832.0, + "step": 4358 + }, + { + "entropy": 0.6350522562861443, + "epoch": 9.93071000855432, + "grad_norm": 0.5390625, + "learning_rate": 7.264681365476422e-10, + "loss": 0.022, + "mean_token_accuracy": 0.991623692214489, + "num_tokens": 519709809.0, + "step": 4359 + }, + { + "entropy": 0.6348958760499954, + "epoch": 9.932991160536071, + "grad_norm": 0.388671875, + "learning_rate": 6.817753511226266e-10, + "loss": 0.0105, + "mean_token_accuracy": 0.9967608228325844, + "num_tokens": 519829536.0, + "step": 4360 + }, + { + "entropy": 0.632507286965847, + "epoch": 9.93527231251782, + "grad_norm": 0.451171875, + "learning_rate": 6.385011304704814e-10, + "loss": 0.0148, + "mean_token_accuracy": 0.9945643171668053, + "num_tokens": 519948756.0, + "step": 4361 + }, + { + "entropy": 0.6363910287618637, + "epoch": 9.937553464499572, + "grad_norm": 0.50390625, + "learning_rate": 5.96645499152948e-10, + "loss": 0.0116, + "mean_token_accuracy": 0.9965335801243782, + "num_tokens": 520068046.0, + "step": 4362 + }, + { + "entropy": 0.638060100376606, + "epoch": 9.939834616481322, + "grad_norm": 0.44921875, + "learning_rate": 5.562084809268564e-10, + "loss": 0.0133, + "mean_token_accuracy": 0.995528869330883, + "num_tokens": 520187452.0, + "step": 4363 + }, + { + "entropy": 0.6317355260252953, + "epoch": 9.942115768463074, + "grad_norm": 0.54296875, + "learning_rate": 5.171900987430146e-10, + "loss": 0.0214, + "mean_token_accuracy": 0.9927324280142784, + "num_tokens": 520306804.0, + "step": 4364 + }, + { + "entropy": 0.6319392919540405, + "epoch": 9.944396920444824, + "grad_norm": 0.6484375, + "learning_rate": 4.795903747475961e-10, + "loss": 0.0193, + "mean_token_accuracy": 0.993607334792614, + "num_tokens": 520425977.0, + "step": 4365 + }, + { + "entropy": 0.634120523929596, + "epoch": 9.946678072426575, + "grad_norm": 0.66796875, + "learning_rate": 4.434093302815856e-10, + "loss": 0.0207, + "mean_token_accuracy": 0.9933065548539162, + "num_tokens": 520545260.0, + "step": 4366 + }, + { + "entropy": 0.632045716047287, + "epoch": 9.948959224408327, + "grad_norm": 0.51171875, + "learning_rate": 4.0864698588077844e-10, + "loss": 0.0147, + "mean_token_accuracy": 0.994079940021038, + "num_tokens": 520664029.0, + "step": 4367 + }, + { + "entropy": 0.6334119364619255, + "epoch": 9.951240376390077, + "grad_norm": 0.490234375, + "learning_rate": 3.7530336127550306e-10, + "loss": 0.012, + "mean_token_accuracy": 0.9966258853673935, + "num_tokens": 520782958.0, + "step": 4368 + }, + { + "entropy": 0.6263623163104057, + "epoch": 9.953521528371828, + "grad_norm": 0.59765625, + "learning_rate": 3.4337847539089866e-10, + "loss": 0.0192, + "mean_token_accuracy": 0.9944785609841347, + "num_tokens": 520902382.0, + "step": 4369 + }, + { + "entropy": 0.633624941110611, + "epoch": 9.955802680353578, + "grad_norm": 0.47265625, + "learning_rate": 3.1287234634663766e-10, + "loss": 0.0147, + "mean_token_accuracy": 0.9943075180053711, + "num_tokens": 521021596.0, + "step": 4370 + }, + { + "entropy": 0.6345456540584564, + "epoch": 9.95808383233533, + "grad_norm": 0.44140625, + "learning_rate": 2.8378499145803593e-10, + "loss": 0.0196, + "mean_token_accuracy": 0.9943880885839462, + "num_tokens": 521141067.0, + "step": 4371 + }, + { + "entropy": 0.6369287371635437, + "epoch": 9.96036498431708, + "grad_norm": 0.50390625, + "learning_rate": 2.5611642723410987e-10, + "loss": 0.012, + "mean_token_accuracy": 0.9963438510894775, + "num_tokens": 521260274.0, + "step": 4372 + }, + { + "entropy": 0.6283308416604996, + "epoch": 9.962646136298831, + "grad_norm": 0.56640625, + "learning_rate": 2.2986666937896418e-10, + "loss": 0.0165, + "mean_token_accuracy": 0.9942722544074059, + "num_tokens": 521379458.0, + "step": 4373 + }, + { + "entropy": 0.6341523230075836, + "epoch": 9.964927288280581, + "grad_norm": 0.546875, + "learning_rate": 2.050357327917918e-10, + "loss": 0.0173, + "mean_token_accuracy": 0.9941214770078659, + "num_tokens": 521499205.0, + "step": 4374 + }, + { + "entropy": 0.636444516479969, + "epoch": 9.967208440262333, + "grad_norm": 0.40625, + "learning_rate": 1.816236315657638e-10, + "loss": 0.0123, + "mean_token_accuracy": 0.9948192685842514, + "num_tokens": 521619479.0, + "step": 4375 + }, + { + "entropy": 0.6401667073369026, + "epoch": 9.969489592244082, + "grad_norm": 0.3984375, + "learning_rate": 1.5963037898913957e-10, + "loss": 0.0132, + "mean_token_accuracy": 0.9967956766486168, + "num_tokens": 521739437.0, + "step": 4376 + }, + { + "entropy": 0.6332905292510986, + "epoch": 9.971770744225834, + "grad_norm": 0.58203125, + "learning_rate": 1.3905598754526684e-10, + "loss": 0.0181, + "mean_token_accuracy": 0.9938467517495155, + "num_tokens": 521859806.0, + "step": 4377 + }, + { + "entropy": 0.6329859495162964, + "epoch": 9.974051896207584, + "grad_norm": 0.65625, + "learning_rate": 1.1990046891147133e-10, + "loss": 0.0115, + "mean_token_accuracy": 0.9955286607146263, + "num_tokens": 521980085.0, + "step": 4378 + }, + { + "entropy": 0.6318459138274193, + "epoch": 9.976333048189336, + "grad_norm": 0.6328125, + "learning_rate": 1.021638339598896e-10, + "loss": 0.0218, + "mean_token_accuracy": 0.9940021112561226, + "num_tokens": 522099347.0, + "step": 4379 + }, + { + "entropy": 0.6314427107572556, + "epoch": 9.978614200171087, + "grad_norm": 0.59375, + "learning_rate": 8.584609275802402e-11, + "loss": 0.0215, + "mean_token_accuracy": 0.9929032847285271, + "num_tokens": 522218554.0, + "step": 4380 + }, + { + "entropy": 0.6355159729719162, + "epoch": 9.980895352152837, + "grad_norm": 0.59375, + "learning_rate": 7.094725456707752e-11, + "loss": 0.0207, + "mean_token_accuracy": 0.9933439865708351, + "num_tokens": 522337742.0, + "step": 4381 + }, + { + "entropy": 0.6317205503582954, + "epoch": 9.983176504134589, + "grad_norm": 0.4296875, + "learning_rate": 5.746732784361886e-11, + "loss": 0.0123, + "mean_token_accuracy": 0.9960421919822693, + "num_tokens": 522456684.0, + "step": 4382 + }, + { + "entropy": 0.6361135467886925, + "epoch": 9.985457656116338, + "grad_norm": 0.42578125, + "learning_rate": 4.540632023819491e-11, + "loss": 0.0125, + "mean_token_accuracy": 0.995446652173996, + "num_tokens": 522576181.0, + "step": 4383 + }, + { + "entropy": 0.6361514180898666, + "epoch": 9.98773880809809, + "grad_norm": 0.62109375, + "learning_rate": 3.47642385967184e-11, + "loss": 0.0198, + "mean_token_accuracy": 0.9935339912772179, + "num_tokens": 522695184.0, + "step": 4384 + }, + { + "entropy": 0.6331942826509476, + "epoch": 9.99001996007984, + "grad_norm": 0.5390625, + "learning_rate": 2.5541088959357697e-11, + "loss": 0.0194, + "mean_token_accuracy": 0.9935000389814377, + "num_tokens": 522814760.0, + "step": 4385 + }, + { + "entropy": 0.6366120204329491, + "epoch": 9.992301112061591, + "grad_norm": 0.5, + "learning_rate": 1.773687656109191e-11, + "loss": 0.0164, + "mean_token_accuracy": 0.9950039833784103, + "num_tokens": 522933627.0, + "step": 4386 + }, + { + "entropy": 0.6330078840255737, + "epoch": 9.994582264043341, + "grad_norm": 0.5078125, + "learning_rate": 1.1351605831433354e-11, + "loss": 0.0119, + "mean_token_accuracy": 0.9949922114610672, + "num_tokens": 523052670.0, + "step": 4387 + }, + { + "entropy": 0.6331177055835724, + "epoch": 9.996863416025093, + "grad_norm": 0.46484375, + "learning_rate": 6.385280394149984e-12, + "loss": 0.0113, + "mean_token_accuracy": 0.9970112442970276, + "num_tokens": 523171767.0, + "step": 4388 + }, + { + "entropy": 0.6268584951758385, + "epoch": 9.999144568006843, + "grad_norm": 0.50390625, + "learning_rate": 2.8379030686531696e-12, + "loss": 0.0152, + "mean_token_accuracy": 0.993193581700325, + "num_tokens": 523291375.0, + "step": 4389 + }, + { + "entropy": 0.6333612203598022, + "epoch": 10.0, + "grad_norm": 0.6796875, + "learning_rate": 7.094758677772539e-13, + "loss": 0.0116, + "mean_token_accuracy": 0.995600163936615, + "num_tokens": 523334780.0, + "step": 4390 + } + ], + "logging_steps": 1, + "max_steps": 4390, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.276140012720427e+19, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}