diff --git "a/checkpoint-20117/trainer_state.json" "b/checkpoint-20117/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-20117/trainer_state.json" @@ -0,0 +1,22155 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.999950290798827, + "eval_steps": 500, + "global_step": 20117, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000994184023462743, + "grad_norm": 1.0385397672653198, + "learning_rate": 1.8e-05, + "loss": 0.604, + "memory/device_reserved (GiB)": 21.62, + "memory/max_active (GiB)": 20.62, + "memory/max_allocated (GiB)": 20.62, + "step": 10, + "tokens_per_second_per_gpu": 347.46 + }, + { + "epoch": 0.001988368046925486, + "grad_norm": 0.556696891784668, + "learning_rate": 3.8e-05, + "loss": 0.4244, + "memory/device_reserved (GiB)": 21.62, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 20, + "tokens_per_second_per_gpu": 318.15 + }, + { + "epoch": 0.002982552070388229, + "grad_norm": 0.24665255844593048, + "learning_rate": 5.8e-05, + "loss": 0.3883, + "memory/device_reserved (GiB)": 21.62, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 30, + "tokens_per_second_per_gpu": 330.79 + }, + { + "epoch": 0.003976736093850972, + "grad_norm": 0.3350813090801239, + "learning_rate": 7.800000000000001e-05, + "loss": 0.4163, + "memory/device_reserved (GiB)": 22.38, + "memory/max_active (GiB)": 21.37, + "memory/max_allocated (GiB)": 21.37, + "step": 40, + "tokens_per_second_per_gpu": 396.14 + }, + { + "epoch": 0.0049709201173137145, + "grad_norm": 0.42506587505340576, + "learning_rate": 9.8e-05, + "loss": 0.3811, + "memory/device_reserved (GiB)": 22.42, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 50, + "tokens_per_second_per_gpu": 402.54 + }, + { + "epoch": 0.005965104140776458, + "grad_norm": 0.5153183937072754, + "learning_rate": 0.000118, + "loss": 0.418, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 60, + "tokens_per_second_per_gpu": 330.44 + }, + { + "epoch": 0.0069592881642392005, + "grad_norm": 0.3010534644126892, + "learning_rate": 0.000138, + "loss": 0.3671, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 70, + "tokens_per_second_per_gpu": 344.47 + }, + { + "epoch": 0.007953472187701944, + "grad_norm": 0.46113327145576477, + "learning_rate": 0.00015800000000000002, + "loss": 0.3387, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 80, + "tokens_per_second_per_gpu": 372.27 + }, + { + "epoch": 0.008947656211164686, + "grad_norm": 0.4268002212047577, + "learning_rate": 0.00017800000000000002, + "loss": 0.2999, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 90, + "tokens_per_second_per_gpu": 344.49 + }, + { + "epoch": 0.009941840234627429, + "grad_norm": 0.5650917291641235, + "learning_rate": 0.00019800000000000002, + "loss": 0.3356, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 100, + "tokens_per_second_per_gpu": 335.57 + }, + { + "epoch": 0.010936024258090173, + "grad_norm": 0.2521424889564514, + "learning_rate": 0.00019999990023993625, + "loss": 0.3025, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 110, + "tokens_per_second_per_gpu": 307.78 + }, + { + "epoch": 0.011930208281552916, + "grad_norm": 0.34742406010627747, + "learning_rate": 0.00019999955539058868, + "loss": 0.351, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 120, + "tokens_per_second_per_gpu": 355.93 + }, + { + "epoch": 0.012924392305015658, + "grad_norm": 0.2816642224788666, + "learning_rate": 0.00019999896422120075, + "loss": 0.4031, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 130, + "tokens_per_second_per_gpu": 392.82 + }, + { + "epoch": 0.013918576328478401, + "grad_norm": 0.41705670952796936, + "learning_rate": 0.0001999981267332287, + "loss": 0.3481, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 140, + "tokens_per_second_per_gpu": 404.67 + }, + { + "epoch": 0.014912760351941143, + "grad_norm": 0.5290879011154175, + "learning_rate": 0.00019999704292873545, + "loss": 0.3784, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 150, + "tokens_per_second_per_gpu": 409.59 + }, + { + "epoch": 0.015906944375403888, + "grad_norm": 0.2704632878303528, + "learning_rate": 0.0001999957128103906, + "loss": 0.2029, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 160, + "tokens_per_second_per_gpu": 309.25 + }, + { + "epoch": 0.01690112839886663, + "grad_norm": 0.3863286077976227, + "learning_rate": 0.00019999413638147049, + "loss": 0.3084, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 170, + "tokens_per_second_per_gpu": 395.0 + }, + { + "epoch": 0.017895312422329373, + "grad_norm": 0.32178717851638794, + "learning_rate": 0.00019999231364585827, + "loss": 0.2713, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 180, + "tokens_per_second_per_gpu": 343.31 + }, + { + "epoch": 0.018889496445792117, + "grad_norm": 0.3010699450969696, + "learning_rate": 0.00019999024460804366, + "loss": 0.353, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 190, + "tokens_per_second_per_gpu": 353.97 + }, + { + "epoch": 0.019883680469254858, + "grad_norm": 0.2678498327732086, + "learning_rate": 0.00019998792927312315, + "loss": 0.1904, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 200, + "tokens_per_second_per_gpu": 352.72 + }, + { + "epoch": 0.020877864492717602, + "grad_norm": 0.25298821926116943, + "learning_rate": 0.00019998536764679993, + "loss": 0.2397, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 210, + "tokens_per_second_per_gpu": 336.24 + }, + { + "epoch": 0.021872048516180347, + "grad_norm": 0.4027327001094818, + "learning_rate": 0.0001999825597353838, + "loss": 0.3111, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 220, + "tokens_per_second_per_gpu": 388.05 + }, + { + "epoch": 0.022866232539643087, + "grad_norm": 0.4060591757297516, + "learning_rate": 0.00019997950554579124, + "loss": 0.2578, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 230, + "tokens_per_second_per_gpu": 354.57 + }, + { + "epoch": 0.02386041656310583, + "grad_norm": 0.29408156871795654, + "learning_rate": 0.00019997620508554537, + "loss": 0.2952, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.45, + "memory/max_allocated (GiB)": 20.45, + "step": 240, + "tokens_per_second_per_gpu": 334.85 + }, + { + "epoch": 0.024854600586568572, + "grad_norm": 0.381528377532959, + "learning_rate": 0.00019997265836277595, + "loss": 0.2397, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 250, + "tokens_per_second_per_gpu": 402.73 + }, + { + "epoch": 0.025848784610031317, + "grad_norm": 0.30223792791366577, + "learning_rate": 0.00019996886538621925, + "loss": 0.4017, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 260, + "tokens_per_second_per_gpu": 430.9 + }, + { + "epoch": 0.02684296863349406, + "grad_norm": 0.3889918327331543, + "learning_rate": 0.0001999648261652182, + "loss": 0.2051, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 270, + "tokens_per_second_per_gpu": 279.64 + }, + { + "epoch": 0.027837152656956802, + "grad_norm": 0.4357030391693115, + "learning_rate": 0.00019996054070972225, + "loss": 0.3332, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 280, + "tokens_per_second_per_gpu": 362.33 + }, + { + "epoch": 0.028831336680419546, + "grad_norm": 0.3736005425453186, + "learning_rate": 0.00019995600903028742, + "loss": 0.3052, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 290, + "tokens_per_second_per_gpu": 362.89 + }, + { + "epoch": 0.029825520703882287, + "grad_norm": 0.39748865365982056, + "learning_rate": 0.00019995123113807615, + "loss": 0.361, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 300, + "tokens_per_second_per_gpu": 387.24 + }, + { + "epoch": 0.03081970472734503, + "grad_norm": 0.18977899849414825, + "learning_rate": 0.00019994620704485741, + "loss": 0.2449, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 310, + "tokens_per_second_per_gpu": 339.16 + }, + { + "epoch": 0.031813888750807776, + "grad_norm": 0.3898354172706604, + "learning_rate": 0.00019994093676300662, + "loss": 0.266, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 320, + "tokens_per_second_per_gpu": 360.62 + }, + { + "epoch": 0.032808072774270516, + "grad_norm": 0.3335312008857727, + "learning_rate": 0.00019993542030550553, + "loss": 0.2886, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 330, + "tokens_per_second_per_gpu": 386.95 + }, + { + "epoch": 0.03380225679773326, + "grad_norm": 0.3043772280216217, + "learning_rate": 0.00019992965768594244, + "loss": 0.2542, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 340, + "tokens_per_second_per_gpu": 400.18 + }, + { + "epoch": 0.034796440821196005, + "grad_norm": 0.35784608125686646, + "learning_rate": 0.00019992364891851185, + "loss": 0.2748, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.66, + "memory/max_allocated (GiB)": 19.66, + "step": 350, + "tokens_per_second_per_gpu": 333.64 + }, + { + "epoch": 0.035790624844658746, + "grad_norm": 0.5068204998970032, + "learning_rate": 0.00019991739401801464, + "loss": 0.2705, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 18.84, + "memory/max_allocated (GiB)": 18.84, + "step": 360, + "tokens_per_second_per_gpu": 367.07 + }, + { + "epoch": 0.03678480886812149, + "grad_norm": 0.44382113218307495, + "learning_rate": 0.00019991089299985793, + "loss": 0.2403, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 370, + "tokens_per_second_per_gpu": 365.0 + }, + { + "epoch": 0.037778992891584234, + "grad_norm": 0.27181145548820496, + "learning_rate": 0.0001999041458800551, + "loss": 0.3065, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 18.18, + "memory/max_allocated (GiB)": 18.18, + "step": 380, + "tokens_per_second_per_gpu": 372.54 + }, + { + "epoch": 0.038773176915046975, + "grad_norm": 0.28408923745155334, + "learning_rate": 0.00019989715267522575, + "loss": 0.2894, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 390, + "tokens_per_second_per_gpu": 340.31 + }, + { + "epoch": 0.039767360938509716, + "grad_norm": 0.4882698357105255, + "learning_rate": 0.00019988991340259563, + "loss": 0.4061, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 400, + "tokens_per_second_per_gpu": 358.94 + }, + { + "epoch": 0.040761544961972464, + "grad_norm": 0.2663392722606659, + "learning_rate": 0.0001998824280799966, + "loss": 0.3141, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 410, + "tokens_per_second_per_gpu": 368.76 + }, + { + "epoch": 0.041755728985435205, + "grad_norm": 0.25356051325798035, + "learning_rate": 0.00019987469672586654, + "loss": 0.3374, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 420, + "tokens_per_second_per_gpu": 402.99 + }, + { + "epoch": 0.042749913008897945, + "grad_norm": 0.4773045778274536, + "learning_rate": 0.00019986671935924946, + "loss": 0.2929, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 430, + "tokens_per_second_per_gpu": 360.32 + }, + { + "epoch": 0.04374409703236069, + "grad_norm": 0.37164929509162903, + "learning_rate": 0.0001998584959997953, + "loss": 0.3106, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 440, + "tokens_per_second_per_gpu": 415.21 + }, + { + "epoch": 0.044738281055823434, + "grad_norm": 0.3310747742652893, + "learning_rate": 0.00019985002666775986, + "loss": 0.2676, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 450, + "tokens_per_second_per_gpu": 363.95 + }, + { + "epoch": 0.045732465079286175, + "grad_norm": 0.32523512840270996, + "learning_rate": 0.000199841311384005, + "loss": 0.3139, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 460, + "tokens_per_second_per_gpu": 361.92 + }, + { + "epoch": 0.046726649102748916, + "grad_norm": 0.40525123476982117, + "learning_rate": 0.00019983235016999827, + "loss": 0.323, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 18.84, + "memory/max_allocated (GiB)": 18.84, + "step": 470, + "tokens_per_second_per_gpu": 413.28 + }, + { + "epoch": 0.04772083312621166, + "grad_norm": 0.4233141541481018, + "learning_rate": 0.000199823143047813, + "loss": 0.2941, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 480, + "tokens_per_second_per_gpu": 383.95 + }, + { + "epoch": 0.048715017149674404, + "grad_norm": 0.21106044948101044, + "learning_rate": 0.0001998136900401283, + "loss": 0.2835, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 490, + "tokens_per_second_per_gpu": 368.99 + }, + { + "epoch": 0.049709201173137145, + "grad_norm": 0.34198832511901855, + "learning_rate": 0.00019980399117022895, + "loss": 0.3895, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 500, + "tokens_per_second_per_gpu": 387.18 + }, + { + "epoch": 0.05070338519659989, + "grad_norm": 0.44045203924179077, + "learning_rate": 0.00019979404646200527, + "loss": 0.2854, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 510, + "tokens_per_second_per_gpu": 336.09 + }, + { + "epoch": 0.051697569220062634, + "grad_norm": 0.33906373381614685, + "learning_rate": 0.0001997838559399532, + "loss": 0.3218, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 520, + "tokens_per_second_per_gpu": 406.12 + }, + { + "epoch": 0.052691753243525374, + "grad_norm": 0.32613444328308105, + "learning_rate": 0.00019977341962917414, + "loss": 0.2803, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 530, + "tokens_per_second_per_gpu": 407.45 + }, + { + "epoch": 0.05368593726698812, + "grad_norm": 0.3789099454879761, + "learning_rate": 0.00019976273755537499, + "loss": 0.3143, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 540, + "tokens_per_second_per_gpu": 377.61 + }, + { + "epoch": 0.05468012129045086, + "grad_norm": 0.4602185785770416, + "learning_rate": 0.00019975180974486786, + "loss": 0.2434, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 550, + "tokens_per_second_per_gpu": 349.16 + }, + { + "epoch": 0.055674305313913604, + "grad_norm": 0.4232983887195587, + "learning_rate": 0.00019974063622457032, + "loss": 0.3238, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 18.84, + "memory/max_allocated (GiB)": 18.84, + "step": 560, + "tokens_per_second_per_gpu": 348.84 + }, + { + "epoch": 0.05666848933737635, + "grad_norm": 0.16223137080669403, + "learning_rate": 0.0001997292170220051, + "loss": 0.2722, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 570, + "tokens_per_second_per_gpu": 340.57 + }, + { + "epoch": 0.05766267336083909, + "grad_norm": 0.4484419822692871, + "learning_rate": 0.00019971755216530008, + "loss": 0.2801, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.53, + "memory/max_allocated (GiB)": 20.53, + "step": 580, + "tokens_per_second_per_gpu": 353.53 + }, + { + "epoch": 0.05865685738430183, + "grad_norm": 0.23834413290023804, + "learning_rate": 0.0001997056416831883, + "loss": 0.3015, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 590, + "tokens_per_second_per_gpu": 365.09 + }, + { + "epoch": 0.059651041407764574, + "grad_norm": 0.4154009521007538, + "learning_rate": 0.0001996934856050078, + "loss": 0.2959, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 600, + "tokens_per_second_per_gpu": 365.21 + }, + { + "epoch": 0.06064522543122732, + "grad_norm": 0.23120558261871338, + "learning_rate": 0.00019968108396070157, + "loss": 0.2563, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 610, + "tokens_per_second_per_gpu": 397.04 + }, + { + "epoch": 0.06163940945469006, + "grad_norm": 0.4453487694263458, + "learning_rate": 0.00019966843678081745, + "loss": 0.3025, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 620, + "tokens_per_second_per_gpu": 380.75 + }, + { + "epoch": 0.06263359347815281, + "grad_norm": 0.47098028659820557, + "learning_rate": 0.0001996555440965081, + "loss": 0.2248, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 630, + "tokens_per_second_per_gpu": 367.41 + }, + { + "epoch": 0.06362777750161555, + "grad_norm": 0.2540164887905121, + "learning_rate": 0.000199642405939531, + "loss": 0.2597, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 640, + "tokens_per_second_per_gpu": 361.69 + }, + { + "epoch": 0.06462196152507829, + "grad_norm": 0.30327877402305603, + "learning_rate": 0.00019962902234224816, + "loss": 0.2623, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 650, + "tokens_per_second_per_gpu": 316.43 + }, + { + "epoch": 0.06561614554854103, + "grad_norm": 0.3211521804332733, + "learning_rate": 0.00019961539333762622, + "loss": 0.2571, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 660, + "tokens_per_second_per_gpu": 352.04 + }, + { + "epoch": 0.06661032957200377, + "grad_norm": 0.19880682229995728, + "learning_rate": 0.00019960151895923628, + "loss": 0.2531, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 670, + "tokens_per_second_per_gpu": 419.33 + }, + { + "epoch": 0.06760451359546651, + "grad_norm": 0.3732224702835083, + "learning_rate": 0.0001995873992412539, + "loss": 0.3275, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 680, + "tokens_per_second_per_gpu": 428.82 + }, + { + "epoch": 0.06859869761892927, + "grad_norm": 0.2961219847202301, + "learning_rate": 0.00019957303421845889, + "loss": 0.2884, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 690, + "tokens_per_second_per_gpu": 394.73 + }, + { + "epoch": 0.06959288164239201, + "grad_norm": 0.4014001488685608, + "learning_rate": 0.00019955842392623539, + "loss": 0.25, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.77, + "memory/max_allocated (GiB)": 19.77, + "step": 700, + "tokens_per_second_per_gpu": 402.5 + }, + { + "epoch": 0.07058706566585475, + "grad_norm": 0.3465085029602051, + "learning_rate": 0.0001995435684005716, + "loss": 0.3208, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 710, + "tokens_per_second_per_gpu": 418.41 + }, + { + "epoch": 0.07158124968931749, + "grad_norm": 0.302223265171051, + "learning_rate": 0.0001995284676780598, + "loss": 0.2538, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 720, + "tokens_per_second_per_gpu": 303.27 + }, + { + "epoch": 0.07257543371278023, + "grad_norm": 0.27174416184425354, + "learning_rate": 0.00019951312179589632, + "loss": 0.2559, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 730, + "tokens_per_second_per_gpu": 373.05 + }, + { + "epoch": 0.07356961773624297, + "grad_norm": 0.23477095365524292, + "learning_rate": 0.00019949753079188124, + "loss": 0.2655, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 740, + "tokens_per_second_per_gpu": 334.0 + }, + { + "epoch": 0.07456380175970573, + "grad_norm": 0.35739773511886597, + "learning_rate": 0.00019948169470441855, + "loss": 0.2869, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.58, + "memory/max_allocated (GiB)": 20.58, + "step": 750, + "tokens_per_second_per_gpu": 398.33 + }, + { + "epoch": 0.07555798578316847, + "grad_norm": 0.42109552025794983, + "learning_rate": 0.0001994656135725159, + "loss": 0.254, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.69, + "memory/max_allocated (GiB)": 19.69, + "step": 760, + "tokens_per_second_per_gpu": 303.95 + }, + { + "epoch": 0.07655216980663121, + "grad_norm": 0.5730820298194885, + "learning_rate": 0.00019944928743578446, + "loss": 0.2718, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.09, + "memory/max_allocated (GiB)": 19.09, + "step": 770, + "tokens_per_second_per_gpu": 379.86 + }, + { + "epoch": 0.07754635383009395, + "grad_norm": 0.3591574430465698, + "learning_rate": 0.000199432716334439, + "loss": 0.3039, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 780, + "tokens_per_second_per_gpu": 387.31 + }, + { + "epoch": 0.07854053785355669, + "grad_norm": 0.2447095662355423, + "learning_rate": 0.0001994159003092976, + "loss": 0.3044, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 790, + "tokens_per_second_per_gpu": 332.59 + }, + { + "epoch": 0.07953472187701943, + "grad_norm": 1.134704351425171, + "learning_rate": 0.0001993988394017817, + "loss": 0.21, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 800, + "tokens_per_second_per_gpu": 376.02 + }, + { + "epoch": 0.08052890590048217, + "grad_norm": 0.4009522795677185, + "learning_rate": 0.00019938153365391595, + "loss": 0.3189, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 810, + "tokens_per_second_per_gpu": 409.87 + }, + { + "epoch": 0.08152308992394493, + "grad_norm": 0.32234618067741394, + "learning_rate": 0.00019936398310832802, + "loss": 0.3242, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 820, + "tokens_per_second_per_gpu": 429.4 + }, + { + "epoch": 0.08251727394740767, + "grad_norm": 0.42835402488708496, + "learning_rate": 0.00019934618780824865, + "loss": 0.2646, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.4, + "memory/max_allocated (GiB)": 21.4, + "step": 830, + "tokens_per_second_per_gpu": 334.03 + }, + { + "epoch": 0.08351145797087041, + "grad_norm": 0.39109423756599426, + "learning_rate": 0.00019932814779751143, + "loss": 0.2891, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 840, + "tokens_per_second_per_gpu": 386.65 + }, + { + "epoch": 0.08450564199433315, + "grad_norm": 0.30428260564804077, + "learning_rate": 0.00019930986312055268, + "loss": 0.2478, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 850, + "tokens_per_second_per_gpu": 344.59 + }, + { + "epoch": 0.08549982601779589, + "grad_norm": 0.41264912486076355, + "learning_rate": 0.00019929133382241146, + "loss": 0.2942, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 860, + "tokens_per_second_per_gpu": 315.94 + }, + { + "epoch": 0.08649401004125863, + "grad_norm": 0.3414939045906067, + "learning_rate": 0.00019927255994872932, + "loss": 0.2403, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.62, + "memory/max_allocated (GiB)": 20.62, + "step": 870, + "tokens_per_second_per_gpu": 373.66 + }, + { + "epoch": 0.08748819406472139, + "grad_norm": 0.3265244662761688, + "learning_rate": 0.00019925354154575028, + "loss": 0.2024, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 880, + "tokens_per_second_per_gpu": 304.2 + }, + { + "epoch": 0.08848237808818413, + "grad_norm": 0.45628246665000916, + "learning_rate": 0.00019923427866032074, + "loss": 0.2319, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 890, + "tokens_per_second_per_gpu": 361.11 + }, + { + "epoch": 0.08947656211164687, + "grad_norm": 0.15768177807331085, + "learning_rate": 0.00019921477133988917, + "loss": 0.2944, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 900, + "tokens_per_second_per_gpu": 342.86 + }, + { + "epoch": 0.09047074613510961, + "grad_norm": 0.31603825092315674, + "learning_rate": 0.0001991950196325063, + "loss": 0.2697, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 910, + "tokens_per_second_per_gpu": 366.81 + }, + { + "epoch": 0.09146493015857235, + "grad_norm": 0.30974528193473816, + "learning_rate": 0.00019917502358682474, + "loss": 0.2915, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.23, + "memory/max_allocated (GiB)": 19.23, + "step": 920, + "tokens_per_second_per_gpu": 333.81 + }, + { + "epoch": 0.09245911418203509, + "grad_norm": 0.5116154551506042, + "learning_rate": 0.00019915478325209892, + "loss": 0.2984, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 930, + "tokens_per_second_per_gpu": 420.26 + }, + { + "epoch": 0.09345329820549783, + "grad_norm": 0.4105079174041748, + "learning_rate": 0.00019913429867818517, + "loss": 0.2456, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 940, + "tokens_per_second_per_gpu": 297.2 + }, + { + "epoch": 0.09444748222896059, + "grad_norm": 0.32649049162864685, + "learning_rate": 0.00019911356991554122, + "loss": 0.2974, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 950, + "tokens_per_second_per_gpu": 395.2 + }, + { + "epoch": 0.09544166625242333, + "grad_norm": 0.3354904353618622, + "learning_rate": 0.00019909259701522645, + "loss": 0.2627, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 960, + "tokens_per_second_per_gpu": 324.97 + }, + { + "epoch": 0.09643585027588607, + "grad_norm": 0.33404502272605896, + "learning_rate": 0.00019907138002890154, + "loss": 0.2386, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 970, + "tokens_per_second_per_gpu": 317.18 + }, + { + "epoch": 0.09743003429934881, + "grad_norm": 0.3161865472793579, + "learning_rate": 0.0001990499190088284, + "loss": 0.2362, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 980, + "tokens_per_second_per_gpu": 310.78 + }, + { + "epoch": 0.09842421832281155, + "grad_norm": 0.33621177077293396, + "learning_rate": 0.00019902821400787004, + "loss": 0.2792, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 990, + "tokens_per_second_per_gpu": 333.17 + }, + { + "epoch": 0.09941840234627429, + "grad_norm": 0.09930042922496796, + "learning_rate": 0.00019900626507949053, + "loss": 0.2622, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 1000, + "tokens_per_second_per_gpu": 365.14 + }, + { + "epoch": 0.10041258636973704, + "grad_norm": 0.3367346525192261, + "learning_rate": 0.00019898407227775464, + "loss": 0.2214, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.62, + "memory/max_allocated (GiB)": 20.62, + "step": 1010, + "tokens_per_second_per_gpu": 342.03 + }, + { + "epoch": 0.10140677039319979, + "grad_norm": 0.35485416650772095, + "learning_rate": 0.00019896163565732798, + "loss": 0.3446, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 1020, + "tokens_per_second_per_gpu": 378.53 + }, + { + "epoch": 0.10240095441666253, + "grad_norm": 0.31839531660079956, + "learning_rate": 0.0001989389552734767, + "loss": 0.2519, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 1030, + "tokens_per_second_per_gpu": 390.74 + }, + { + "epoch": 0.10339513844012527, + "grad_norm": 0.36318239569664, + "learning_rate": 0.0001989160311820673, + "loss": 0.3, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 1040, + "tokens_per_second_per_gpu": 404.8 + }, + { + "epoch": 0.10438932246358801, + "grad_norm": 0.20552317798137665, + "learning_rate": 0.00019889286343956677, + "loss": 0.2531, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 1050, + "tokens_per_second_per_gpu": 349.7 + }, + { + "epoch": 0.10538350648705075, + "grad_norm": 0.4065081477165222, + "learning_rate": 0.00019886945210304208, + "loss": 0.3196, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 1060, + "tokens_per_second_per_gpu": 356.46 + }, + { + "epoch": 0.10637769051051349, + "grad_norm": 0.3974571228027344, + "learning_rate": 0.00019884579723016037, + "loss": 0.2585, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 1070, + "tokens_per_second_per_gpu": 361.16 + }, + { + "epoch": 0.10737187453397624, + "grad_norm": 0.39827367663383484, + "learning_rate": 0.0001988218988791885, + "loss": 0.2807, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 1080, + "tokens_per_second_per_gpu": 399.5 + }, + { + "epoch": 0.10836605855743899, + "grad_norm": 0.38661155104637146, + "learning_rate": 0.00019879775710899322, + "loss": 0.262, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.81, + "memory/max_allocated (GiB)": 19.81, + "step": 1090, + "tokens_per_second_per_gpu": 311.89 + }, + { + "epoch": 0.10936024258090173, + "grad_norm": 0.272942453622818, + "learning_rate": 0.0001987733719790408, + "loss": 0.1925, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 1100, + "tokens_per_second_per_gpu": 399.72 + }, + { + "epoch": 0.11035442660436447, + "grad_norm": 0.32272958755493164, + "learning_rate": 0.00019874874354939697, + "loss": 0.1643, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 1110, + "tokens_per_second_per_gpu": 315.4 + }, + { + "epoch": 0.11134861062782721, + "grad_norm": 0.3583936095237732, + "learning_rate": 0.00019872387188072673, + "loss": 0.2834, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 1120, + "tokens_per_second_per_gpu": 439.26 + }, + { + "epoch": 0.11234279465128995, + "grad_norm": 0.3295114040374756, + "learning_rate": 0.00019869875703429433, + "loss": 0.2157, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 1130, + "tokens_per_second_per_gpu": 290.71 + }, + { + "epoch": 0.1133369786747527, + "grad_norm": 0.21794943511486053, + "learning_rate": 0.00019867339907196283, + "loss": 0.2848, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 1140, + "tokens_per_second_per_gpu": 382.48 + }, + { + "epoch": 0.11433116269821544, + "grad_norm": 0.47928282618522644, + "learning_rate": 0.00019864779805619435, + "loss": 0.2497, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 1150, + "tokens_per_second_per_gpu": 351.65 + }, + { + "epoch": 0.11532534672167818, + "grad_norm": 0.4386768341064453, + "learning_rate": 0.0001986219540500496, + "loss": 0.2885, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 1160, + "tokens_per_second_per_gpu": 351.08 + }, + { + "epoch": 0.11631953074514093, + "grad_norm": 0.46047040820121765, + "learning_rate": 0.00019859586711718776, + "loss": 0.325, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 1170, + "tokens_per_second_per_gpu": 375.58 + }, + { + "epoch": 0.11731371476860367, + "grad_norm": 0.4252080023288727, + "learning_rate": 0.00019856953732186653, + "loss": 0.2923, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 1180, + "tokens_per_second_per_gpu": 349.36 + }, + { + "epoch": 0.11830789879206641, + "grad_norm": 0.3643350899219513, + "learning_rate": 0.00019854296472894168, + "loss": 0.2315, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 1190, + "tokens_per_second_per_gpu": 390.28 + }, + { + "epoch": 0.11930208281552915, + "grad_norm": 0.4747346341609955, + "learning_rate": 0.00019851614940386722, + "loss": 0.3214, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.19, + "memory/max_allocated (GiB)": 19.19, + "step": 1200, + "tokens_per_second_per_gpu": 414.56 + }, + { + "epoch": 0.1202962668389919, + "grad_norm": 0.32468438148498535, + "learning_rate": 0.0001984890914126949, + "loss": 0.2606, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 1210, + "tokens_per_second_per_gpu": 393.59 + }, + { + "epoch": 0.12129045086245464, + "grad_norm": 0.27018123865127563, + "learning_rate": 0.00019846179082207429, + "loss": 0.2457, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 1220, + "tokens_per_second_per_gpu": 357.71 + }, + { + "epoch": 0.12228463488591738, + "grad_norm": 0.39820268750190735, + "learning_rate": 0.00019843424769925248, + "loss": 0.2618, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.4, + "memory/max_allocated (GiB)": 21.4, + "step": 1230, + "tokens_per_second_per_gpu": 377.03 + }, + { + "epoch": 0.12327881890938013, + "grad_norm": 0.4186467230319977, + "learning_rate": 0.00019840646211207407, + "loss": 0.2864, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 1240, + "tokens_per_second_per_gpu": 415.58 + }, + { + "epoch": 0.12427300293284287, + "grad_norm": 0.3047218918800354, + "learning_rate": 0.00019837843412898081, + "loss": 0.1777, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 1250, + "tokens_per_second_per_gpu": 376.07 + }, + { + "epoch": 0.12526718695630562, + "grad_norm": 0.3663698136806488, + "learning_rate": 0.0001983501638190115, + "loss": 0.2906, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 1260, + "tokens_per_second_per_gpu": 341.64 + }, + { + "epoch": 0.12626137097976836, + "grad_norm": 0.5897945761680603, + "learning_rate": 0.00019832165125180194, + "loss": 0.2498, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 1270, + "tokens_per_second_per_gpu": 380.54 + }, + { + "epoch": 0.1272555550032311, + "grad_norm": 0.40836209058761597, + "learning_rate": 0.0001982928964975846, + "loss": 0.2722, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 1280, + "tokens_per_second_per_gpu": 387.55 + }, + { + "epoch": 0.12824973902669384, + "grad_norm": 0.33597612380981445, + "learning_rate": 0.00019826389962718848, + "loss": 0.3202, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 1290, + "tokens_per_second_per_gpu": 313.03 + }, + { + "epoch": 0.12924392305015658, + "grad_norm": 0.44784456491470337, + "learning_rate": 0.00019823466071203902, + "loss": 0.2949, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 1300, + "tokens_per_second_per_gpu": 390.03 + }, + { + "epoch": 0.13023810707361932, + "grad_norm": 0.3199595510959625, + "learning_rate": 0.0001982051798241579, + "loss": 0.2323, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 1310, + "tokens_per_second_per_gpu": 415.8 + }, + { + "epoch": 0.13123229109708207, + "grad_norm": 0.4944785535335541, + "learning_rate": 0.0001981754570361627, + "loss": 0.291, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 1320, + "tokens_per_second_per_gpu": 347.08 + }, + { + "epoch": 0.1322264751205448, + "grad_norm": 0.379162073135376, + "learning_rate": 0.00019814549242126698, + "loss": 0.2631, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 1330, + "tokens_per_second_per_gpu": 405.3 + }, + { + "epoch": 0.13322065914400755, + "grad_norm": 0.20690025389194489, + "learning_rate": 0.00019811528605327992, + "loss": 0.2099, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 1340, + "tokens_per_second_per_gpu": 377.71 + }, + { + "epoch": 0.1342148431674703, + "grad_norm": 0.39738351106643677, + "learning_rate": 0.00019808483800660612, + "loss": 0.2486, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 1350, + "tokens_per_second_per_gpu": 333.2 + }, + { + "epoch": 0.13520902719093303, + "grad_norm": 0.5237305164337158, + "learning_rate": 0.00019805414835624566, + "loss": 0.2407, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 1360, + "tokens_per_second_per_gpu": 353.94 + }, + { + "epoch": 0.1362032112143958, + "grad_norm": 0.2773837447166443, + "learning_rate": 0.00019802321717779354, + "loss": 0.3119, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 1370, + "tokens_per_second_per_gpu": 391.05 + }, + { + "epoch": 0.13719739523785854, + "grad_norm": 0.2825298011302948, + "learning_rate": 0.00019799204454743987, + "loss": 0.2812, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 1380, + "tokens_per_second_per_gpu": 348.62 + }, + { + "epoch": 0.13819157926132128, + "grad_norm": 0.3622908592224121, + "learning_rate": 0.00019796063054196937, + "loss": 0.2506, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 1390, + "tokens_per_second_per_gpu": 380.11 + }, + { + "epoch": 0.13918576328478402, + "grad_norm": 0.3992385268211365, + "learning_rate": 0.0001979289752387614, + "loss": 0.2132, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 1400, + "tokens_per_second_per_gpu": 301.04 + }, + { + "epoch": 0.14017994730824676, + "grad_norm": 0.4148050546646118, + "learning_rate": 0.00019789707871578966, + "loss": 0.1813, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 1410, + "tokens_per_second_per_gpu": 296.18 + }, + { + "epoch": 0.1411741313317095, + "grad_norm": 0.36811864376068115, + "learning_rate": 0.000197864941051622, + "loss": 0.2607, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 1420, + "tokens_per_second_per_gpu": 376.48 + }, + { + "epoch": 0.14216831535517224, + "grad_norm": 0.33353865146636963, + "learning_rate": 0.00019783256232542033, + "loss": 0.2694, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 1430, + "tokens_per_second_per_gpu": 352.36 + }, + { + "epoch": 0.14316249937863498, + "grad_norm": 0.4390527606010437, + "learning_rate": 0.00019779994261694025, + "loss": 0.2851, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 1440, + "tokens_per_second_per_gpu": 332.85 + }, + { + "epoch": 0.14415668340209772, + "grad_norm": 0.4553990066051483, + "learning_rate": 0.00019776708200653102, + "loss": 0.3301, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 1450, + "tokens_per_second_per_gpu": 400.73 + }, + { + "epoch": 0.14515086742556046, + "grad_norm": 0.3526112139225006, + "learning_rate": 0.00019773398057513526, + "loss": 0.2276, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.53, + "memory/max_allocated (GiB)": 21.53, + "step": 1460, + "tokens_per_second_per_gpu": 342.73 + }, + { + "epoch": 0.1461450514490232, + "grad_norm": 0.4758242070674896, + "learning_rate": 0.0001977006384042888, + "loss": 0.2185, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 1470, + "tokens_per_second_per_gpu": 343.6 + }, + { + "epoch": 0.14713923547248595, + "grad_norm": 0.4020686447620392, + "learning_rate": 0.00019766705557612045, + "loss": 0.2598, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.53, + "memory/max_allocated (GiB)": 21.53, + "step": 1480, + "tokens_per_second_per_gpu": 288.04 + }, + { + "epoch": 0.1481334194959487, + "grad_norm": 0.44152265787124634, + "learning_rate": 0.00019763323217335182, + "loss": 0.3394, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 1490, + "tokens_per_second_per_gpu": 408.81 + }, + { + "epoch": 0.14912760351941146, + "grad_norm": 0.31458431482315063, + "learning_rate": 0.00019759916827929706, + "loss": 0.2692, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 1500, + "tokens_per_second_per_gpu": 359.64 + }, + { + "epoch": 0.1501217875428742, + "grad_norm": 0.48072609305381775, + "learning_rate": 0.0001975648639778628, + "loss": 0.3545, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 1510, + "tokens_per_second_per_gpu": 374.78 + }, + { + "epoch": 0.15111597156633694, + "grad_norm": 0.30275699496269226, + "learning_rate": 0.00019753031935354777, + "loss": 0.2109, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 1520, + "tokens_per_second_per_gpu": 371.06 + }, + { + "epoch": 0.15211015558979968, + "grad_norm": 0.5390923619270325, + "learning_rate": 0.00019749553449144267, + "loss": 0.2435, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 18.84, + "memory/max_allocated (GiB)": 18.84, + "step": 1530, + "tokens_per_second_per_gpu": 368.76 + }, + { + "epoch": 0.15310433961326242, + "grad_norm": 0.28221625089645386, + "learning_rate": 0.00019746050947722993, + "loss": 0.2105, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 1540, + "tokens_per_second_per_gpu": 322.88 + }, + { + "epoch": 0.15409852363672516, + "grad_norm": 0.3471927046775818, + "learning_rate": 0.00019742524439718363, + "loss": 0.2761, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 17.4, + "memory/max_allocated (GiB)": 17.4, + "step": 1550, + "tokens_per_second_per_gpu": 331.45 + }, + { + "epoch": 0.1550927076601879, + "grad_norm": 0.34601831436157227, + "learning_rate": 0.0001973897393381691, + "loss": 0.2419, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 1560, + "tokens_per_second_per_gpu": 330.91 + }, + { + "epoch": 0.15608689168365064, + "grad_norm": 0.4680122435092926, + "learning_rate": 0.00019735399438764275, + "loss": 0.2948, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 1570, + "tokens_per_second_per_gpu": 369.5 + }, + { + "epoch": 0.15708107570711338, + "grad_norm": 0.35631850361824036, + "learning_rate": 0.000197318009633652, + "loss": 0.2865, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 1580, + "tokens_per_second_per_gpu": 331.56 + }, + { + "epoch": 0.15807525973057612, + "grad_norm": 0.43517372012138367, + "learning_rate": 0.0001972817851648349, + "loss": 0.2912, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 1590, + "tokens_per_second_per_gpu": 334.38 + }, + { + "epoch": 0.15906944375403886, + "grad_norm": 0.3614802360534668, + "learning_rate": 0.00019724532107041995, + "loss": 0.2182, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.45, + "memory/max_allocated (GiB)": 20.45, + "step": 1600, + "tokens_per_second_per_gpu": 330.71 + }, + { + "epoch": 0.1600636277775016, + "grad_norm": 0.3124898672103882, + "learning_rate": 0.00019720861744022594, + "loss": 0.1887, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 1610, + "tokens_per_second_per_gpu": 419.4 + }, + { + "epoch": 0.16105781180096435, + "grad_norm": 0.37882000207901, + "learning_rate": 0.00019717167436466166, + "loss": 0.3199, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 1620, + "tokens_per_second_per_gpu": 352.77 + }, + { + "epoch": 0.16205199582442711, + "grad_norm": 0.47813880443573, + "learning_rate": 0.00019713449193472572, + "loss": 0.2644, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 1630, + "tokens_per_second_per_gpu": 328.31 + }, + { + "epoch": 0.16304617984788985, + "grad_norm": 0.4390937387943268, + "learning_rate": 0.00019709707024200633, + "loss": 0.2157, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 1640, + "tokens_per_second_per_gpu": 337.97 + }, + { + "epoch": 0.1640403638713526, + "grad_norm": 0.28492602705955505, + "learning_rate": 0.00019705940937868096, + "loss": 0.2301, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 1650, + "tokens_per_second_per_gpu": 377.43 + }, + { + "epoch": 0.16503454789481534, + "grad_norm": 0.41057339310646057, + "learning_rate": 0.00019702150943751636, + "loss": 0.2755, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 1660, + "tokens_per_second_per_gpu": 364.78 + }, + { + "epoch": 0.16602873191827808, + "grad_norm": 0.36004287004470825, + "learning_rate": 0.00019698337051186803, + "loss": 0.2254, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 1670, + "tokens_per_second_per_gpu": 348.57 + }, + { + "epoch": 0.16702291594174082, + "grad_norm": 0.36488133668899536, + "learning_rate": 0.00019694499269568022, + "loss": 0.2556, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 1680, + "tokens_per_second_per_gpu": 338.09 + }, + { + "epoch": 0.16801709996520356, + "grad_norm": 0.1386333703994751, + "learning_rate": 0.00019690637608348562, + "loss": 0.2765, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 1690, + "tokens_per_second_per_gpu": 279.22 + }, + { + "epoch": 0.1690112839886663, + "grad_norm": 0.30839794874191284, + "learning_rate": 0.00019686752077040505, + "loss": 0.2745, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.44, + "memory/max_allocated (GiB)": 20.44, + "step": 1700, + "tokens_per_second_per_gpu": 363.03 + }, + { + "epoch": 0.17000546801212904, + "grad_norm": 0.48986828327178955, + "learning_rate": 0.00019682842685214745, + "loss": 0.2415, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 1710, + "tokens_per_second_per_gpu": 321.69 + }, + { + "epoch": 0.17099965203559178, + "grad_norm": 0.25523653626441956, + "learning_rate": 0.00019678909442500937, + "loss": 0.2618, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 18.18, + "memory/max_allocated (GiB)": 18.18, + "step": 1720, + "tokens_per_second_per_gpu": 312.41 + }, + { + "epoch": 0.17199383605905452, + "grad_norm": 0.1990797519683838, + "learning_rate": 0.00019674952358587488, + "loss": 0.2569, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 1730, + "tokens_per_second_per_gpu": 338.48 + }, + { + "epoch": 0.17298802008251726, + "grad_norm": 1.2966935634613037, + "learning_rate": 0.00019670971443221528, + "loss": 0.2789, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 1740, + "tokens_per_second_per_gpu": 377.96 + }, + { + "epoch": 0.17398220410598, + "grad_norm": 0.4156191647052765, + "learning_rate": 0.00019666966706208898, + "loss": 0.2537, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 1750, + "tokens_per_second_per_gpu": 423.21 + }, + { + "epoch": 0.17497638812944277, + "grad_norm": 0.43119025230407715, + "learning_rate": 0.00019662938157414113, + "loss": 0.3316, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 1760, + "tokens_per_second_per_gpu": 385.14 + }, + { + "epoch": 0.1759705721529055, + "grad_norm": 0.4492398798465729, + "learning_rate": 0.00019658885806760336, + "loss": 0.2969, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 1770, + "tokens_per_second_per_gpu": 364.99 + }, + { + "epoch": 0.17696475617636825, + "grad_norm": 0.4664623737335205, + "learning_rate": 0.00019654809664229364, + "loss": 0.291, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 1780, + "tokens_per_second_per_gpu": 295.57 + }, + { + "epoch": 0.177958940199831, + "grad_norm": 0.46608006954193115, + "learning_rate": 0.000196507097398616, + "loss": 0.295, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 1790, + "tokens_per_second_per_gpu": 284.68 + }, + { + "epoch": 0.17895312422329374, + "grad_norm": 0.378513365983963, + "learning_rate": 0.00019646586043756023, + "loss": 0.2396, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 1800, + "tokens_per_second_per_gpu": 325.71 + }, + { + "epoch": 0.17994730824675648, + "grad_norm": 0.36179885268211365, + "learning_rate": 0.00019642438586070168, + "loss": 0.2364, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 1810, + "tokens_per_second_per_gpu": 349.38 + }, + { + "epoch": 0.18094149227021922, + "grad_norm": 0.31644207239151, + "learning_rate": 0.000196382673770201, + "loss": 0.223, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 1820, + "tokens_per_second_per_gpu": 320.59 + }, + { + "epoch": 0.18193567629368196, + "grad_norm": 0.27562472224235535, + "learning_rate": 0.00019634072426880382, + "loss": 0.2641, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 1830, + "tokens_per_second_per_gpu": 386.93 + }, + { + "epoch": 0.1829298603171447, + "grad_norm": 0.4514225423336029, + "learning_rate": 0.00019629853745984076, + "loss": 0.2167, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 1840, + "tokens_per_second_per_gpu": 350.66 + }, + { + "epoch": 0.18392404434060744, + "grad_norm": 0.43807438015937805, + "learning_rate": 0.00019625611344722675, + "loss": 0.2429, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 1850, + "tokens_per_second_per_gpu": 390.61 + }, + { + "epoch": 0.18491822836407018, + "grad_norm": 0.3817460536956787, + "learning_rate": 0.00019621345233546115, + "loss": 0.2565, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 1860, + "tokens_per_second_per_gpu": 366.89 + }, + { + "epoch": 0.18591241238753292, + "grad_norm": 0.51050865650177, + "learning_rate": 0.0001961705542296272, + "loss": 0.2815, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 1870, + "tokens_per_second_per_gpu": 314.07 + }, + { + "epoch": 0.18690659641099566, + "grad_norm": 0.34085533022880554, + "learning_rate": 0.00019612741923539218, + "loss": 0.2353, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 1880, + "tokens_per_second_per_gpu": 366.93 + }, + { + "epoch": 0.18790078043445843, + "grad_norm": 0.38903746008872986, + "learning_rate": 0.00019608404745900652, + "loss": 0.2679, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 1890, + "tokens_per_second_per_gpu": 407.24 + }, + { + "epoch": 0.18889496445792117, + "grad_norm": 0.3198868930339813, + "learning_rate": 0.00019604043900730414, + "loss": 0.299, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 1900, + "tokens_per_second_per_gpu": 420.81 + }, + { + "epoch": 0.1898891484813839, + "grad_norm": 0.43067699670791626, + "learning_rate": 0.0001959965939877019, + "loss": 0.2957, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 1910, + "tokens_per_second_per_gpu": 467.81 + }, + { + "epoch": 0.19088333250484665, + "grad_norm": 0.3319333493709564, + "learning_rate": 0.00019595251250819932, + "loss": 0.2512, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 17.12, + "memory/max_allocated (GiB)": 17.12, + "step": 1920, + "tokens_per_second_per_gpu": 283.54 + }, + { + "epoch": 0.1918775165283094, + "grad_norm": 0.399617999792099, + "learning_rate": 0.00019590819467737837, + "loss": 0.2627, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.53, + "memory/max_allocated (GiB)": 20.53, + "step": 1930, + "tokens_per_second_per_gpu": 314.92 + }, + { + "epoch": 0.19287170055177214, + "grad_norm": 0.31971487402915955, + "learning_rate": 0.00019586364060440332, + "loss": 0.2705, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.77, + "memory/max_allocated (GiB)": 19.77, + "step": 1940, + "tokens_per_second_per_gpu": 371.9 + }, + { + "epoch": 0.19386588457523488, + "grad_norm": 0.22907792031764984, + "learning_rate": 0.0001958188503990202, + "loss": 0.2479, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 1950, + "tokens_per_second_per_gpu": 329.97 + }, + { + "epoch": 0.19486006859869762, + "grad_norm": 0.3624865710735321, + "learning_rate": 0.00019577382417155676, + "loss": 0.309, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.53, + "memory/max_allocated (GiB)": 20.53, + "step": 1960, + "tokens_per_second_per_gpu": 466.41 + }, + { + "epoch": 0.19585425262216036, + "grad_norm": 0.3285991847515106, + "learning_rate": 0.00019572856203292215, + "loss": 0.2188, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 1970, + "tokens_per_second_per_gpu": 336.34 + }, + { + "epoch": 0.1968484366456231, + "grad_norm": 0.22952090203762054, + "learning_rate": 0.00019568306409460654, + "loss": 0.2277, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 1980, + "tokens_per_second_per_gpu": 377.15 + }, + { + "epoch": 0.19784262066908584, + "grad_norm": 0.38868236541748047, + "learning_rate": 0.000195637330468681, + "loss": 0.299, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.62, + "memory/max_allocated (GiB)": 20.62, + "step": 1990, + "tokens_per_second_per_gpu": 371.96 + }, + { + "epoch": 0.19883680469254858, + "grad_norm": 0.30058735609054565, + "learning_rate": 0.0001955913612677971, + "loss": 0.2993, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 2000, + "tokens_per_second_per_gpu": 418.93 + }, + { + "epoch": 0.19983098871601132, + "grad_norm": 0.25231263041496277, + "learning_rate": 0.00019554515660518668, + "loss": 0.2894, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 2010, + "tokens_per_second_per_gpu": 372.68 + }, + { + "epoch": 0.2008251727394741, + "grad_norm": 0.5621690154075623, + "learning_rate": 0.00019549871659466165, + "loss": 0.2152, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 2020, + "tokens_per_second_per_gpu": 358.38 + }, + { + "epoch": 0.20181935676293683, + "grad_norm": 0.3443728983402252, + "learning_rate": 0.0001954520413506135, + "loss": 0.2398, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 2030, + "tokens_per_second_per_gpu": 306.09 + }, + { + "epoch": 0.20281354078639957, + "grad_norm": 0.40964439511299133, + "learning_rate": 0.0001954051309880133, + "loss": 0.2259, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 2040, + "tokens_per_second_per_gpu": 371.31 + }, + { + "epoch": 0.2038077248098623, + "grad_norm": 0.49535176157951355, + "learning_rate": 0.0001953579856224111, + "loss": 0.2681, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 2050, + "tokens_per_second_per_gpu": 368.85 + }, + { + "epoch": 0.20480190883332505, + "grad_norm": 0.4733733832836151, + "learning_rate": 0.00019531060536993598, + "loss": 0.2309, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.47, + "memory/max_allocated (GiB)": 21.47, + "step": 2060, + "tokens_per_second_per_gpu": 378.99 + }, + { + "epoch": 0.2057960928567878, + "grad_norm": 0.4018077552318573, + "learning_rate": 0.00019526299034729544, + "loss": 0.2717, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 2070, + "tokens_per_second_per_gpu": 373.89 + }, + { + "epoch": 0.20679027688025053, + "grad_norm": 0.18390242755413055, + "learning_rate": 0.0001952151406717754, + "loss": 0.2604, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 2080, + "tokens_per_second_per_gpu": 339.07 + }, + { + "epoch": 0.20778446090371328, + "grad_norm": 0.31284740567207336, + "learning_rate": 0.0001951670564612397, + "loss": 0.2528, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 18.17, + "memory/max_allocated (GiB)": 18.17, + "step": 2090, + "tokens_per_second_per_gpu": 358.01 + }, + { + "epoch": 0.20877864492717602, + "grad_norm": 0.1479184776544571, + "learning_rate": 0.0001951187378341299, + "loss": 0.2753, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 2100, + "tokens_per_second_per_gpu": 373.8 + }, + { + "epoch": 0.20977282895063876, + "grad_norm": 0.35812532901763916, + "learning_rate": 0.00019507018490946503, + "loss": 0.2799, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 2110, + "tokens_per_second_per_gpu": 394.97 + }, + { + "epoch": 0.2107670129741015, + "grad_norm": 0.39899322390556335, + "learning_rate": 0.00019502139780684118, + "loss": 0.2785, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 2120, + "tokens_per_second_per_gpu": 371.01 + }, + { + "epoch": 0.21176119699756424, + "grad_norm": 0.30708786845207214, + "learning_rate": 0.00019497237664643132, + "loss": 0.2985, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 2130, + "tokens_per_second_per_gpu": 393.88 + }, + { + "epoch": 0.21275538102102698, + "grad_norm": 0.280734658241272, + "learning_rate": 0.00019492312154898488, + "loss": 0.2661, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 2140, + "tokens_per_second_per_gpu": 401.57 + }, + { + "epoch": 0.21374956504448975, + "grad_norm": 0.19114673137664795, + "learning_rate": 0.00019487363263582765, + "loss": 0.2197, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 2150, + "tokens_per_second_per_gpu": 346.18 + }, + { + "epoch": 0.2147437490679525, + "grad_norm": 0.5506372451782227, + "learning_rate": 0.00019482391002886122, + "loss": 0.2724, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 2160, + "tokens_per_second_per_gpu": 281.19 + }, + { + "epoch": 0.21573793309141523, + "grad_norm": 0.3187256157398224, + "learning_rate": 0.0001947739538505629, + "loss": 0.2758, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 2170, + "tokens_per_second_per_gpu": 405.51 + }, + { + "epoch": 0.21673211711487797, + "grad_norm": 0.4344545602798462, + "learning_rate": 0.00019472376422398528, + "loss": 0.2792, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 2180, + "tokens_per_second_per_gpu": 313.61 + }, + { + "epoch": 0.2177263011383407, + "grad_norm": 0.4564478099346161, + "learning_rate": 0.00019467334127275606, + "loss": 0.2474, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.43, + "memory/max_allocated (GiB)": 20.43, + "step": 2190, + "tokens_per_second_per_gpu": 328.19 + }, + { + "epoch": 0.21872048516180345, + "grad_norm": 0.25979048013687134, + "learning_rate": 0.00019462268512107766, + "loss": 0.2877, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 2200, + "tokens_per_second_per_gpu": 360.33 + }, + { + "epoch": 0.2197146691852662, + "grad_norm": 0.3676232397556305, + "learning_rate": 0.00019457179589372684, + "loss": 0.3336, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 2210, + "tokens_per_second_per_gpu": 368.21 + }, + { + "epoch": 0.22070885320872893, + "grad_norm": 0.7130278944969177, + "learning_rate": 0.0001945206737160545, + "loss": 0.253, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 2220, + "tokens_per_second_per_gpu": 305.12 + }, + { + "epoch": 0.22170303723219167, + "grad_norm": 2.660079002380371, + "learning_rate": 0.0001944693187139854, + "loss": 0.3069, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 2230, + "tokens_per_second_per_gpu": 337.93 + }, + { + "epoch": 0.22269722125565442, + "grad_norm": 0.2822214663028717, + "learning_rate": 0.00019441773101401777, + "loss": 0.2744, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 2240, + "tokens_per_second_per_gpu": 349.76 + }, + { + "epoch": 0.22369140527911716, + "grad_norm": 0.45128756761550903, + "learning_rate": 0.00019436591074322302, + "loss": 0.245, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.44, + "memory/max_allocated (GiB)": 20.44, + "step": 2250, + "tokens_per_second_per_gpu": 390.72 + }, + { + "epoch": 0.2246855893025799, + "grad_norm": 0.2821448743343353, + "learning_rate": 0.00019431385802924539, + "loss": 0.2625, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 2260, + "tokens_per_second_per_gpu": 321.11 + }, + { + "epoch": 0.22567977332604264, + "grad_norm": 0.3787819445133209, + "learning_rate": 0.00019426157300030176, + "loss": 0.2116, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 2270, + "tokens_per_second_per_gpu": 345.13 + }, + { + "epoch": 0.2266739573495054, + "grad_norm": 0.36587730050086975, + "learning_rate": 0.0001942090557851812, + "loss": 0.2618, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 2280, + "tokens_per_second_per_gpu": 288.26 + }, + { + "epoch": 0.22766814137296815, + "grad_norm": 0.5559653639793396, + "learning_rate": 0.0001941563065132447, + "loss": 0.307, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 2290, + "tokens_per_second_per_gpu": 402.27 + }, + { + "epoch": 0.2286623253964309, + "grad_norm": 0.26463228464126587, + "learning_rate": 0.0001941033253144249, + "loss": 0.2583, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 2300, + "tokens_per_second_per_gpu": 388.95 + }, + { + "epoch": 0.22965650941989363, + "grad_norm": 0.330695778131485, + "learning_rate": 0.0001940501123192256, + "loss": 0.2671, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 2310, + "tokens_per_second_per_gpu": 353.27 + }, + { + "epoch": 0.23065069344335637, + "grad_norm": 0.3159751892089844, + "learning_rate": 0.00019399666765872176, + "loss": 0.2023, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 2320, + "tokens_per_second_per_gpu": 347.05 + }, + { + "epoch": 0.2316448774668191, + "grad_norm": 0.4762006103992462, + "learning_rate": 0.0001939429914645588, + "loss": 0.2633, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 2330, + "tokens_per_second_per_gpu": 292.93 + }, + { + "epoch": 0.23263906149028185, + "grad_norm": 0.47802531719207764, + "learning_rate": 0.00019388908386895254, + "loss": 0.2381, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 2340, + "tokens_per_second_per_gpu": 366.87 + }, + { + "epoch": 0.2336332455137446, + "grad_norm": 0.4501783847808838, + "learning_rate": 0.00019383494500468883, + "loss": 0.3052, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 2350, + "tokens_per_second_per_gpu": 412.56 + }, + { + "epoch": 0.23462742953720733, + "grad_norm": 0.3937671184539795, + "learning_rate": 0.0001937805750051231, + "loss": 0.2495, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 2360, + "tokens_per_second_per_gpu": 374.59 + }, + { + "epoch": 0.23562161356067007, + "grad_norm": 0.2139206975698471, + "learning_rate": 0.00019372597400418019, + "loss": 0.1679, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 2370, + "tokens_per_second_per_gpu": 329.45 + }, + { + "epoch": 0.23661579758413281, + "grad_norm": 0.4658704102039337, + "learning_rate": 0.00019367114213635382, + "loss": 0.2242, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 2380, + "tokens_per_second_per_gpu": 302.96 + }, + { + "epoch": 0.23760998160759556, + "grad_norm": 0.34023401141166687, + "learning_rate": 0.00019361607953670654, + "loss": 0.2632, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 2390, + "tokens_per_second_per_gpu": 387.14 + }, + { + "epoch": 0.2386041656310583, + "grad_norm": 0.4804230034351349, + "learning_rate": 0.00019356078634086914, + "loss": 0.251, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 2400, + "tokens_per_second_per_gpu": 410.6 + }, + { + "epoch": 0.23959834965452106, + "grad_norm": 0.32980307936668396, + "learning_rate": 0.00019350526268504048, + "loss": 0.176, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 2410, + "tokens_per_second_per_gpu": 372.52 + }, + { + "epoch": 0.2405925336779838, + "grad_norm": 0.25920480489730835, + "learning_rate": 0.00019344950870598703, + "loss": 0.2976, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.83, + "memory/max_allocated (GiB)": 19.83, + "step": 2420, + "tokens_per_second_per_gpu": 330.47 + }, + { + "epoch": 0.24158671770144655, + "grad_norm": 0.455616295337677, + "learning_rate": 0.00019339352454104264, + "loss": 0.2976, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 2430, + "tokens_per_second_per_gpu": 340.47 + }, + { + "epoch": 0.2425809017249093, + "grad_norm": 0.3018989562988281, + "learning_rate": 0.00019333731032810812, + "loss": 0.2732, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 2440, + "tokens_per_second_per_gpu": 462.35 + }, + { + "epoch": 0.24357508574837203, + "grad_norm": 0.4818798303604126, + "learning_rate": 0.00019328086620565095, + "loss": 0.2886, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 2450, + "tokens_per_second_per_gpu": 323.25 + }, + { + "epoch": 0.24456926977183477, + "grad_norm": 0.4339105188846588, + "learning_rate": 0.000193224192312705, + "loss": 0.3325, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 2460, + "tokens_per_second_per_gpu": 401.06 + }, + { + "epoch": 0.2455634537952975, + "grad_norm": 0.2657606303691864, + "learning_rate": 0.00019316728878887, + "loss": 0.2369, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 2470, + "tokens_per_second_per_gpu": 344.52 + }, + { + "epoch": 0.24655763781876025, + "grad_norm": 0.2487681657075882, + "learning_rate": 0.0001931101557743113, + "loss": 0.2481, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 2480, + "tokens_per_second_per_gpu": 338.75 + }, + { + "epoch": 0.247551821842223, + "grad_norm": 0.4219423532485962, + "learning_rate": 0.0001930527934097597, + "loss": 0.2467, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.23, + "memory/max_allocated (GiB)": 19.23, + "step": 2490, + "tokens_per_second_per_gpu": 357.65 + }, + { + "epoch": 0.24854600586568573, + "grad_norm": 0.3535912334918976, + "learning_rate": 0.00019299520183651075, + "loss": 0.2844, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 2500, + "tokens_per_second_per_gpu": 341.4 + }, + { + "epoch": 0.24954018988914847, + "grad_norm": 0.28509521484375, + "learning_rate": 0.0001929373811964247, + "loss": 0.2335, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 2510, + "tokens_per_second_per_gpu": 325.61 + }, + { + "epoch": 0.25053437391261124, + "grad_norm": 0.540902853012085, + "learning_rate": 0.00019287933163192602, + "loss": 0.314, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 2520, + "tokens_per_second_per_gpu": 390.5 + }, + { + "epoch": 0.251528557936074, + "grad_norm": 0.2346281111240387, + "learning_rate": 0.00019282105328600303, + "loss": 0.2592, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 17.12, + "memory/max_allocated (GiB)": 17.12, + "step": 2530, + "tokens_per_second_per_gpu": 335.14 + }, + { + "epoch": 0.2525227419595367, + "grad_norm": 0.4211244583129883, + "learning_rate": 0.0001927625463022076, + "loss": 0.2119, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 2540, + "tokens_per_second_per_gpu": 360.45 + }, + { + "epoch": 0.25351692598299946, + "grad_norm": 0.5125908851623535, + "learning_rate": 0.00019270381082465483, + "loss": 0.2628, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 2550, + "tokens_per_second_per_gpu": 316.73 + }, + { + "epoch": 0.2545111100064622, + "grad_norm": 0.20704445242881775, + "learning_rate": 0.00019264484699802262, + "loss": 0.2393, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 2560, + "tokens_per_second_per_gpu": 352.58 + }, + { + "epoch": 0.25550529402992495, + "grad_norm": 0.2666438817977905, + "learning_rate": 0.00019258565496755128, + "loss": 0.1621, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 2570, + "tokens_per_second_per_gpu": 346.34 + }, + { + "epoch": 0.2564994780533877, + "grad_norm": 0.4072653353214264, + "learning_rate": 0.00019252623487904335, + "loss": 0.2066, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 17.11, + "memory/max_allocated (GiB)": 17.11, + "step": 2580, + "tokens_per_second_per_gpu": 360.03 + }, + { + "epoch": 0.2574936620768504, + "grad_norm": 0.517437219619751, + "learning_rate": 0.00019246658687886302, + "loss": 0.256, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 2590, + "tokens_per_second_per_gpu": 335.1 + }, + { + "epoch": 0.25848784610031317, + "grad_norm": 1.1084229946136475, + "learning_rate": 0.00019240671111393597, + "loss": 0.2437, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 2600, + "tokens_per_second_per_gpu": 336.03 + }, + { + "epoch": 0.2594820301237759, + "grad_norm": 0.3532284200191498, + "learning_rate": 0.00019234660773174883, + "loss": 0.2102, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 2610, + "tokens_per_second_per_gpu": 304.29 + }, + { + "epoch": 0.26047621414723865, + "grad_norm": 0.4219340980052948, + "learning_rate": 0.00019228627688034898, + "loss": 0.3338, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.61, + "memory/max_allocated (GiB)": 20.61, + "step": 2620, + "tokens_per_second_per_gpu": 350.23 + }, + { + "epoch": 0.2614703981707014, + "grad_norm": 0.3215112090110779, + "learning_rate": 0.000192225718708344, + "loss": 0.2466, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 2630, + "tokens_per_second_per_gpu": 369.51 + }, + { + "epoch": 0.26246458219416413, + "grad_norm": 0.36818957328796387, + "learning_rate": 0.00019216493336490152, + "loss": 0.2839, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 2640, + "tokens_per_second_per_gpu": 369.73 + }, + { + "epoch": 0.26345876621762687, + "grad_norm": 0.41190099716186523, + "learning_rate": 0.0001921039209997486, + "loss": 0.2231, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 2650, + "tokens_per_second_per_gpu": 334.38 + }, + { + "epoch": 0.2644529502410896, + "grad_norm": 0.3659015893936157, + "learning_rate": 0.0001920426817631717, + "loss": 0.162, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 2660, + "tokens_per_second_per_gpu": 336.11 + }, + { + "epoch": 0.26544713426455235, + "grad_norm": 0.13166293501853943, + "learning_rate": 0.00019198121580601596, + "loss": 0.2313, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.44, + "memory/max_allocated (GiB)": 20.44, + "step": 2670, + "tokens_per_second_per_gpu": 341.25 + }, + { + "epoch": 0.2664413182880151, + "grad_norm": 0.3052745759487152, + "learning_rate": 0.00019191952327968497, + "loss": 0.2887, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 2680, + "tokens_per_second_per_gpu": 390.54 + }, + { + "epoch": 0.26743550231147784, + "grad_norm": 0.3094649910926819, + "learning_rate": 0.00019185760433614054, + "loss": 0.2272, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 2690, + "tokens_per_second_per_gpu": 371.44 + }, + { + "epoch": 0.2684296863349406, + "grad_norm": 4.753208160400391, + "learning_rate": 0.00019179545912790207, + "loss": 0.2826, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 2700, + "tokens_per_second_per_gpu": 374.86 + }, + { + "epoch": 0.2694238703584033, + "grad_norm": 0.3265855014324188, + "learning_rate": 0.00019173308780804637, + "loss": 0.2372, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 2710, + "tokens_per_second_per_gpu": 320.39 + }, + { + "epoch": 0.27041805438186606, + "grad_norm": 0.3575897514820099, + "learning_rate": 0.00019167049053020712, + "loss": 0.247, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 2720, + "tokens_per_second_per_gpu": 312.17 + }, + { + "epoch": 0.27141223840532885, + "grad_norm": 0.486605703830719, + "learning_rate": 0.00019160766744857476, + "loss": 0.2732, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 2730, + "tokens_per_second_per_gpu": 388.44 + }, + { + "epoch": 0.2724064224287916, + "grad_norm": 0.3815803527832031, + "learning_rate": 0.00019154461871789572, + "loss": 0.2733, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.76, + "memory/max_allocated (GiB)": 19.76, + "step": 2740, + "tokens_per_second_per_gpu": 338.38 + }, + { + "epoch": 0.27340060645225434, + "grad_norm": 0.3805699050426483, + "learning_rate": 0.0001914813444934724, + "loss": 0.2912, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.67, + "memory/max_allocated (GiB)": 19.67, + "step": 2750, + "tokens_per_second_per_gpu": 385.54 + }, + { + "epoch": 0.2743947904757171, + "grad_norm": 0.5376086831092834, + "learning_rate": 0.00019141784493116254, + "loss": 0.3009, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 2760, + "tokens_per_second_per_gpu": 392.53 + }, + { + "epoch": 0.2753889744991798, + "grad_norm": 0.2709028720855713, + "learning_rate": 0.000191354120187379, + "loss": 0.3044, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 2770, + "tokens_per_second_per_gpu": 363.03 + }, + { + "epoch": 0.27638315852264256, + "grad_norm": 0.24123673141002655, + "learning_rate": 0.00019129017041908934, + "loss": 0.2744, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 2780, + "tokens_per_second_per_gpu": 351.75 + }, + { + "epoch": 0.2773773425461053, + "grad_norm": 0.4202309548854828, + "learning_rate": 0.00019122599578381532, + "loss": 0.2764, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 2790, + "tokens_per_second_per_gpu": 387.68 + }, + { + "epoch": 0.27837152656956804, + "grad_norm": 0.42621132731437683, + "learning_rate": 0.00019116159643963262, + "loss": 0.245, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 2800, + "tokens_per_second_per_gpu": 351.56 + }, + { + "epoch": 0.2793657105930308, + "grad_norm": 0.30440858006477356, + "learning_rate": 0.00019109697254517048, + "loss": 0.2809, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 2810, + "tokens_per_second_per_gpu": 382.5 + }, + { + "epoch": 0.2803598946164935, + "grad_norm": 0.49908211827278137, + "learning_rate": 0.00019103212425961111, + "loss": 0.319, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 2820, + "tokens_per_second_per_gpu": 402.18 + }, + { + "epoch": 0.28135407863995626, + "grad_norm": 0.38030481338500977, + "learning_rate": 0.00019096705174268967, + "loss": 0.2392, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 2830, + "tokens_per_second_per_gpu": 319.29 + }, + { + "epoch": 0.282348262663419, + "grad_norm": 0.4159802198410034, + "learning_rate": 0.00019090175515469344, + "loss": 0.2962, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 2840, + "tokens_per_second_per_gpu": 354.51 + }, + { + "epoch": 0.28334244668688174, + "grad_norm": 0.47257235646247864, + "learning_rate": 0.00019083623465646172, + "loss": 0.2523, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 2850, + "tokens_per_second_per_gpu": 336.31 + }, + { + "epoch": 0.2843366307103445, + "grad_norm": 0.2591971158981323, + "learning_rate": 0.0001907704904093854, + "loss": 0.2605, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 2860, + "tokens_per_second_per_gpu": 386.6 + }, + { + "epoch": 0.2853308147338072, + "grad_norm": 0.3341462314128876, + "learning_rate": 0.00019070452257540638, + "loss": 0.3024, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 2870, + "tokens_per_second_per_gpu": 389.19 + }, + { + "epoch": 0.28632499875726997, + "grad_norm": 0.3934953808784485, + "learning_rate": 0.00019063833131701744, + "loss": 0.2648, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.77, + "memory/max_allocated (GiB)": 19.77, + "step": 2880, + "tokens_per_second_per_gpu": 333.15 + }, + { + "epoch": 0.2873191827807327, + "grad_norm": 0.3615911304950714, + "learning_rate": 0.00019057191679726162, + "loss": 0.2232, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.77, + "memory/max_allocated (GiB)": 19.77, + "step": 2890, + "tokens_per_second_per_gpu": 326.24 + }, + { + "epoch": 0.28831336680419545, + "grad_norm": 0.45740652084350586, + "learning_rate": 0.00019050527917973192, + "loss": 0.2467, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 2900, + "tokens_per_second_per_gpu": 399.74 + }, + { + "epoch": 0.2893075508276582, + "grad_norm": 0.5232924818992615, + "learning_rate": 0.00019043841862857088, + "loss": 0.292, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.62, + "memory/max_allocated (GiB)": 20.62, + "step": 2910, + "tokens_per_second_per_gpu": 385.0 + }, + { + "epoch": 0.29030173485112093, + "grad_norm": 0.3617020547389984, + "learning_rate": 0.00019037133530847014, + "loss": 0.2618, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 2920, + "tokens_per_second_per_gpu": 380.42 + }, + { + "epoch": 0.29129591887458367, + "grad_norm": 0.4762667119503021, + "learning_rate": 0.00019030402938467013, + "loss": 0.281, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 2930, + "tokens_per_second_per_gpu": 398.66 + }, + { + "epoch": 0.2922901028980464, + "grad_norm": 0.5239278078079224, + "learning_rate": 0.00019023650102295957, + "loss": 0.2205, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 2940, + "tokens_per_second_per_gpu": 317.01 + }, + { + "epoch": 0.29328428692150915, + "grad_norm": 0.38952499628067017, + "learning_rate": 0.00019016875038967507, + "loss": 0.2618, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 2950, + "tokens_per_second_per_gpu": 356.78 + }, + { + "epoch": 0.2942784709449719, + "grad_norm": 0.4657343924045563, + "learning_rate": 0.00019010077765170072, + "loss": 0.2508, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 2960, + "tokens_per_second_per_gpu": 364.11 + }, + { + "epoch": 0.29527265496843463, + "grad_norm": 0.4659232497215271, + "learning_rate": 0.0001900325829764678, + "loss": 0.2148, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.69, + "memory/max_allocated (GiB)": 19.69, + "step": 2970, + "tokens_per_second_per_gpu": 387.16 + }, + { + "epoch": 0.2962668389918974, + "grad_norm": 0.5370404124259949, + "learning_rate": 0.0001899641665319542, + "loss": 0.2857, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 2980, + "tokens_per_second_per_gpu": 315.91 + }, + { + "epoch": 0.29726102301536017, + "grad_norm": 0.3220022916793823, + "learning_rate": 0.00018989552848668406, + "loss": 0.2989, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.58, + "memory/max_allocated (GiB)": 20.58, + "step": 2990, + "tokens_per_second_per_gpu": 394.41 + }, + { + "epoch": 0.2982552070388229, + "grad_norm": 0.38055410981178284, + "learning_rate": 0.0001898266690097274, + "loss": 0.2354, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 3000, + "tokens_per_second_per_gpu": 319.33 + }, + { + "epoch": 0.29924939106228565, + "grad_norm": 0.36663955450057983, + "learning_rate": 0.00018975758827069968, + "loss": 0.2437, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 3010, + "tokens_per_second_per_gpu": 326.18 + }, + { + "epoch": 0.3002435750857484, + "grad_norm": 0.3347165882587433, + "learning_rate": 0.00018968828643976135, + "loss": 0.2016, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 3020, + "tokens_per_second_per_gpu": 363.38 + }, + { + "epoch": 0.30123775910921113, + "grad_norm": 0.4647526144981384, + "learning_rate": 0.0001896187636876175, + "loss": 0.2421, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 3030, + "tokens_per_second_per_gpu": 366.11 + }, + { + "epoch": 0.3022319431326739, + "grad_norm": 0.35668620467185974, + "learning_rate": 0.00018954902018551728, + "loss": 0.28, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 3040, + "tokens_per_second_per_gpu": 333.01 + }, + { + "epoch": 0.3032261271561366, + "grad_norm": 0.4070911109447479, + "learning_rate": 0.00018947905610525374, + "loss": 0.3329, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.58, + "memory/max_allocated (GiB)": 20.58, + "step": 3050, + "tokens_per_second_per_gpu": 367.0 + }, + { + "epoch": 0.30422031117959936, + "grad_norm": 0.4529660642147064, + "learning_rate": 0.00018940887161916317, + "loss": 0.2532, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 3060, + "tokens_per_second_per_gpu": 438.51 + }, + { + "epoch": 0.3052144952030621, + "grad_norm": 0.323428213596344, + "learning_rate": 0.0001893384669001248, + "loss": 0.2717, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 3070, + "tokens_per_second_per_gpu": 304.06 + }, + { + "epoch": 0.30620867922652484, + "grad_norm": 0.33601540327072144, + "learning_rate": 0.00018926784212156038, + "loss": 0.2575, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 17.29, + "memory/max_allocated (GiB)": 17.29, + "step": 3080, + "tokens_per_second_per_gpu": 315.78 + }, + { + "epoch": 0.3072028632499876, + "grad_norm": 0.4147079586982727, + "learning_rate": 0.0001891969974574336, + "loss": 0.2606, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 3090, + "tokens_per_second_per_gpu": 357.02 + }, + { + "epoch": 0.3081970472734503, + "grad_norm": 0.5923524498939514, + "learning_rate": 0.00018912593308224987, + "loss": 0.2824, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 3100, + "tokens_per_second_per_gpu": 360.95 + }, + { + "epoch": 0.30919123129691306, + "grad_norm": 0.2961815297603607, + "learning_rate": 0.00018905464917105577, + "loss": 0.3001, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 3110, + "tokens_per_second_per_gpu": 369.31 + }, + { + "epoch": 0.3101854153203758, + "grad_norm": 0.2864610254764557, + "learning_rate": 0.00018898314589943862, + "loss": 0.2441, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 3120, + "tokens_per_second_per_gpu": 383.33 + }, + { + "epoch": 0.31117959934383854, + "grad_norm": 0.28341177105903625, + "learning_rate": 0.00018891142344352611, + "loss": 0.2402, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 3130, + "tokens_per_second_per_gpu": 347.6 + }, + { + "epoch": 0.3121737833673013, + "grad_norm": 0.3956068158149719, + "learning_rate": 0.0001888394819799858, + "loss": 0.2318, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 3140, + "tokens_per_second_per_gpu": 343.71 + }, + { + "epoch": 0.313167967390764, + "grad_norm": 0.3796366751194, + "learning_rate": 0.00018876732168602472, + "loss": 0.2047, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 3150, + "tokens_per_second_per_gpu": 353.35 + }, + { + "epoch": 0.31416215141422676, + "grad_norm": 0.34916970133781433, + "learning_rate": 0.00018869494273938893, + "loss": 0.3128, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 3160, + "tokens_per_second_per_gpu": 316.54 + }, + { + "epoch": 0.3151563354376895, + "grad_norm": 0.3254728317260742, + "learning_rate": 0.00018862234531836307, + "loss": 0.3033, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 3170, + "tokens_per_second_per_gpu": 377.71 + }, + { + "epoch": 0.31615051946115225, + "grad_norm": 0.43219825625419617, + "learning_rate": 0.0001885495296017699, + "loss": 0.2952, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 3180, + "tokens_per_second_per_gpu": 314.3 + }, + { + "epoch": 0.317144703484615, + "grad_norm": 0.2606908082962036, + "learning_rate": 0.00018847649576897, + "loss": 0.2087, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 3190, + "tokens_per_second_per_gpu": 345.1 + }, + { + "epoch": 0.31813888750807773, + "grad_norm": 0.46498534083366394, + "learning_rate": 0.00018840324399986105, + "loss": 0.2665, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 3200, + "tokens_per_second_per_gpu": 313.16 + }, + { + "epoch": 0.31913307153154047, + "grad_norm": 0.35087594389915466, + "learning_rate": 0.00018832977447487772, + "loss": 0.2328, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 3210, + "tokens_per_second_per_gpu": 321.23 + }, + { + "epoch": 0.3201272555550032, + "grad_norm": 0.46294090151786804, + "learning_rate": 0.00018825608737499088, + "loss": 0.2233, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 3220, + "tokens_per_second_per_gpu": 366.88 + }, + { + "epoch": 0.32112143957846595, + "grad_norm": 0.35385075211524963, + "learning_rate": 0.00018818218288170753, + "loss": 0.2653, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 3230, + "tokens_per_second_per_gpu": 350.06 + }, + { + "epoch": 0.3221156236019287, + "grad_norm": 0.4959578812122345, + "learning_rate": 0.00018810806117706998, + "loss": 0.2921, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 3240, + "tokens_per_second_per_gpu": 415.69 + }, + { + "epoch": 0.3231098076253915, + "grad_norm": 0.4978230893611908, + "learning_rate": 0.0001880337224436557, + "loss": 0.3187, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 3250, + "tokens_per_second_per_gpu": 377.08 + }, + { + "epoch": 0.32410399164885423, + "grad_norm": 0.45267730951309204, + "learning_rate": 0.00018795916686457667, + "loss": 0.3013, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 3260, + "tokens_per_second_per_gpu": 410.24 + }, + { + "epoch": 0.32509817567231697, + "grad_norm": 0.3263493478298187, + "learning_rate": 0.00018788439462347908, + "loss": 0.2876, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 3270, + "tokens_per_second_per_gpu": 446.75 + }, + { + "epoch": 0.3260923596957797, + "grad_norm": 0.4769454896450043, + "learning_rate": 0.00018780940590454277, + "loss": 0.3504, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 3280, + "tokens_per_second_per_gpu": 301.09 + }, + { + "epoch": 0.32708654371924245, + "grad_norm": 0.21503253281116486, + "learning_rate": 0.00018773420089248074, + "loss": 0.2291, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 3290, + "tokens_per_second_per_gpu": 375.18 + }, + { + "epoch": 0.3280807277427052, + "grad_norm": 0.49010154604911804, + "learning_rate": 0.00018765877977253888, + "loss": 0.2674, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 3300, + "tokens_per_second_per_gpu": 349.74 + }, + { + "epoch": 0.32907491176616793, + "grad_norm": 0.3158112168312073, + "learning_rate": 0.00018758314273049532, + "loss": 0.2975, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 3310, + "tokens_per_second_per_gpu": 336.25 + }, + { + "epoch": 0.3300690957896307, + "grad_norm": 0.428846150636673, + "learning_rate": 0.0001875072899526601, + "loss": 0.2618, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 3320, + "tokens_per_second_per_gpu": 284.44 + }, + { + "epoch": 0.3310632798130934, + "grad_norm": 0.5033756494522095, + "learning_rate": 0.00018743122162587464, + "loss": 0.2926, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 3330, + "tokens_per_second_per_gpu": 419.59 + }, + { + "epoch": 0.33205746383655615, + "grad_norm": 0.17825426161289215, + "learning_rate": 0.0001873549379375113, + "loss": 0.2464, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 3340, + "tokens_per_second_per_gpu": 342.16 + }, + { + "epoch": 0.3330516478600189, + "grad_norm": 0.34919679164886475, + "learning_rate": 0.00018727843907547293, + "loss": 0.2729, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 3350, + "tokens_per_second_per_gpu": 390.88 + }, + { + "epoch": 0.33404583188348164, + "grad_norm": 0.37125077843666077, + "learning_rate": 0.00018720172522819243, + "loss": 0.3117, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 3360, + "tokens_per_second_per_gpu": 394.73 + }, + { + "epoch": 0.3350400159069444, + "grad_norm": 0.31769683957099915, + "learning_rate": 0.00018712479658463215, + "loss": 0.2702, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 3370, + "tokens_per_second_per_gpu": 424.16 + }, + { + "epoch": 0.3360341999304071, + "grad_norm": 0.398548424243927, + "learning_rate": 0.00018704765333428367, + "loss": 0.1966, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 3380, + "tokens_per_second_per_gpu": 310.88 + }, + { + "epoch": 0.33702838395386986, + "grad_norm": 0.24172648787498474, + "learning_rate": 0.00018697029566716705, + "loss": 0.2189, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 3390, + "tokens_per_second_per_gpu": 311.63 + }, + { + "epoch": 0.3380225679773326, + "grad_norm": 0.46132785081863403, + "learning_rate": 0.00018689272377383064, + "loss": 0.3093, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 3400, + "tokens_per_second_per_gpu": 411.8 + }, + { + "epoch": 0.33901675200079534, + "grad_norm": 0.3627679944038391, + "learning_rate": 0.00018681493784535036, + "loss": 0.2558, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 3410, + "tokens_per_second_per_gpu": 319.21 + }, + { + "epoch": 0.3400109360242581, + "grad_norm": 1.1992244720458984, + "learning_rate": 0.00018673693807332945, + "loss": 0.228, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 3420, + "tokens_per_second_per_gpu": 354.78 + }, + { + "epoch": 0.3410051200477208, + "grad_norm": 0.26419004797935486, + "learning_rate": 0.00018665872464989773, + "loss": 0.1874, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 3430, + "tokens_per_second_per_gpu": 369.02 + }, + { + "epoch": 0.34199930407118356, + "grad_norm": 0.3501751720905304, + "learning_rate": 0.00018658029776771152, + "loss": 0.2231, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 3440, + "tokens_per_second_per_gpu": 358.64 + }, + { + "epoch": 0.3429934880946463, + "grad_norm": 0.4123583137989044, + "learning_rate": 0.0001865016576199527, + "loss": 0.2456, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 3450, + "tokens_per_second_per_gpu": 399.31 + }, + { + "epoch": 0.34398767211810904, + "grad_norm": 0.4507691264152527, + "learning_rate": 0.00018642280440032863, + "loss": 0.2716, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 3460, + "tokens_per_second_per_gpu": 384.1 + }, + { + "epoch": 0.3449818561415718, + "grad_norm": 0.43500733375549316, + "learning_rate": 0.00018634373830307146, + "loss": 0.2352, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 3470, + "tokens_per_second_per_gpu": 319.67 + }, + { + "epoch": 0.3459760401650345, + "grad_norm": 0.40590760111808777, + "learning_rate": 0.00018626445952293766, + "loss": 0.2623, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 3480, + "tokens_per_second_per_gpu": 335.37 + }, + { + "epoch": 0.34697022418849727, + "grad_norm": 0.2494644969701767, + "learning_rate": 0.00018618496825520767, + "loss": 0.2245, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 3490, + "tokens_per_second_per_gpu": 347.95 + }, + { + "epoch": 0.34796440821196, + "grad_norm": 0.47100207209587097, + "learning_rate": 0.00018610526469568526, + "loss": 0.2775, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 3500, + "tokens_per_second_per_gpu": 393.01 + }, + { + "epoch": 0.3489585922354228, + "grad_norm": 0.5600543022155762, + "learning_rate": 0.00018602534904069712, + "loss": 0.3007, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.44, + "memory/max_allocated (GiB)": 20.44, + "step": 3510, + "tokens_per_second_per_gpu": 399.51 + }, + { + "epoch": 0.34995277625888555, + "grad_norm": 0.25791943073272705, + "learning_rate": 0.00018594522148709244, + "loss": 0.2134, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.84, + "memory/max_allocated (GiB)": 18.84, + "step": 3520, + "tokens_per_second_per_gpu": 341.75 + }, + { + "epoch": 0.3509469602823483, + "grad_norm": 0.2849276661872864, + "learning_rate": 0.00018586488223224228, + "loss": 0.1919, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.44, + "memory/max_allocated (GiB)": 20.44, + "step": 3530, + "tokens_per_second_per_gpu": 369.12 + }, + { + "epoch": 0.351941144305811, + "grad_norm": 0.32681041955947876, + "learning_rate": 0.00018578433147403925, + "loss": 0.2192, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 3540, + "tokens_per_second_per_gpu": 309.14 + }, + { + "epoch": 0.35293532832927377, + "grad_norm": 0.5691526532173157, + "learning_rate": 0.00018570356941089686, + "loss": 0.2775, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 3550, + "tokens_per_second_per_gpu": 357.84 + }, + { + "epoch": 0.3539295123527365, + "grad_norm": 0.35383832454681396, + "learning_rate": 0.00018562259624174915, + "loss": 0.2285, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 3560, + "tokens_per_second_per_gpu": 341.0 + }, + { + "epoch": 0.35492369637619925, + "grad_norm": 0.4128180742263794, + "learning_rate": 0.00018554141216605016, + "loss": 0.2216, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 3570, + "tokens_per_second_per_gpu": 310.04 + }, + { + "epoch": 0.355917880399662, + "grad_norm": 0.4811583459377289, + "learning_rate": 0.00018546001738377338, + "loss": 0.3354, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 3580, + "tokens_per_second_per_gpu": 446.97 + }, + { + "epoch": 0.35691206442312473, + "grad_norm": 0.4148445725440979, + "learning_rate": 0.0001853784120954114, + "loss": 0.216, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 3590, + "tokens_per_second_per_gpu": 327.38 + }, + { + "epoch": 0.35790624844658747, + "grad_norm": 0.5083706378936768, + "learning_rate": 0.0001852965965019753, + "loss": 0.2762, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 3600, + "tokens_per_second_per_gpu": 371.89 + }, + { + "epoch": 0.3589004324700502, + "grad_norm": 0.4946528673171997, + "learning_rate": 0.00018521457080499418, + "loss": 0.2455, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.77, + "memory/max_allocated (GiB)": 19.77, + "step": 3610, + "tokens_per_second_per_gpu": 361.51 + }, + { + "epoch": 0.35989461649351295, + "grad_norm": 0.4880548417568207, + "learning_rate": 0.00018513233520651466, + "loss": 0.2299, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 3620, + "tokens_per_second_per_gpu": 310.24 + }, + { + "epoch": 0.3608888005169757, + "grad_norm": 0.18661662936210632, + "learning_rate": 0.00018504988990910036, + "loss": 0.2325, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 3630, + "tokens_per_second_per_gpu": 360.45 + }, + { + "epoch": 0.36188298454043843, + "grad_norm": 0.49652183055877686, + "learning_rate": 0.00018496723511583153, + "loss": 0.2312, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 3640, + "tokens_per_second_per_gpu": 347.3 + }, + { + "epoch": 0.3628771685639012, + "grad_norm": 0.35343873500823975, + "learning_rate": 0.0001848843710303044, + "loss": 0.154, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 3650, + "tokens_per_second_per_gpu": 334.16 + }, + { + "epoch": 0.3638713525873639, + "grad_norm": 0.5269297361373901, + "learning_rate": 0.0001848012978566307, + "loss": 0.2677, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 3660, + "tokens_per_second_per_gpu": 343.01 + }, + { + "epoch": 0.36486553661082666, + "grad_norm": 0.4809168875217438, + "learning_rate": 0.00018471801579943717, + "loss": 0.3083, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 3670, + "tokens_per_second_per_gpu": 359.61 + }, + { + "epoch": 0.3658597206342894, + "grad_norm": 0.4312402904033661, + "learning_rate": 0.0001846345250638652, + "loss": 0.2711, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 3680, + "tokens_per_second_per_gpu": 343.63 + }, + { + "epoch": 0.36685390465775214, + "grad_norm": 0.44654685258865356, + "learning_rate": 0.0001845508258555701, + "loss": 0.2629, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 3690, + "tokens_per_second_per_gpu": 380.56 + }, + { + "epoch": 0.3678480886812149, + "grad_norm": 0.19989164173603058, + "learning_rate": 0.00018446691838072067, + "loss": 0.2451, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 3700, + "tokens_per_second_per_gpu": 316.7 + }, + { + "epoch": 0.3688422727046776, + "grad_norm": 0.268655925989151, + "learning_rate": 0.00018438280284599877, + "loss": 0.2172, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 3710, + "tokens_per_second_per_gpu": 270.99 + }, + { + "epoch": 0.36983645672814036, + "grad_norm": 0.3657344579696655, + "learning_rate": 0.00018429847945859872, + "loss": 0.2505, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 3720, + "tokens_per_second_per_gpu": 319.93 + }, + { + "epoch": 0.3708306407516031, + "grad_norm": 0.40177711844444275, + "learning_rate": 0.00018421394842622695, + "loss": 0.2462, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 3730, + "tokens_per_second_per_gpu": 344.24 + }, + { + "epoch": 0.37182482477506584, + "grad_norm": 0.48767533898353577, + "learning_rate": 0.00018412920995710113, + "loss": 0.2827, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 3740, + "tokens_per_second_per_gpu": 380.45 + }, + { + "epoch": 0.3728190087985286, + "grad_norm": 0.45828619599342346, + "learning_rate": 0.00018404426425995007, + "loss": 0.2355, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 3750, + "tokens_per_second_per_gpu": 388.78 + }, + { + "epoch": 0.3738131928219913, + "grad_norm": 0.49931567907333374, + "learning_rate": 0.000183959111544013, + "loss": 0.2813, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 3760, + "tokens_per_second_per_gpu": 361.55 + }, + { + "epoch": 0.3748073768454541, + "grad_norm": 0.3232674300670624, + "learning_rate": 0.00018387375201903903, + "loss": 0.2488, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 3770, + "tokens_per_second_per_gpu": 326.01 + }, + { + "epoch": 0.37580156086891686, + "grad_norm": 0.41870149970054626, + "learning_rate": 0.0001837881858952867, + "loss": 0.3117, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 3780, + "tokens_per_second_per_gpu": 356.89 + }, + { + "epoch": 0.3767957448923796, + "grad_norm": 0.396383672952652, + "learning_rate": 0.00018370241338352348, + "loss": 0.3046, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 3790, + "tokens_per_second_per_gpu": 397.36 + }, + { + "epoch": 0.37778992891584234, + "grad_norm": 0.33363988995552063, + "learning_rate": 0.00018361643469502517, + "loss": 0.2074, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 3800, + "tokens_per_second_per_gpu": 320.36 + }, + { + "epoch": 0.3787841129393051, + "grad_norm": 0.34591570496559143, + "learning_rate": 0.00018353025004157552, + "loss": 0.2449, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 3810, + "tokens_per_second_per_gpu": 370.18 + }, + { + "epoch": 0.3797782969627678, + "grad_norm": 0.4369080066680908, + "learning_rate": 0.00018344385963546547, + "loss": 0.2017, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 3820, + "tokens_per_second_per_gpu": 346.43 + }, + { + "epoch": 0.38077248098623057, + "grad_norm": 0.4190782308578491, + "learning_rate": 0.00018335726368949286, + "loss": 0.2987, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 3830, + "tokens_per_second_per_gpu": 381.05 + }, + { + "epoch": 0.3817666650096933, + "grad_norm": 0.4989373981952667, + "learning_rate": 0.00018327046241696184, + "loss": 0.2992, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.77, + "memory/max_allocated (GiB)": 19.77, + "step": 3840, + "tokens_per_second_per_gpu": 363.52 + }, + { + "epoch": 0.38276084903315605, + "grad_norm": 0.3604322671890259, + "learning_rate": 0.00018318345603168226, + "loss": 0.2311, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 3850, + "tokens_per_second_per_gpu": 286.25 + }, + { + "epoch": 0.3837550330566188, + "grad_norm": 0.303365021944046, + "learning_rate": 0.00018309624474796926, + "loss": 0.1952, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 3860, + "tokens_per_second_per_gpu": 381.31 + }, + { + "epoch": 0.38474921708008153, + "grad_norm": 0.5674360990524292, + "learning_rate": 0.00018300882878064266, + "loss": 0.2694, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.82, + "memory/max_allocated (GiB)": 18.82, + "step": 3870, + "tokens_per_second_per_gpu": 346.22 + }, + { + "epoch": 0.38574340110354427, + "grad_norm": 0.4402889609336853, + "learning_rate": 0.00018292120834502643, + "loss": 0.2825, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 3880, + "tokens_per_second_per_gpu": 339.23 + }, + { + "epoch": 0.386737585127007, + "grad_norm": 0.3922783136367798, + "learning_rate": 0.00018283338365694825, + "loss": 0.2294, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 3890, + "tokens_per_second_per_gpu": 289.83 + }, + { + "epoch": 0.38773176915046975, + "grad_norm": 0.6003592014312744, + "learning_rate": 0.00018274535493273893, + "loss": 0.2244, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 3900, + "tokens_per_second_per_gpu": 350.12 + }, + { + "epoch": 0.3887259531739325, + "grad_norm": 0.436212956905365, + "learning_rate": 0.00018265712238923175, + "loss": 0.2341, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 3910, + "tokens_per_second_per_gpu": 343.17 + }, + { + "epoch": 0.38972013719739523, + "grad_norm": 0.2501852810382843, + "learning_rate": 0.00018256868624376215, + "loss": 0.2647, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 3920, + "tokens_per_second_per_gpu": 396.07 + }, + { + "epoch": 0.390714321220858, + "grad_norm": 0.36171767115592957, + "learning_rate": 0.00018248004671416704, + "loss": 0.2664, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.69, + "memory/max_allocated (GiB)": 19.69, + "step": 3930, + "tokens_per_second_per_gpu": 426.89 + }, + { + "epoch": 0.3917085052443207, + "grad_norm": 0.47077932953834534, + "learning_rate": 0.00018239120401878432, + "loss": 0.3584, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 3940, + "tokens_per_second_per_gpu": 452.63 + }, + { + "epoch": 0.39270268926778346, + "grad_norm": 0.413924902677536, + "learning_rate": 0.00018230215837645232, + "loss": 0.2715, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 3950, + "tokens_per_second_per_gpu": 424.3 + }, + { + "epoch": 0.3936968732912462, + "grad_norm": 0.40877413749694824, + "learning_rate": 0.00018221291000650928, + "loss": 0.2855, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 3960, + "tokens_per_second_per_gpu": 406.29 + }, + { + "epoch": 0.39469105731470894, + "grad_norm": 0.4080711007118225, + "learning_rate": 0.0001821234591287928, + "loss": 0.1893, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 3970, + "tokens_per_second_per_gpu": 347.04 + }, + { + "epoch": 0.3956852413381717, + "grad_norm": 0.2622958719730377, + "learning_rate": 0.00018203380596363932, + "loss": 0.2328, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 3980, + "tokens_per_second_per_gpu": 323.13 + }, + { + "epoch": 0.3966794253616344, + "grad_norm": 0.32758989930152893, + "learning_rate": 0.0001819439507318835, + "loss": 0.196, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.77, + "memory/max_allocated (GiB)": 19.77, + "step": 3990, + "tokens_per_second_per_gpu": 255.43 + }, + { + "epoch": 0.39767360938509716, + "grad_norm": 0.4135094881057739, + "learning_rate": 0.00018185389365485774, + "loss": 0.2874, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 4000, + "tokens_per_second_per_gpu": 352.42 + }, + { + "epoch": 0.3986677934085599, + "grad_norm": 0.4753275215625763, + "learning_rate": 0.00018176363495439173, + "loss": 0.2796, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 4010, + "tokens_per_second_per_gpu": 300.74 + }, + { + "epoch": 0.39966197743202264, + "grad_norm": 0.41000860929489136, + "learning_rate": 0.00018167317485281168, + "loss": 0.3278, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 4020, + "tokens_per_second_per_gpu": 381.42 + }, + { + "epoch": 0.40065616145548544, + "grad_norm": 0.3132636845111847, + "learning_rate": 0.00018158251357293996, + "loss": 0.2514, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 4030, + "tokens_per_second_per_gpu": 340.93 + }, + { + "epoch": 0.4016503454789482, + "grad_norm": 0.3330332338809967, + "learning_rate": 0.00018149165133809442, + "loss": 0.219, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 4040, + "tokens_per_second_per_gpu": 331.95 + }, + { + "epoch": 0.4026445295024109, + "grad_norm": 0.5818430781364441, + "learning_rate": 0.000181400588372088, + "loss": 0.3451, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 4050, + "tokens_per_second_per_gpu": 433.43 + }, + { + "epoch": 0.40363871352587366, + "grad_norm": 0.4116646945476532, + "learning_rate": 0.00018130932489922804, + "loss": 0.1907, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 4060, + "tokens_per_second_per_gpu": 326.32 + }, + { + "epoch": 0.4046328975493364, + "grad_norm": 0.31805434823036194, + "learning_rate": 0.0001812178611443157, + "loss": 0.18, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 4070, + "tokens_per_second_per_gpu": 284.15 + }, + { + "epoch": 0.40562708157279914, + "grad_norm": 0.5397758483886719, + "learning_rate": 0.0001811261973326456, + "loss": 0.2596, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 4080, + "tokens_per_second_per_gpu": 369.06 + }, + { + "epoch": 0.4066212655962619, + "grad_norm": 0.4468703866004944, + "learning_rate": 0.00018103433369000502, + "loss": 0.2464, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 4090, + "tokens_per_second_per_gpu": 343.56 + }, + { + "epoch": 0.4076154496197246, + "grad_norm": 0.3228696286678314, + "learning_rate": 0.0001809422704426736, + "loss": 0.2, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 4100, + "tokens_per_second_per_gpu": 301.0 + }, + { + "epoch": 0.40860963364318736, + "grad_norm": 0.5811059474945068, + "learning_rate": 0.00018085000781742252, + "loss": 0.2642, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 4110, + "tokens_per_second_per_gpu": 366.31 + }, + { + "epoch": 0.4096038176666501, + "grad_norm": 0.6858221292495728, + "learning_rate": 0.00018075754604151415, + "loss": 0.2658, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.68, + "memory/max_allocated (GiB)": 19.68, + "step": 4120, + "tokens_per_second_per_gpu": 340.12 + }, + { + "epoch": 0.41059800169011285, + "grad_norm": 0.3095184862613678, + "learning_rate": 0.00018066488534270142, + "loss": 0.2542, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 4130, + "tokens_per_second_per_gpu": 399.71 + }, + { + "epoch": 0.4115921857135756, + "grad_norm": 0.5910835266113281, + "learning_rate": 0.0001805720259492271, + "loss": 0.289, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 4140, + "tokens_per_second_per_gpu": 366.05 + }, + { + "epoch": 0.4125863697370383, + "grad_norm": 0.3704458773136139, + "learning_rate": 0.00018047896808982364, + "loss": 0.2581, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 4150, + "tokens_per_second_per_gpu": 348.57 + }, + { + "epoch": 0.41358055376050107, + "grad_norm": 0.4034087359905243, + "learning_rate": 0.00018038571199371215, + "loss": 0.2207, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 4160, + "tokens_per_second_per_gpu": 323.67 + }, + { + "epoch": 0.4145747377839638, + "grad_norm": 0.5003114342689514, + "learning_rate": 0.0001802922578906021, + "loss": 0.2927, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 4170, + "tokens_per_second_per_gpu": 325.75 + }, + { + "epoch": 0.41556892180742655, + "grad_norm": 0.29613539576530457, + "learning_rate": 0.0001801986060106907, + "loss": 0.2906, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 4180, + "tokens_per_second_per_gpu": 335.06 + }, + { + "epoch": 0.4165631058308893, + "grad_norm": 0.49740076065063477, + "learning_rate": 0.00018010475658466235, + "loss": 0.2853, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 4190, + "tokens_per_second_per_gpu": 420.46 + }, + { + "epoch": 0.41755728985435203, + "grad_norm": 0.32316842675209045, + "learning_rate": 0.000180010709843688, + "loss": 0.1658, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 4200, + "tokens_per_second_per_gpu": 288.98 + }, + { + "epoch": 0.4185514738778148, + "grad_norm": 0.29272300004959106, + "learning_rate": 0.00017991646601942467, + "loss": 0.2719, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.23, + "memory/max_allocated (GiB)": 19.23, + "step": 4210, + "tokens_per_second_per_gpu": 364.92 + }, + { + "epoch": 0.4195456579012775, + "grad_norm": 0.5684819221496582, + "learning_rate": 0.0001798220253440148, + "loss": 0.2755, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 4220, + "tokens_per_second_per_gpu": 317.91 + }, + { + "epoch": 0.42053984192474025, + "grad_norm": 0.4488314986228943, + "learning_rate": 0.00017972738805008574, + "loss": 0.2131, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 4230, + "tokens_per_second_per_gpu": 328.07 + }, + { + "epoch": 0.421534025948203, + "grad_norm": 0.3249848484992981, + "learning_rate": 0.0001796325543707491, + "loss": 0.2987, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 4240, + "tokens_per_second_per_gpu": 421.46 + }, + { + "epoch": 0.42252820997166574, + "grad_norm": 0.6481621265411377, + "learning_rate": 0.00017953752453960038, + "loss": 0.2498, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.43, + "memory/max_allocated (GiB)": 20.43, + "step": 4250, + "tokens_per_second_per_gpu": 363.83 + }, + { + "epoch": 0.4235223939951285, + "grad_norm": 0.3045104146003723, + "learning_rate": 0.00017944229879071806, + "loss": 0.2295, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 4260, + "tokens_per_second_per_gpu": 398.66 + }, + { + "epoch": 0.4245165780185912, + "grad_norm": 0.32762956619262695, + "learning_rate": 0.0001793468773586633, + "loss": 0.2406, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 4270, + "tokens_per_second_per_gpu": 399.72 + }, + { + "epoch": 0.42551076204205396, + "grad_norm": 0.6278170347213745, + "learning_rate": 0.00017925126047847924, + "loss": 0.2523, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 4280, + "tokens_per_second_per_gpu": 324.65 + }, + { + "epoch": 0.42650494606551675, + "grad_norm": 0.45905986428260803, + "learning_rate": 0.00017915544838569052, + "loss": 0.2615, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 4290, + "tokens_per_second_per_gpu": 353.66 + }, + { + "epoch": 0.4274991300889795, + "grad_norm": 0.48581770062446594, + "learning_rate": 0.00017905944131630253, + "loss": 0.2519, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 4300, + "tokens_per_second_per_gpu": 321.73 + }, + { + "epoch": 0.42849331411244224, + "grad_norm": 0.49877023696899414, + "learning_rate": 0.00017896323950680098, + "loss": 0.2382, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 4310, + "tokens_per_second_per_gpu": 353.68 + }, + { + "epoch": 0.429487498135905, + "grad_norm": 0.580008327960968, + "learning_rate": 0.00017886684319415127, + "loss": 0.2478, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 4320, + "tokens_per_second_per_gpu": 368.72 + }, + { + "epoch": 0.4304816821593677, + "grad_norm": 0.3998342454433441, + "learning_rate": 0.00017877025261579788, + "loss": 0.2202, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 4330, + "tokens_per_second_per_gpu": 395.43 + }, + { + "epoch": 0.43147586618283046, + "grad_norm": 0.38088393211364746, + "learning_rate": 0.00017867346800966383, + "loss": 0.2521, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 4340, + "tokens_per_second_per_gpu": 427.5 + }, + { + "epoch": 0.4324700502062932, + "grad_norm": 0.3859814703464508, + "learning_rate": 0.00017857648961415004, + "loss": 0.2353, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.39, + "memory/max_allocated (GiB)": 21.39, + "step": 4350, + "tokens_per_second_per_gpu": 376.77 + }, + { + "epoch": 0.43346423422975594, + "grad_norm": 0.34923896193504333, + "learning_rate": 0.00017847931766813482, + "loss": 0.2567, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 4360, + "tokens_per_second_per_gpu": 368.67 + }, + { + "epoch": 0.4344584182532187, + "grad_norm": 0.34750089049339294, + "learning_rate": 0.0001783819524109732, + "loss": 0.2291, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 4370, + "tokens_per_second_per_gpu": 379.66 + }, + { + "epoch": 0.4354526022766814, + "grad_norm": 0.36663779616355896, + "learning_rate": 0.0001782843940824964, + "loss": 0.2866, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 4380, + "tokens_per_second_per_gpu": 411.45 + }, + { + "epoch": 0.43644678630014416, + "grad_norm": 0.3929060697555542, + "learning_rate": 0.00017818664292301118, + "loss": 0.2563, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 4390, + "tokens_per_second_per_gpu": 381.12 + }, + { + "epoch": 0.4374409703236069, + "grad_norm": 0.4182446599006653, + "learning_rate": 0.0001780886991732993, + "loss": 0.2268, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 4400, + "tokens_per_second_per_gpu": 297.67 + }, + { + "epoch": 0.43843515434706964, + "grad_norm": 0.5998858213424683, + "learning_rate": 0.00017799056307461696, + "loss": 0.2629, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 4410, + "tokens_per_second_per_gpu": 344.26 + }, + { + "epoch": 0.4394293383705324, + "grad_norm": 0.4282694160938263, + "learning_rate": 0.0001778922348686941, + "loss": 0.2523, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.44, + "memory/max_allocated (GiB)": 20.44, + "step": 4420, + "tokens_per_second_per_gpu": 328.11 + }, + { + "epoch": 0.4404235223939951, + "grad_norm": 0.5925072431564331, + "learning_rate": 0.00017779371479773382, + "loss": 0.27, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 4430, + "tokens_per_second_per_gpu": 336.68 + }, + { + "epoch": 0.44141770641745787, + "grad_norm": 0.5149052739143372, + "learning_rate": 0.00017769500310441192, + "loss": 0.3033, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.39, + "memory/max_allocated (GiB)": 21.39, + "step": 4440, + "tokens_per_second_per_gpu": 370.47 + }, + { + "epoch": 0.4424118904409206, + "grad_norm": 0.418197363615036, + "learning_rate": 0.00017759610003187617, + "loss": 0.2193, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 4450, + "tokens_per_second_per_gpu": 352.55 + }, + { + "epoch": 0.44340607446438335, + "grad_norm": 0.4415562152862549, + "learning_rate": 0.00017749700582374574, + "loss": 0.1978, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 4460, + "tokens_per_second_per_gpu": 343.08 + }, + { + "epoch": 0.4444002584878461, + "grad_norm": 0.32262691855430603, + "learning_rate": 0.0001773977207241106, + "loss": 0.2448, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 4470, + "tokens_per_second_per_gpu": 319.4 + }, + { + "epoch": 0.44539444251130883, + "grad_norm": 0.49002590775489807, + "learning_rate": 0.00017729824497753093, + "loss": 0.2772, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 4480, + "tokens_per_second_per_gpu": 331.65 + }, + { + "epoch": 0.44638862653477157, + "grad_norm": 0.4270131587982178, + "learning_rate": 0.0001771985788290365, + "loss": 0.2557, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 4490, + "tokens_per_second_per_gpu": 300.31 + }, + { + "epoch": 0.4473828105582343, + "grad_norm": 0.5524002909660339, + "learning_rate": 0.00017709872252412616, + "loss": 0.2696, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 4500, + "tokens_per_second_per_gpu": 348.17 + }, + { + "epoch": 0.44837699458169705, + "grad_norm": 0.32532012462615967, + "learning_rate": 0.00017699867630876703, + "loss": 0.1997, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 4510, + "tokens_per_second_per_gpu": 315.46 + }, + { + "epoch": 0.4493711786051598, + "grad_norm": 0.49136292934417725, + "learning_rate": 0.0001768984404293941, + "loss": 0.2824, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 4520, + "tokens_per_second_per_gpu": 391.14 + }, + { + "epoch": 0.45036536262862253, + "grad_norm": 0.43822959065437317, + "learning_rate": 0.00017679801513290956, + "loss": 0.2931, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 4530, + "tokens_per_second_per_gpu": 382.92 + }, + { + "epoch": 0.4513595466520853, + "grad_norm": 0.448585569858551, + "learning_rate": 0.00017669740066668214, + "loss": 0.2444, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 4540, + "tokens_per_second_per_gpu": 364.48 + }, + { + "epoch": 0.45235373067554807, + "grad_norm": 0.3125511407852173, + "learning_rate": 0.0001765965972785465, + "loss": 0.2227, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 4550, + "tokens_per_second_per_gpu": 360.51 + }, + { + "epoch": 0.4533479146990108, + "grad_norm": 0.6492682695388794, + "learning_rate": 0.00017649560521680266, + "loss": 0.3157, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 4560, + "tokens_per_second_per_gpu": 361.61 + }, + { + "epoch": 0.45434209872247355, + "grad_norm": 0.23529411852359772, + "learning_rate": 0.0001763944247302155, + "loss": 0.2644, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 4570, + "tokens_per_second_per_gpu": 333.05 + }, + { + "epoch": 0.4553362827459363, + "grad_norm": 0.37334245443344116, + "learning_rate": 0.00017629305606801387, + "loss": 0.1995, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 4580, + "tokens_per_second_per_gpu": 380.12 + }, + { + "epoch": 0.45633046676939903, + "grad_norm": 0.26320332288742065, + "learning_rate": 0.00017619149947989028, + "loss": 0.201, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 4590, + "tokens_per_second_per_gpu": 333.8 + }, + { + "epoch": 0.4573246507928618, + "grad_norm": 0.4711815416812897, + "learning_rate": 0.000176089755216, + "loss": 0.3037, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 4600, + "tokens_per_second_per_gpu": 398.88 + }, + { + "epoch": 0.4583188348163245, + "grad_norm": 0.3597434461116791, + "learning_rate": 0.0001759878235269607, + "loss": 0.2393, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 4610, + "tokens_per_second_per_gpu": 340.42 + }, + { + "epoch": 0.45931301883978726, + "grad_norm": 0.5157446265220642, + "learning_rate": 0.00017588570466385166, + "loss": 0.314, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 4620, + "tokens_per_second_per_gpu": 374.07 + }, + { + "epoch": 0.46030720286325, + "grad_norm": 0.4747403562068939, + "learning_rate": 0.0001757833988782132, + "loss": 0.2606, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.47, + "memory/max_allocated (GiB)": 21.47, + "step": 4630, + "tokens_per_second_per_gpu": 333.75 + }, + { + "epoch": 0.46130138688671274, + "grad_norm": 0.45278453826904297, + "learning_rate": 0.00017568090642204612, + "loss": 0.2106, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 4640, + "tokens_per_second_per_gpu": 352.11 + }, + { + "epoch": 0.4622955709101755, + "grad_norm": 0.7563058137893677, + "learning_rate": 0.00017557822754781102, + "loss": 0.2457, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 4650, + "tokens_per_second_per_gpu": 370.21 + }, + { + "epoch": 0.4632897549336382, + "grad_norm": 0.27436432242393494, + "learning_rate": 0.00017547536250842765, + "loss": 0.2659, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 4660, + "tokens_per_second_per_gpu": 347.64 + }, + { + "epoch": 0.46428393895710096, + "grad_norm": 0.3469400107860565, + "learning_rate": 0.00017537231155727428, + "loss": 0.2744, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 4670, + "tokens_per_second_per_gpu": 354.35 + }, + { + "epoch": 0.4652781229805637, + "grad_norm": 0.29021984338760376, + "learning_rate": 0.0001752690749481873, + "loss": 0.236, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 4680, + "tokens_per_second_per_gpu": 357.93 + }, + { + "epoch": 0.46627230700402644, + "grad_norm": 0.28982868790626526, + "learning_rate": 0.00017516565293546025, + "loss": 0.2694, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 4690, + "tokens_per_second_per_gpu": 433.34 + }, + { + "epoch": 0.4672664910274892, + "grad_norm": 0.39744624495506287, + "learning_rate": 0.00017506204577384337, + "loss": 0.2209, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 4700, + "tokens_per_second_per_gpu": 380.58 + }, + { + "epoch": 0.4682606750509519, + "grad_norm": 0.14510154724121094, + "learning_rate": 0.00017495825371854302, + "loss": 0.2147, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 4710, + "tokens_per_second_per_gpu": 342.34 + }, + { + "epoch": 0.46925485907441467, + "grad_norm": 0.26011091470718384, + "learning_rate": 0.000174854277025221, + "loss": 0.2411, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 4720, + "tokens_per_second_per_gpu": 395.32 + }, + { + "epoch": 0.4702490430978774, + "grad_norm": 0.5020186901092529, + "learning_rate": 0.00017475011594999385, + "loss": 0.2466, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 4730, + "tokens_per_second_per_gpu": 385.96 + }, + { + "epoch": 0.47124322712134015, + "grad_norm": 0.5160934925079346, + "learning_rate": 0.0001746457707494323, + "loss": 0.3094, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.17, + "memory/max_allocated (GiB)": 18.17, + "step": 4740, + "tokens_per_second_per_gpu": 343.47 + }, + { + "epoch": 0.4722374111448029, + "grad_norm": 0.43567317724227905, + "learning_rate": 0.00017454124168056066, + "loss": 0.2324, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 4750, + "tokens_per_second_per_gpu": 346.39 + }, + { + "epoch": 0.47323159516826563, + "grad_norm": 0.39488813281059265, + "learning_rate": 0.0001744365290008561, + "loss": 0.1983, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 4760, + "tokens_per_second_per_gpu": 363.53 + }, + { + "epoch": 0.47422577919172837, + "grad_norm": 0.2595832943916321, + "learning_rate": 0.00017433163296824808, + "loss": 0.2783, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 4770, + "tokens_per_second_per_gpu": 431.69 + }, + { + "epoch": 0.4752199632151911, + "grad_norm": 0.3657257556915283, + "learning_rate": 0.00017422655384111772, + "loss": 0.2223, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 4780, + "tokens_per_second_per_gpu": 333.82 + }, + { + "epoch": 0.47621414723865385, + "grad_norm": 0.3269219994544983, + "learning_rate": 0.00017412129187829712, + "loss": 0.2042, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.53, + "memory/max_allocated (GiB)": 20.53, + "step": 4790, + "tokens_per_second_per_gpu": 356.0 + }, + { + "epoch": 0.4772083312621166, + "grad_norm": 0.35581299662590027, + "learning_rate": 0.00017401584733906872, + "loss": 0.216, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 4800, + "tokens_per_second_per_gpu": 306.32 + }, + { + "epoch": 0.4782025152855794, + "grad_norm": 0.5693733096122742, + "learning_rate": 0.00017391022048316476, + "loss": 0.3306, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.19, + "memory/max_allocated (GiB)": 19.19, + "step": 4810, + "tokens_per_second_per_gpu": 359.5 + }, + { + "epoch": 0.47919669930904213, + "grad_norm": 0.33154231309890747, + "learning_rate": 0.00017380441157076643, + "loss": 0.2469, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 4820, + "tokens_per_second_per_gpu": 363.91 + }, + { + "epoch": 0.48019088333250487, + "grad_norm": 0.417501837015152, + "learning_rate": 0.00017369842086250347, + "loss": 0.2286, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 4830, + "tokens_per_second_per_gpu": 261.89 + }, + { + "epoch": 0.4811850673559676, + "grad_norm": 0.2794663608074188, + "learning_rate": 0.00017359224861945345, + "loss": 0.2415, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 4840, + "tokens_per_second_per_gpu": 350.79 + }, + { + "epoch": 0.48217925137943035, + "grad_norm": 0.31123244762420654, + "learning_rate": 0.00017348589510314096, + "loss": 0.2396, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 4850, + "tokens_per_second_per_gpu": 380.51 + }, + { + "epoch": 0.4831734354028931, + "grad_norm": 0.21615763008594513, + "learning_rate": 0.00017337936057553726, + "loss": 0.2286, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 4860, + "tokens_per_second_per_gpu": 430.28 + }, + { + "epoch": 0.48416761942635583, + "grad_norm": 0.38201475143432617, + "learning_rate": 0.0001732726452990594, + "loss": 0.2213, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 4870, + "tokens_per_second_per_gpu": 337.65 + }, + { + "epoch": 0.4851618034498186, + "grad_norm": 0.4545513987541199, + "learning_rate": 0.00017316574953656958, + "loss": 0.2696, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 4880, + "tokens_per_second_per_gpu": 388.83 + }, + { + "epoch": 0.4861559874732813, + "grad_norm": 0.2672022879123688, + "learning_rate": 0.00017305867355137475, + "loss": 0.1962, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 4890, + "tokens_per_second_per_gpu": 318.92 + }, + { + "epoch": 0.48715017149674406, + "grad_norm": 0.4752904772758484, + "learning_rate": 0.00017295141760722567, + "loss": 0.2107, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 4900, + "tokens_per_second_per_gpu": 361.56 + }, + { + "epoch": 0.4881443555202068, + "grad_norm": 0.4459490180015564, + "learning_rate": 0.0001728439819683164, + "loss": 0.2648, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.39, + "memory/max_allocated (GiB)": 21.39, + "step": 4910, + "tokens_per_second_per_gpu": 302.7 + }, + { + "epoch": 0.48913853954366954, + "grad_norm": 0.6155937314033508, + "learning_rate": 0.00017273636689928357, + "loss": 0.2714, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 4920, + "tokens_per_second_per_gpu": 379.02 + }, + { + "epoch": 0.4901327235671323, + "grad_norm": 0.4557485282421112, + "learning_rate": 0.00017262857266520595, + "loss": 0.1966, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 4930, + "tokens_per_second_per_gpu": 369.72 + }, + { + "epoch": 0.491126907590595, + "grad_norm": 0.4636807143688202, + "learning_rate": 0.0001725205995316034, + "loss": 0.2803, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 4940, + "tokens_per_second_per_gpu": 373.68 + }, + { + "epoch": 0.49212109161405776, + "grad_norm": 0.45894595980644226, + "learning_rate": 0.00017241244776443666, + "loss": 0.3439, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.62, + "memory/max_allocated (GiB)": 20.62, + "step": 4950, + "tokens_per_second_per_gpu": 426.86 + }, + { + "epoch": 0.4931152756375205, + "grad_norm": 0.39921844005584717, + "learning_rate": 0.0001723041176301063, + "loss": 0.3035, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 4960, + "tokens_per_second_per_gpu": 375.09 + }, + { + "epoch": 0.49410945966098324, + "grad_norm": 0.28210845589637756, + "learning_rate": 0.00017219560939545246, + "loss": 0.2043, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 4970, + "tokens_per_second_per_gpu": 320.21 + }, + { + "epoch": 0.495103643684446, + "grad_norm": 0.5301778316497803, + "learning_rate": 0.00017208692332775375, + "loss": 0.2293, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 4980, + "tokens_per_second_per_gpu": 327.97 + }, + { + "epoch": 0.4960978277079087, + "grad_norm": 0.40421542525291443, + "learning_rate": 0.000171978059694727, + "loss": 0.3101, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 4990, + "tokens_per_second_per_gpu": 395.91 + }, + { + "epoch": 0.49709201173137146, + "grad_norm": 0.383989155292511, + "learning_rate": 0.0001718690187645263, + "loss": 0.2447, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 5000, + "tokens_per_second_per_gpu": 344.59 + }, + { + "epoch": 0.4980861957548342, + "grad_norm": 0.559518039226532, + "learning_rate": 0.00017175980080574247, + "loss": 0.3176, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.53, + "memory/max_allocated (GiB)": 20.53, + "step": 5010, + "tokens_per_second_per_gpu": 378.55 + }, + { + "epoch": 0.49908037977829695, + "grad_norm": 0.46650487184524536, + "learning_rate": 0.00017165040608740255, + "loss": 0.2006, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 5020, + "tokens_per_second_per_gpu": 334.42 + }, + { + "epoch": 0.5000745638017597, + "grad_norm": 0.3541397750377655, + "learning_rate": 0.00017154083487896872, + "loss": 0.2542, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 5030, + "tokens_per_second_per_gpu": 360.31 + }, + { + "epoch": 0.5010687478252225, + "grad_norm": 0.48084208369255066, + "learning_rate": 0.00017143108745033811, + "loss": 0.2133, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 5040, + "tokens_per_second_per_gpu": 281.78 + }, + { + "epoch": 0.5020629318486852, + "grad_norm": 0.28656521439552307, + "learning_rate": 0.0001713211640718418, + "loss": 0.246, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 5050, + "tokens_per_second_per_gpu": 334.53 + }, + { + "epoch": 0.503057115872148, + "grad_norm": 0.48624783754348755, + "learning_rate": 0.0001712110650142443, + "loss": 0.2497, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.65, + "memory/max_allocated (GiB)": 20.65, + "step": 5060, + "tokens_per_second_per_gpu": 397.02 + }, + { + "epoch": 0.5040512998956107, + "grad_norm": 0.3334798216819763, + "learning_rate": 0.00017110079054874288, + "loss": 0.2366, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 5070, + "tokens_per_second_per_gpu": 371.25 + }, + { + "epoch": 0.5050454839190734, + "grad_norm": 0.4432489275932312, + "learning_rate": 0.00017099034094696685, + "loss": 0.2104, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 5080, + "tokens_per_second_per_gpu": 384.5 + }, + { + "epoch": 0.5060396679425362, + "grad_norm": 0.5348425507545471, + "learning_rate": 0.00017087971648097693, + "loss": 0.2292, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 5090, + "tokens_per_second_per_gpu": 311.48 + }, + { + "epoch": 0.5070338519659989, + "grad_norm": 0.4587235748767853, + "learning_rate": 0.00017076891742326452, + "loss": 0.2297, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 5100, + "tokens_per_second_per_gpu": 371.52 + }, + { + "epoch": 0.5080280359894617, + "grad_norm": 0.3222121298313141, + "learning_rate": 0.00017065794404675112, + "loss": 0.2447, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 5110, + "tokens_per_second_per_gpu": 346.06 + }, + { + "epoch": 0.5090222200129244, + "grad_norm": 0.38898178935050964, + "learning_rate": 0.0001705467966247877, + "loss": 0.192, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 5120, + "tokens_per_second_per_gpu": 388.93 + }, + { + "epoch": 0.5100164040363871, + "grad_norm": 0.49159225821495056, + "learning_rate": 0.00017043547543115373, + "loss": 0.2604, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 5130, + "tokens_per_second_per_gpu": 348.77 + }, + { + "epoch": 0.5110105880598499, + "grad_norm": 0.34262073040008545, + "learning_rate": 0.0001703239807400569, + "loss": 0.1843, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 5140, + "tokens_per_second_per_gpu": 361.68 + }, + { + "epoch": 0.5120047720833126, + "grad_norm": 0.5016794800758362, + "learning_rate": 0.00017021231282613223, + "loss": 0.2527, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 5150, + "tokens_per_second_per_gpu": 385.36 + }, + { + "epoch": 0.5129989561067754, + "grad_norm": 0.44959282875061035, + "learning_rate": 0.00017010047196444137, + "loss": 0.2349, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 5160, + "tokens_per_second_per_gpu": 319.58 + }, + { + "epoch": 0.5139931401302381, + "grad_norm": 0.27737611532211304, + "learning_rate": 0.00016998845843047193, + "loss": 0.2564, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 5170, + "tokens_per_second_per_gpu": 325.49 + }, + { + "epoch": 0.5149873241537009, + "grad_norm": 0.5175918340682983, + "learning_rate": 0.00016987627250013702, + "loss": 0.2375, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 5180, + "tokens_per_second_per_gpu": 334.38 + }, + { + "epoch": 0.5159815081771636, + "grad_norm": 0.5541722178459167, + "learning_rate": 0.00016976391444977425, + "loss": 0.2478, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.81, + "memory/max_allocated (GiB)": 19.81, + "step": 5190, + "tokens_per_second_per_gpu": 316.46 + }, + { + "epoch": 0.5169756922006263, + "grad_norm": 1.0252865552902222, + "learning_rate": 0.00016965138455614525, + "loss": 0.2371, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 5200, + "tokens_per_second_per_gpu": 339.79 + }, + { + "epoch": 0.5179698762240891, + "grad_norm": 0.582073450088501, + "learning_rate": 0.00016953868309643491, + "loss": 0.2311, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 5210, + "tokens_per_second_per_gpu": 359.2 + }, + { + "epoch": 0.5189640602475518, + "grad_norm": 0.5147042870521545, + "learning_rate": 0.0001694258103482508, + "loss": 0.2693, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 5220, + "tokens_per_second_per_gpu": 379.55 + }, + { + "epoch": 0.5199582442710146, + "grad_norm": 0.6816319823265076, + "learning_rate": 0.0001693127665896223, + "loss": 0.2228, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 5230, + "tokens_per_second_per_gpu": 296.31 + }, + { + "epoch": 0.5209524282944773, + "grad_norm": 0.765450656414032, + "learning_rate": 0.00016919955209900012, + "loss": 0.2764, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 5240, + "tokens_per_second_per_gpu": 389.55 + }, + { + "epoch": 0.52194661231794, + "grad_norm": 0.3426934778690338, + "learning_rate": 0.00016908616715525544, + "loss": 0.3197, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 5250, + "tokens_per_second_per_gpu": 404.47 + }, + { + "epoch": 0.5229407963414028, + "grad_norm": 0.45074665546417236, + "learning_rate": 0.0001689726120376794, + "loss": 0.2624, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 5260, + "tokens_per_second_per_gpu": 362.3 + }, + { + "epoch": 0.5239349803648655, + "grad_norm": 0.4062357246875763, + "learning_rate": 0.00016885888702598218, + "loss": 0.2068, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.52, + "memory/max_allocated (GiB)": 20.52, + "step": 5270, + "tokens_per_second_per_gpu": 383.64 + }, + { + "epoch": 0.5249291643883283, + "grad_norm": 0.4385395050048828, + "learning_rate": 0.00016874499240029253, + "loss": 0.2886, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 5280, + "tokens_per_second_per_gpu": 332.1 + }, + { + "epoch": 0.525923348411791, + "grad_norm": 0.4379644989967346, + "learning_rate": 0.0001686309284411571, + "loss": 0.2305, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.08, + "memory/max_allocated (GiB)": 19.08, + "step": 5290, + "tokens_per_second_per_gpu": 328.75 + }, + { + "epoch": 0.5269175324352537, + "grad_norm": 0.5357609987258911, + "learning_rate": 0.00016851669542953935, + "loss": 0.2526, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 5300, + "tokens_per_second_per_gpu": 396.9 + }, + { + "epoch": 0.5279117164587165, + "grad_norm": 0.5790386199951172, + "learning_rate": 0.00016840229364681948, + "loss": 0.193, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 5310, + "tokens_per_second_per_gpu": 313.5 + }, + { + "epoch": 0.5289059004821792, + "grad_norm": 0.4063149690628052, + "learning_rate": 0.00016828772337479318, + "loss": 0.2071, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 5320, + "tokens_per_second_per_gpu": 290.65 + }, + { + "epoch": 0.529900084505642, + "grad_norm": 0.3840952515602112, + "learning_rate": 0.00016817298489567127, + "loss": 0.2086, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 5330, + "tokens_per_second_per_gpu": 372.36 + }, + { + "epoch": 0.5308942685291047, + "grad_norm": 0.36974212527275085, + "learning_rate": 0.0001680580784920789, + "loss": 0.2556, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 5340, + "tokens_per_second_per_gpu": 369.63 + }, + { + "epoch": 0.5318884525525674, + "grad_norm": 0.5495437979698181, + "learning_rate": 0.00016794300444705477, + "loss": 0.2667, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.77, + "memory/max_allocated (GiB)": 19.77, + "step": 5350, + "tokens_per_second_per_gpu": 327.36 + }, + { + "epoch": 0.5328826365760302, + "grad_norm": 0.5131349563598633, + "learning_rate": 0.0001678277630440506, + "loss": 0.2311, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 5360, + "tokens_per_second_per_gpu": 357.39 + }, + { + "epoch": 0.5338768205994929, + "grad_norm": 0.28221410512924194, + "learning_rate": 0.00016771235456693035, + "loss": 0.2532, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 5370, + "tokens_per_second_per_gpu": 385.44 + }, + { + "epoch": 0.5348710046229557, + "grad_norm": 0.5543202757835388, + "learning_rate": 0.0001675967792999695, + "loss": 0.2999, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.19, + "memory/max_allocated (GiB)": 19.19, + "step": 5380, + "tokens_per_second_per_gpu": 393.99 + }, + { + "epoch": 0.5358651886464184, + "grad_norm": 0.3077482283115387, + "learning_rate": 0.00016748103752785426, + "loss": 0.2071, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 5390, + "tokens_per_second_per_gpu": 372.33 + }, + { + "epoch": 0.5368593726698812, + "grad_norm": 0.6371411681175232, + "learning_rate": 0.00016736512953568117, + "loss": 0.1986, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.43, + "memory/max_allocated (GiB)": 20.43, + "step": 5400, + "tokens_per_second_per_gpu": 325.81 + }, + { + "epoch": 0.5378535566933439, + "grad_norm": 0.4577232003211975, + "learning_rate": 0.0001672490556089561, + "loss": 0.253, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 5410, + "tokens_per_second_per_gpu": 388.89 + }, + { + "epoch": 0.5388477407168066, + "grad_norm": 0.2831481099128723, + "learning_rate": 0.00016713281603359366, + "loss": 0.1994, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 5420, + "tokens_per_second_per_gpu": 373.61 + }, + { + "epoch": 0.5398419247402694, + "grad_norm": 0.45899850130081177, + "learning_rate": 0.00016701641109591648, + "loss": 0.1997, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 5430, + "tokens_per_second_per_gpu": 386.1 + }, + { + "epoch": 0.5408361087637321, + "grad_norm": 0.2898833453655243, + "learning_rate": 0.0001668998410826545, + "loss": 0.276, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 5440, + "tokens_per_second_per_gpu": 374.49 + }, + { + "epoch": 0.5418302927871949, + "grad_norm": 0.37996944785118103, + "learning_rate": 0.00016678310628094438, + "loss": 0.2529, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 5450, + "tokens_per_second_per_gpu": 359.7 + }, + { + "epoch": 0.5428244768106577, + "grad_norm": 0.255087673664093, + "learning_rate": 0.0001666662069783285, + "loss": 0.2682, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 5460, + "tokens_per_second_per_gpu": 346.66 + }, + { + "epoch": 0.5438186608341204, + "grad_norm": 0.4574085474014282, + "learning_rate": 0.00016654914346275466, + "loss": 0.2516, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 5470, + "tokens_per_second_per_gpu": 350.42 + }, + { + "epoch": 0.5448128448575832, + "grad_norm": 0.5208483934402466, + "learning_rate": 0.00016643191602257496, + "loss": 0.2549, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 5480, + "tokens_per_second_per_gpu": 320.05 + }, + { + "epoch": 0.5458070288810459, + "grad_norm": 0.31626808643341064, + "learning_rate": 0.00016631452494654541, + "loss": 0.2151, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 5490, + "tokens_per_second_per_gpu": 335.31 + }, + { + "epoch": 0.5468012129045087, + "grad_norm": 0.28286507725715637, + "learning_rate": 0.000166196970523825, + "loss": 0.2847, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 5500, + "tokens_per_second_per_gpu": 406.77 + }, + { + "epoch": 0.5477953969279714, + "grad_norm": 0.3647201955318451, + "learning_rate": 0.00016607925304397517, + "loss": 0.1912, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 5510, + "tokens_per_second_per_gpu": 334.31 + }, + { + "epoch": 0.5487895809514342, + "grad_norm": 0.48184719681739807, + "learning_rate": 0.0001659613727969589, + "loss": 0.2336, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 5520, + "tokens_per_second_per_gpu": 355.21 + }, + { + "epoch": 0.5497837649748969, + "grad_norm": 0.5385854244232178, + "learning_rate": 0.00016584333007314017, + "loss": 0.2764, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 5530, + "tokens_per_second_per_gpu": 384.64 + }, + { + "epoch": 0.5507779489983596, + "grad_norm": 0.2823385000228882, + "learning_rate": 0.00016572512516328317, + "loss": 0.3002, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.82, + "memory/max_allocated (GiB)": 18.82, + "step": 5540, + "tokens_per_second_per_gpu": 417.12 + }, + { + "epoch": 0.5517721330218224, + "grad_norm": 0.4133371114730835, + "learning_rate": 0.0001656067583585516, + "loss": 0.2066, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 5550, + "tokens_per_second_per_gpu": 314.99 + }, + { + "epoch": 0.5527663170452851, + "grad_norm": 0.2247128188610077, + "learning_rate": 0.00016548822995050787, + "loss": 0.2582, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 5560, + "tokens_per_second_per_gpu": 341.74 + }, + { + "epoch": 0.5537605010687479, + "grad_norm": 0.5088145136833191, + "learning_rate": 0.0001653695402311125, + "loss": 0.3169, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 5570, + "tokens_per_second_per_gpu": 385.48 + }, + { + "epoch": 0.5547546850922106, + "grad_norm": 0.600646436214447, + "learning_rate": 0.0001652506894927234, + "loss": 0.3391, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 5580, + "tokens_per_second_per_gpu": 338.0 + }, + { + "epoch": 0.5557488691156733, + "grad_norm": 0.34408292174339294, + "learning_rate": 0.00016513167802809502, + "loss": 0.2003, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 5590, + "tokens_per_second_per_gpu": 341.35 + }, + { + "epoch": 0.5567430531391361, + "grad_norm": 0.44201189279556274, + "learning_rate": 0.0001650125061303778, + "loss": 0.2494, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 5600, + "tokens_per_second_per_gpu": 386.01 + }, + { + "epoch": 0.5577372371625988, + "grad_norm": 0.5484967827796936, + "learning_rate": 0.00016489317409311717, + "loss": 0.2881, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 5610, + "tokens_per_second_per_gpu": 327.21 + }, + { + "epoch": 0.5587314211860616, + "grad_norm": 0.45674219727516174, + "learning_rate": 0.00016477368221025333, + "loss": 0.2377, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 5620, + "tokens_per_second_per_gpu": 365.25 + }, + { + "epoch": 0.5597256052095243, + "grad_norm": 0.3383237421512604, + "learning_rate": 0.00016465403077612001, + "loss": 0.2251, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 5630, + "tokens_per_second_per_gpu": 357.99 + }, + { + "epoch": 0.560719789232987, + "grad_norm": 0.3748398423194885, + "learning_rate": 0.00016453422008544388, + "loss": 0.2279, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 5640, + "tokens_per_second_per_gpu": 335.12 + }, + { + "epoch": 0.5617139732564498, + "grad_norm": 0.4504337012767792, + "learning_rate": 0.00016441425043334413, + "loss": 0.261, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.44, + "memory/max_allocated (GiB)": 20.44, + "step": 5650, + "tokens_per_second_per_gpu": 336.53 + }, + { + "epoch": 0.5627081572799125, + "grad_norm": 0.3367341458797455, + "learning_rate": 0.00016429412211533127, + "loss": 0.1855, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 5660, + "tokens_per_second_per_gpu": 347.1 + }, + { + "epoch": 0.5637023413033753, + "grad_norm": 0.3390282988548279, + "learning_rate": 0.00016417383542730675, + "loss": 0.2428, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 5670, + "tokens_per_second_per_gpu": 385.32 + }, + { + "epoch": 0.564696525326838, + "grad_norm": 0.6450189352035522, + "learning_rate": 0.00016405339066556212, + "loss": 0.3651, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 5680, + "tokens_per_second_per_gpu": 427.22 + }, + { + "epoch": 0.5656907093503007, + "grad_norm": 0.5655054450035095, + "learning_rate": 0.0001639327881267783, + "loss": 0.2204, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 5690, + "tokens_per_second_per_gpu": 310.84 + }, + { + "epoch": 0.5666848933737635, + "grad_norm": 0.4713475704193115, + "learning_rate": 0.00016381202810802483, + "loss": 0.2294, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.47, + "memory/max_allocated (GiB)": 21.47, + "step": 5700, + "tokens_per_second_per_gpu": 363.21 + }, + { + "epoch": 0.5676790773972262, + "grad_norm": 0.19377703964710236, + "learning_rate": 0.00016369111090675916, + "loss": 0.2522, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 5710, + "tokens_per_second_per_gpu": 412.63 + }, + { + "epoch": 0.568673261420689, + "grad_norm": 0.5475621819496155, + "learning_rate": 0.0001635700368208259, + "loss": 0.3132, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 5720, + "tokens_per_second_per_gpu": 367.47 + }, + { + "epoch": 0.5696674454441517, + "grad_norm": 0.3860287368297577, + "learning_rate": 0.00016344880614845608, + "loss": 0.2623, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 5730, + "tokens_per_second_per_gpu": 381.97 + }, + { + "epoch": 0.5706616294676145, + "grad_norm": 0.3809770345687866, + "learning_rate": 0.00016332741918826654, + "loss": 0.2365, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 5740, + "tokens_per_second_per_gpu": 326.73 + }, + { + "epoch": 0.5716558134910772, + "grad_norm": 0.5006048083305359, + "learning_rate": 0.00016320587623925895, + "loss": 0.2661, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 5750, + "tokens_per_second_per_gpu": 334.44 + }, + { + "epoch": 0.5726499975145399, + "grad_norm": 0.2004530131816864, + "learning_rate": 0.00016308417760081936, + "loss": 0.1923, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 5760, + "tokens_per_second_per_gpu": 377.56 + }, + { + "epoch": 0.5736441815380027, + "grad_norm": 0.30563804507255554, + "learning_rate": 0.00016296232357271718, + "loss": 0.2089, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 5770, + "tokens_per_second_per_gpu": 351.29 + }, + { + "epoch": 0.5746383655614654, + "grad_norm": 0.3142051100730896, + "learning_rate": 0.00016284031445510465, + "loss": 0.1931, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 5780, + "tokens_per_second_per_gpu": 327.41 + }, + { + "epoch": 0.5756325495849282, + "grad_norm": 0.4678092300891876, + "learning_rate": 0.000162718150548516, + "loss": 0.3011, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 5790, + "tokens_per_second_per_gpu": 367.12 + }, + { + "epoch": 0.5766267336083909, + "grad_norm": 0.44859957695007324, + "learning_rate": 0.00016259583215386675, + "loss": 0.2855, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 5800, + "tokens_per_second_per_gpu": 377.51 + }, + { + "epoch": 0.5776209176318536, + "grad_norm": 0.3087036609649658, + "learning_rate": 0.00016247335957245303, + "loss": 0.2181, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.21, + "memory/max_allocated (GiB)": 18.21, + "step": 5810, + "tokens_per_second_per_gpu": 292.23 + }, + { + "epoch": 0.5786151016553164, + "grad_norm": 0.29946303367614746, + "learning_rate": 0.00016235073310595058, + "loss": 0.203, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 5820, + "tokens_per_second_per_gpu": 294.26 + }, + { + "epoch": 0.5796092856787791, + "grad_norm": 0.41978803277015686, + "learning_rate": 0.0001622279530564144, + "loss": 0.2186, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 5830, + "tokens_per_second_per_gpu": 332.7 + }, + { + "epoch": 0.5806034697022419, + "grad_norm": 0.5330750942230225, + "learning_rate": 0.00016210501972627764, + "loss": 0.2726, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 5840, + "tokens_per_second_per_gpu": 372.14 + }, + { + "epoch": 0.5815976537257046, + "grad_norm": 0.4443431794643402, + "learning_rate": 0.0001619819334183511, + "loss": 0.2121, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 5850, + "tokens_per_second_per_gpu": 329.51 + }, + { + "epoch": 0.5825918377491673, + "grad_norm": 0.4602350890636444, + "learning_rate": 0.00016185869443582237, + "loss": 0.2845, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.85, + "memory/max_allocated (GiB)": 18.85, + "step": 5860, + "tokens_per_second_per_gpu": 334.35 + }, + { + "epoch": 0.5835860217726301, + "grad_norm": 0.2751925587654114, + "learning_rate": 0.00016173530308225513, + "loss": 0.1812, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 5870, + "tokens_per_second_per_gpu": 397.13 + }, + { + "epoch": 0.5845802057960928, + "grad_norm": 0.5334510803222656, + "learning_rate": 0.00016161175966158834, + "loss": 0.2088, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.53, + "memory/max_allocated (GiB)": 20.53, + "step": 5880, + "tokens_per_second_per_gpu": 297.42 + }, + { + "epoch": 0.5855743898195556, + "grad_norm": 0.36908817291259766, + "learning_rate": 0.00016148806447813553, + "loss": 0.2197, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 5890, + "tokens_per_second_per_gpu": 294.32 + }, + { + "epoch": 0.5865685738430183, + "grad_norm": 0.24384719133377075, + "learning_rate": 0.00016136421783658416, + "loss": 0.2757, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.81, + "memory/max_allocated (GiB)": 18.81, + "step": 5900, + "tokens_per_second_per_gpu": 315.54 + }, + { + "epoch": 0.587562757866481, + "grad_norm": 0.41709259152412415, + "learning_rate": 0.0001612402200419946, + "loss": 0.2858, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 5910, + "tokens_per_second_per_gpu": 385.92 + }, + { + "epoch": 0.5885569418899438, + "grad_norm": 0.5187587738037109, + "learning_rate": 0.00016111607139979967, + "loss": 0.2347, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.64, + "memory/max_allocated (GiB)": 20.64, + "step": 5920, + "tokens_per_second_per_gpu": 377.8 + }, + { + "epoch": 0.5895511259134065, + "grad_norm": 0.29502683877944946, + "learning_rate": 0.00016099177221580373, + "loss": 0.2563, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.84, + "memory/max_allocated (GiB)": 18.84, + "step": 5930, + "tokens_per_second_per_gpu": 335.51 + }, + { + "epoch": 0.5905453099368693, + "grad_norm": 0.360385000705719, + "learning_rate": 0.00016086732279618188, + "loss": 0.1982, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 5940, + "tokens_per_second_per_gpu": 315.07 + }, + { + "epoch": 0.591539493960332, + "grad_norm": 0.35591524839401245, + "learning_rate": 0.0001607427234474794, + "loss": 0.2535, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 5950, + "tokens_per_second_per_gpu": 379.0 + }, + { + "epoch": 0.5925336779837947, + "grad_norm": 0.49772289395332336, + "learning_rate": 0.0001606179744766108, + "loss": 0.3105, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 5960, + "tokens_per_second_per_gpu": 403.62 + }, + { + "epoch": 0.5935278620072575, + "grad_norm": 0.3001823425292969, + "learning_rate": 0.00016049307619085915, + "loss": 0.2011, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.44, + "memory/max_allocated (GiB)": 20.44, + "step": 5970, + "tokens_per_second_per_gpu": 289.33 + }, + { + "epoch": 0.5945220460307203, + "grad_norm": 0.593662679195404, + "learning_rate": 0.00016036802889787536, + "loss": 0.2728, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 5980, + "tokens_per_second_per_gpu": 397.95 + }, + { + "epoch": 0.5955162300541831, + "grad_norm": 0.38041049242019653, + "learning_rate": 0.00016024283290567732, + "loss": 0.2016, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 5990, + "tokens_per_second_per_gpu": 310.64 + }, + { + "epoch": 0.5965104140776458, + "grad_norm": 0.28381025791168213, + "learning_rate": 0.0001601174885226492, + "loss": 0.2, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 6000, + "tokens_per_second_per_gpu": 368.07 + }, + { + "epoch": 0.5975045981011086, + "grad_norm": 0.4913434088230133, + "learning_rate": 0.0001599919960575407, + "loss": 0.2653, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 6010, + "tokens_per_second_per_gpu": 365.97 + }, + { + "epoch": 0.5984987821245713, + "grad_norm": 0.4141637086868286, + "learning_rate": 0.00015986635581946638, + "loss": 0.2687, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 6020, + "tokens_per_second_per_gpu": 363.69 + }, + { + "epoch": 0.599492966148034, + "grad_norm": 0.45988166332244873, + "learning_rate": 0.00015974056811790462, + "loss": 0.2625, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 6030, + "tokens_per_second_per_gpu": 402.4 + }, + { + "epoch": 0.6004871501714968, + "grad_norm": 0.5327023267745972, + "learning_rate": 0.0001596146332626971, + "loss": 0.2462, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.53, + "memory/max_allocated (GiB)": 20.53, + "step": 6040, + "tokens_per_second_per_gpu": 382.41 + }, + { + "epoch": 0.6014813341949595, + "grad_norm": 0.2608936131000519, + "learning_rate": 0.00015948855156404802, + "loss": 0.2171, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 6050, + "tokens_per_second_per_gpu": 369.97 + }, + { + "epoch": 0.6024755182184223, + "grad_norm": 0.5496955513954163, + "learning_rate": 0.00015936232333252327, + "loss": 0.3213, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 6060, + "tokens_per_second_per_gpu": 365.39 + }, + { + "epoch": 0.603469702241885, + "grad_norm": 0.40054014325141907, + "learning_rate": 0.00015923594887904964, + "loss": 0.1949, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 6070, + "tokens_per_second_per_gpu": 293.13 + }, + { + "epoch": 0.6044638862653477, + "grad_norm": 0.7320500612258911, + "learning_rate": 0.0001591094285149141, + "loss": 0.2662, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 6080, + "tokens_per_second_per_gpu": 336.97 + }, + { + "epoch": 0.6054580702888105, + "grad_norm": 0.484331876039505, + "learning_rate": 0.00015898276255176303, + "loss": 0.2487, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 6090, + "tokens_per_second_per_gpu": 359.19 + }, + { + "epoch": 0.6064522543122732, + "grad_norm": 0.4304184019565582, + "learning_rate": 0.00015885595130160155, + "loss": 0.2546, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 6100, + "tokens_per_second_per_gpu": 329.56 + }, + { + "epoch": 0.607446438335736, + "grad_norm": 0.49904513359069824, + "learning_rate": 0.00015872899507679252, + "loss": 0.2622, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 6110, + "tokens_per_second_per_gpu": 381.18 + }, + { + "epoch": 0.6084406223591987, + "grad_norm": 0.525607168674469, + "learning_rate": 0.00015860189419005595, + "loss": 0.2424, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.17, + "memory/max_allocated (GiB)": 18.17, + "step": 6120, + "tokens_per_second_per_gpu": 331.0 + }, + { + "epoch": 0.6094348063826615, + "grad_norm": 0.3618375360965729, + "learning_rate": 0.0001584746489544682, + "loss": 0.3099, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 6130, + "tokens_per_second_per_gpu": 435.27 + }, + { + "epoch": 0.6104289904061242, + "grad_norm": 0.41309383511543274, + "learning_rate": 0.00015834725968346116, + "loss": 0.2337, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 6140, + "tokens_per_second_per_gpu": 381.0 + }, + { + "epoch": 0.6114231744295869, + "grad_norm": 0.37885645031929016, + "learning_rate": 0.00015821972669082156, + "loss": 0.3318, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 6150, + "tokens_per_second_per_gpu": 365.41 + }, + { + "epoch": 0.6124173584530497, + "grad_norm": 0.4314253032207489, + "learning_rate": 0.0001580920502906901, + "loss": 0.2213, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 6160, + "tokens_per_second_per_gpu": 343.04 + }, + { + "epoch": 0.6134115424765124, + "grad_norm": 0.5435411930084229, + "learning_rate": 0.00015796423079756074, + "loss": 0.243, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.77, + "memory/max_allocated (GiB)": 20.77, + "step": 6170, + "tokens_per_second_per_gpu": 365.75 + }, + { + "epoch": 0.6144057264999752, + "grad_norm": 0.31716790795326233, + "learning_rate": 0.00015783626852627992, + "loss": 0.2484, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 6180, + "tokens_per_second_per_gpu": 435.76 + }, + { + "epoch": 0.6153999105234379, + "grad_norm": 0.3328079879283905, + "learning_rate": 0.0001577081637920457, + "loss": 0.223, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 6190, + "tokens_per_second_per_gpu": 310.86 + }, + { + "epoch": 0.6163940945469006, + "grad_norm": 0.3827805519104004, + "learning_rate": 0.00015757991691040722, + "loss": 0.2311, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 6200, + "tokens_per_second_per_gpu": 333.74 + }, + { + "epoch": 0.6173882785703634, + "grad_norm": 0.3756648004055023, + "learning_rate": 0.00015745152819726356, + "loss": 0.229, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 6210, + "tokens_per_second_per_gpu": 352.29 + }, + { + "epoch": 0.6183824625938261, + "grad_norm": 0.4345835745334625, + "learning_rate": 0.0001573229979688633, + "loss": 0.2694, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 6220, + "tokens_per_second_per_gpu": 411.14 + }, + { + "epoch": 0.6193766466172889, + "grad_norm": 0.23183055222034454, + "learning_rate": 0.00015719432654180357, + "loss": 0.1925, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 6230, + "tokens_per_second_per_gpu": 363.23 + }, + { + "epoch": 0.6203708306407516, + "grad_norm": 0.2867846190929413, + "learning_rate": 0.00015706551423302925, + "loss": 0.1506, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 6240, + "tokens_per_second_per_gpu": 275.12 + }, + { + "epoch": 0.6213650146642143, + "grad_norm": 0.42951786518096924, + "learning_rate": 0.00015693656135983233, + "loss": 0.2867, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 6250, + "tokens_per_second_per_gpu": 383.43 + }, + { + "epoch": 0.6223591986876771, + "grad_norm": 0.5975165963172913, + "learning_rate": 0.00015680746823985094, + "loss": 0.263, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 6260, + "tokens_per_second_per_gpu": 304.9 + }, + { + "epoch": 0.6233533827111398, + "grad_norm": 0.48659709095954895, + "learning_rate": 0.00015667823519106873, + "loss": 0.2079, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 6270, + "tokens_per_second_per_gpu": 340.54 + }, + { + "epoch": 0.6243475667346026, + "grad_norm": 0.461224764585495, + "learning_rate": 0.00015654886253181402, + "loss": 0.2537, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 6280, + "tokens_per_second_per_gpu": 321.0 + }, + { + "epoch": 0.6253417507580653, + "grad_norm": 0.36294886469841003, + "learning_rate": 0.00015641935058075904, + "loss": 0.2009, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 6290, + "tokens_per_second_per_gpu": 308.81 + }, + { + "epoch": 0.626335934781528, + "grad_norm": 0.4274291396141052, + "learning_rate": 0.0001562896996569191, + "loss": 0.2343, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 6300, + "tokens_per_second_per_gpu": 319.11 + }, + { + "epoch": 0.6273301188049908, + "grad_norm": 0.47336748242378235, + "learning_rate": 0.00015615991007965176, + "loss": 0.211, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 6310, + "tokens_per_second_per_gpu": 355.64 + }, + { + "epoch": 0.6283243028284535, + "grad_norm": 0.4946894347667694, + "learning_rate": 0.00015602998216865624, + "loss": 0.2492, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 6320, + "tokens_per_second_per_gpu": 360.15 + }, + { + "epoch": 0.6293184868519163, + "grad_norm": 0.38327473402023315, + "learning_rate": 0.00015589991624397244, + "loss": 0.2308, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.61, + "memory/max_allocated (GiB)": 20.61, + "step": 6330, + "tokens_per_second_per_gpu": 301.77 + }, + { + "epoch": 0.630312670875379, + "grad_norm": 0.46532171964645386, + "learning_rate": 0.00015576971262598024, + "loss": 0.2812, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 6340, + "tokens_per_second_per_gpu": 344.05 + }, + { + "epoch": 0.6313068548988418, + "grad_norm": 0.3749048709869385, + "learning_rate": 0.00015563937163539862, + "loss": 0.2415, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 6350, + "tokens_per_second_per_gpu": 348.41 + }, + { + "epoch": 0.6323010389223045, + "grad_norm": 0.2943509519100189, + "learning_rate": 0.000155508893593285, + "loss": 0.2228, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.77, + "memory/max_allocated (GiB)": 20.77, + "step": 6360, + "tokens_per_second_per_gpu": 288.63 + }, + { + "epoch": 0.6332952229457672, + "grad_norm": 0.5494127869606018, + "learning_rate": 0.00015537827882103442, + "loss": 0.2499, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 6370, + "tokens_per_second_per_gpu": 356.07 + }, + { + "epoch": 0.63428940696923, + "grad_norm": 0.48988381028175354, + "learning_rate": 0.0001552475276403786, + "loss": 0.2142, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 6380, + "tokens_per_second_per_gpu": 343.8 + }, + { + "epoch": 0.6352835909926927, + "grad_norm": 0.3422715365886688, + "learning_rate": 0.00015511664037338538, + "loss": 0.2364, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 6390, + "tokens_per_second_per_gpu": 409.18 + }, + { + "epoch": 0.6362777750161555, + "grad_norm": 0.6021102070808411, + "learning_rate": 0.00015498561734245776, + "loss": 0.2392, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 6400, + "tokens_per_second_per_gpu": 306.59 + }, + { + "epoch": 0.6372719590396182, + "grad_norm": 0.6073122620582581, + "learning_rate": 0.00015485445887033317, + "loss": 0.2798, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 6410, + "tokens_per_second_per_gpu": 342.93 + }, + { + "epoch": 0.6382661430630809, + "grad_norm": 0.3407362401485443, + "learning_rate": 0.0001547231652800826, + "loss": 0.2477, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 6420, + "tokens_per_second_per_gpu": 367.05 + }, + { + "epoch": 0.6392603270865437, + "grad_norm": 0.506592869758606, + "learning_rate": 0.00015459173689510994, + "loss": 0.2399, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 6430, + "tokens_per_second_per_gpu": 412.24 + }, + { + "epoch": 0.6402545111100064, + "grad_norm": 0.5419439077377319, + "learning_rate": 0.0001544601740391511, + "loss": 0.1948, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.53, + "memory/max_allocated (GiB)": 21.53, + "step": 6440, + "tokens_per_second_per_gpu": 329.23 + }, + { + "epoch": 0.6412486951334692, + "grad_norm": 0.48251059651374817, + "learning_rate": 0.00015432847703627316, + "loss": 0.2146, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 6450, + "tokens_per_second_per_gpu": 317.65 + }, + { + "epoch": 0.6422428791569319, + "grad_norm": 0.22626249492168427, + "learning_rate": 0.0001541966462108737, + "loss": 0.2593, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 6460, + "tokens_per_second_per_gpu": 331.91 + }, + { + "epoch": 0.6432370631803946, + "grad_norm": 0.5113493204116821, + "learning_rate": 0.0001540646818876799, + "loss": 0.2162, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 6470, + "tokens_per_second_per_gpu": 311.21 + }, + { + "epoch": 0.6442312472038574, + "grad_norm": 0.3097884953022003, + "learning_rate": 0.0001539325843917478, + "loss": 0.1879, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 6480, + "tokens_per_second_per_gpu": 320.21 + }, + { + "epoch": 0.6452254312273201, + "grad_norm": 0.32837173342704773, + "learning_rate": 0.0001538003540484614, + "loss": 0.217, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.68, + "memory/max_allocated (GiB)": 19.68, + "step": 6490, + "tokens_per_second_per_gpu": 351.68 + }, + { + "epoch": 0.646219615250783, + "grad_norm": 0.519063413143158, + "learning_rate": 0.00015366799118353202, + "loss": 0.2531, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 6500, + "tokens_per_second_per_gpu": 379.3 + }, + { + "epoch": 0.6472137992742457, + "grad_norm": 0.3581913113594055, + "learning_rate": 0.0001535354961229974, + "loss": 0.291, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.53, + "memory/max_allocated (GiB)": 21.53, + "step": 6510, + "tokens_per_second_per_gpu": 389.02 + }, + { + "epoch": 0.6482079832977085, + "grad_norm": 0.3630671799182892, + "learning_rate": 0.0001534028691932208, + "loss": 0.2409, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 6520, + "tokens_per_second_per_gpu": 313.47 + }, + { + "epoch": 0.6492021673211712, + "grad_norm": 0.3670892119407654, + "learning_rate": 0.00015327011072089044, + "loss": 0.2133, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 6530, + "tokens_per_second_per_gpu": 290.75 + }, + { + "epoch": 0.6501963513446339, + "grad_norm": 0.40198561549186707, + "learning_rate": 0.00015313722103301852, + "loss": 0.27, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.85, + "memory/max_allocated (GiB)": 18.85, + "step": 6540, + "tokens_per_second_per_gpu": 429.99 + }, + { + "epoch": 0.6511905353680967, + "grad_norm": 0.3494684398174286, + "learning_rate": 0.00015300420045694034, + "loss": 0.1676, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 6550, + "tokens_per_second_per_gpu": 368.89 + }, + { + "epoch": 0.6521847193915594, + "grad_norm": 0.42560404539108276, + "learning_rate": 0.00015287104932031374, + "loss": 0.2585, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 6560, + "tokens_per_second_per_gpu": 317.83 + }, + { + "epoch": 0.6531789034150222, + "grad_norm": 0.511513352394104, + "learning_rate": 0.00015273776795111813, + "loss": 0.2129, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 6570, + "tokens_per_second_per_gpu": 327.26 + }, + { + "epoch": 0.6541730874384849, + "grad_norm": 0.3022279441356659, + "learning_rate": 0.00015260435667765364, + "loss": 0.2674, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 6580, + "tokens_per_second_per_gpu": 332.65 + }, + { + "epoch": 0.6551672714619476, + "grad_norm": 0.3808051347732544, + "learning_rate": 0.00015247081582854053, + "loss": 0.2512, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 6590, + "tokens_per_second_per_gpu": 379.8 + }, + { + "epoch": 0.6561614554854104, + "grad_norm": 0.4839475154876709, + "learning_rate": 0.00015233714573271802, + "loss": 0.2376, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 6600, + "tokens_per_second_per_gpu": 329.54 + }, + { + "epoch": 0.6571556395088731, + "grad_norm": 0.7145663499832153, + "learning_rate": 0.0001522033467194439, + "loss": 0.2289, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 6610, + "tokens_per_second_per_gpu": 327.02 + }, + { + "epoch": 0.6581498235323359, + "grad_norm": 0.4483419358730316, + "learning_rate": 0.00015206941911829336, + "loss": 0.2619, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 6620, + "tokens_per_second_per_gpu": 344.26 + }, + { + "epoch": 0.6591440075557986, + "grad_norm": 0.7042835354804993, + "learning_rate": 0.00015193536325915842, + "loss": 0.3162, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 6630, + "tokens_per_second_per_gpu": 325.39 + }, + { + "epoch": 0.6601381915792613, + "grad_norm": 0.44085246324539185, + "learning_rate": 0.00015180117947224698, + "loss": 0.1955, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 6640, + "tokens_per_second_per_gpu": 356.35 + }, + { + "epoch": 0.6611323756027241, + "grad_norm": 0.32135269045829773, + "learning_rate": 0.00015166686808808208, + "loss": 0.2302, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 6650, + "tokens_per_second_per_gpu": 401.75 + }, + { + "epoch": 0.6621265596261868, + "grad_norm": 0.5171180367469788, + "learning_rate": 0.00015153242943750103, + "loss": 0.251, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 6660, + "tokens_per_second_per_gpu": 328.4 + }, + { + "epoch": 0.6631207436496496, + "grad_norm": 0.5205950140953064, + "learning_rate": 0.00015139786385165462, + "loss": 0.2186, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 6670, + "tokens_per_second_per_gpu": 325.18 + }, + { + "epoch": 0.6641149276731123, + "grad_norm": 0.31780245900154114, + "learning_rate": 0.0001512631716620064, + "loss": 0.1604, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 6680, + "tokens_per_second_per_gpu": 319.84 + }, + { + "epoch": 0.665109111696575, + "grad_norm": 0.29278233647346497, + "learning_rate": 0.00015112835320033163, + "loss": 0.266, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 6690, + "tokens_per_second_per_gpu": 408.19 + }, + { + "epoch": 0.6661032957200378, + "grad_norm": 0.47382065653800964, + "learning_rate": 0.00015099340879871668, + "loss": 0.1933, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 6700, + "tokens_per_second_per_gpu": 302.45 + }, + { + "epoch": 0.6670974797435005, + "grad_norm": 0.3947311043739319, + "learning_rate": 0.00015085833878955823, + "loss": 0.2225, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.37, + "memory/max_allocated (GiB)": 21.37, + "step": 6710, + "tokens_per_second_per_gpu": 418.2 + }, + { + "epoch": 0.6680916637669633, + "grad_norm": 0.5490260720252991, + "learning_rate": 0.00015072314350556213, + "loss": 0.2056, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 6720, + "tokens_per_second_per_gpu": 319.35 + }, + { + "epoch": 0.669085847790426, + "grad_norm": 0.412194162607193, + "learning_rate": 0.000150587823279743, + "loss": 0.2377, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 6730, + "tokens_per_second_per_gpu": 304.73 + }, + { + "epoch": 0.6700800318138888, + "grad_norm": 0.40393805503845215, + "learning_rate": 0.00015045237844542317, + "loss": 0.2622, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 6740, + "tokens_per_second_per_gpu": 335.77 + }, + { + "epoch": 0.6710742158373515, + "grad_norm": 0.5896100401878357, + "learning_rate": 0.00015031680933623188, + "loss": 0.3129, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 6750, + "tokens_per_second_per_gpu": 304.37 + }, + { + "epoch": 0.6720683998608142, + "grad_norm": 0.5198945999145508, + "learning_rate": 0.00015018111628610446, + "loss": 0.2704, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 6760, + "tokens_per_second_per_gpu": 380.02 + }, + { + "epoch": 0.673062583884277, + "grad_norm": 0.32067760825157166, + "learning_rate": 0.00015004529962928164, + "loss": 0.2495, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 6770, + "tokens_per_second_per_gpu": 369.15 + }, + { + "epoch": 0.6740567679077397, + "grad_norm": 0.49704423546791077, + "learning_rate": 0.0001499093597003085, + "loss": 0.2095, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.17, + "memory/max_allocated (GiB)": 18.17, + "step": 6780, + "tokens_per_second_per_gpu": 324.06 + }, + { + "epoch": 0.6750509519312025, + "grad_norm": 0.42155733704566956, + "learning_rate": 0.00014977329683403385, + "loss": 0.1743, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 6790, + "tokens_per_second_per_gpu": 309.09 + }, + { + "epoch": 0.6760451359546652, + "grad_norm": 0.5538848638534546, + "learning_rate": 0.00014963711136560924, + "loss": 0.3424, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 6800, + "tokens_per_second_per_gpu": 347.86 + }, + { + "epoch": 0.6770393199781279, + "grad_norm": 0.3429434299468994, + "learning_rate": 0.00014950080363048833, + "loss": 0.2047, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 6810, + "tokens_per_second_per_gpu": 341.24 + }, + { + "epoch": 0.6780335040015907, + "grad_norm": 0.39259403944015503, + "learning_rate": 0.0001493643739644258, + "loss": 0.187, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 6820, + "tokens_per_second_per_gpu": 367.18 + }, + { + "epoch": 0.6790276880250534, + "grad_norm": 0.37642526626586914, + "learning_rate": 0.00014922782270347686, + "loss": 0.236, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 6830, + "tokens_per_second_per_gpu": 324.51 + }, + { + "epoch": 0.6800218720485162, + "grad_norm": 0.5826324820518494, + "learning_rate": 0.00014909115018399603, + "loss": 0.2494, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.68, + "memory/max_allocated (GiB)": 19.68, + "step": 6840, + "tokens_per_second_per_gpu": 343.26 + }, + { + "epoch": 0.6810160560719789, + "grad_norm": 0.39206662774086, + "learning_rate": 0.00014895435674263662, + "loss": 0.2522, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 6850, + "tokens_per_second_per_gpu": 330.16 + }, + { + "epoch": 0.6820102400954416, + "grad_norm": 0.21635837852954865, + "learning_rate": 0.00014881744271634986, + "loss": 0.2534, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 6860, + "tokens_per_second_per_gpu": 319.9 + }, + { + "epoch": 0.6830044241189044, + "grad_norm": 0.25813058018684387, + "learning_rate": 0.00014868040844238386, + "loss": 0.2255, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.39, + "memory/max_allocated (GiB)": 21.39, + "step": 6870, + "tokens_per_second_per_gpu": 352.96 + }, + { + "epoch": 0.6839986081423671, + "grad_norm": 0.46098119020462036, + "learning_rate": 0.00014854325425828305, + "loss": 0.2135, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 6880, + "tokens_per_second_per_gpu": 278.19 + }, + { + "epoch": 0.6849927921658299, + "grad_norm": 0.45799604058265686, + "learning_rate": 0.00014840598050188715, + "loss": 0.2283, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 6890, + "tokens_per_second_per_gpu": 358.38 + }, + { + "epoch": 0.6859869761892926, + "grad_norm": 0.6016408205032349, + "learning_rate": 0.00014826858751133042, + "loss": 0.2261, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 6900, + "tokens_per_second_per_gpu": 368.61 + }, + { + "epoch": 0.6869811602127553, + "grad_norm": 0.488506555557251, + "learning_rate": 0.00014813107562504084, + "loss": 0.2799, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 6910, + "tokens_per_second_per_gpu": 367.4 + }, + { + "epoch": 0.6879753442362181, + "grad_norm": 0.6327788829803467, + "learning_rate": 0.00014799344518173928, + "loss": 0.1868, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 6920, + "tokens_per_second_per_gpu": 300.97 + }, + { + "epoch": 0.6889695282596808, + "grad_norm": 0.4955579340457916, + "learning_rate": 0.00014785569652043856, + "loss": 0.2496, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 6930, + "tokens_per_second_per_gpu": 343.13 + }, + { + "epoch": 0.6899637122831436, + "grad_norm": 0.5724585652351379, + "learning_rate": 0.0001477178299804428, + "loss": 0.2611, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.82, + "memory/max_allocated (GiB)": 18.82, + "step": 6940, + "tokens_per_second_per_gpu": 326.77 + }, + { + "epoch": 0.6909578963066063, + "grad_norm": 0.2057613730430603, + "learning_rate": 0.00014757984590134642, + "loss": 0.1107, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 6950, + "tokens_per_second_per_gpu": 293.99 + }, + { + "epoch": 0.691952080330069, + "grad_norm": 0.3206622004508972, + "learning_rate": 0.00014744174462303334, + "loss": 0.2379, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 6960, + "tokens_per_second_per_gpu": 371.29 + }, + { + "epoch": 0.6929462643535318, + "grad_norm": 0.3926986753940582, + "learning_rate": 0.00014730352648567623, + "loss": 0.2558, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 6970, + "tokens_per_second_per_gpu": 377.71 + }, + { + "epoch": 0.6939404483769945, + "grad_norm": 0.34591636061668396, + "learning_rate": 0.00014716519182973552, + "loss": 0.2601, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 6980, + "tokens_per_second_per_gpu": 352.12 + }, + { + "epoch": 0.6949346324004573, + "grad_norm": 0.5908513069152832, + "learning_rate": 0.00014702674099595876, + "loss": 0.2027, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 6990, + "tokens_per_second_per_gpu": 345.62 + }, + { + "epoch": 0.69592881642392, + "grad_norm": 0.3830493986606598, + "learning_rate": 0.00014688817432537962, + "loss": 0.1987, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 7000, + "tokens_per_second_per_gpu": 359.97 + }, + { + "epoch": 0.6969230004473828, + "grad_norm": 0.444762647151947, + "learning_rate": 0.00014674949215931707, + "loss": 0.2059, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 7010, + "tokens_per_second_per_gpu": 289.86 + }, + { + "epoch": 0.6979171844708456, + "grad_norm": 0.31576088070869446, + "learning_rate": 0.00014661069483937458, + "loss": 0.2115, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 7020, + "tokens_per_second_per_gpu": 339.22 + }, + { + "epoch": 0.6989113684943083, + "grad_norm": 0.4755282700061798, + "learning_rate": 0.00014647178270743932, + "loss": 0.265, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 7030, + "tokens_per_second_per_gpu": 329.94 + }, + { + "epoch": 0.6999055525177711, + "grad_norm": 0.4698229134082794, + "learning_rate": 0.00014633275610568123, + "loss": 0.2492, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 7040, + "tokens_per_second_per_gpu": 315.54 + }, + { + "epoch": 0.7008997365412338, + "grad_norm": 0.3248315453529358, + "learning_rate": 0.00014619361537655215, + "loss": 0.2412, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.4, + "memory/max_allocated (GiB)": 21.4, + "step": 7050, + "tokens_per_second_per_gpu": 294.7 + }, + { + "epoch": 0.7018939205646966, + "grad_norm": 0.48639553785324097, + "learning_rate": 0.0001460543608627852, + "loss": 0.2356, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 7060, + "tokens_per_second_per_gpu": 280.17 + }, + { + "epoch": 0.7028881045881593, + "grad_norm": 0.5937051773071289, + "learning_rate": 0.00014591499290739362, + "loss": 0.1679, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 7070, + "tokens_per_second_per_gpu": 336.7 + }, + { + "epoch": 0.703882288611622, + "grad_norm": 0.3488394021987915, + "learning_rate": 0.00014577551185367013, + "loss": 0.2474, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 7080, + "tokens_per_second_per_gpu": 400.34 + }, + { + "epoch": 0.7048764726350848, + "grad_norm": 0.4485851526260376, + "learning_rate": 0.0001456359180451861, + "loss": 0.2709, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 7090, + "tokens_per_second_per_gpu": 392.29 + }, + { + "epoch": 0.7058706566585475, + "grad_norm": 0.4746951758861542, + "learning_rate": 0.00014549621182579055, + "loss": 0.2353, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 7100, + "tokens_per_second_per_gpu": 330.73 + }, + { + "epoch": 0.7068648406820103, + "grad_norm": 0.5027205944061279, + "learning_rate": 0.00014535639353960942, + "loss": 0.2576, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 7110, + "tokens_per_second_per_gpu": 379.65 + }, + { + "epoch": 0.707859024705473, + "grad_norm": 0.449788361787796, + "learning_rate": 0.00014521646353104472, + "loss": 0.2186, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 7120, + "tokens_per_second_per_gpu": 269.74 + }, + { + "epoch": 0.7088532087289358, + "grad_norm": 0.31661751866340637, + "learning_rate": 0.00014507642214477362, + "loss": 0.2481, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 7130, + "tokens_per_second_per_gpu": 302.22 + }, + { + "epoch": 0.7098473927523985, + "grad_norm": 0.3295125663280487, + "learning_rate": 0.00014493626972574765, + "loss": 0.2284, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 7140, + "tokens_per_second_per_gpu": 350.96 + }, + { + "epoch": 0.7108415767758612, + "grad_norm": 0.5383651256561279, + "learning_rate": 0.0001447960066191919, + "loss": 0.2427, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 7150, + "tokens_per_second_per_gpu": 345.67 + }, + { + "epoch": 0.711835760799324, + "grad_norm": 0.3970474898815155, + "learning_rate": 0.00014465563317060394, + "loss": 0.2434, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 7160, + "tokens_per_second_per_gpu": 374.3 + }, + { + "epoch": 0.7128299448227867, + "grad_norm": 0.16766348481178284, + "learning_rate": 0.00014451514972575332, + "loss": 0.1649, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 7170, + "tokens_per_second_per_gpu": 304.14 + }, + { + "epoch": 0.7138241288462495, + "grad_norm": 0.4426742196083069, + "learning_rate": 0.00014437455663068042, + "loss": 0.2633, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 7180, + "tokens_per_second_per_gpu": 374.18 + }, + { + "epoch": 0.7148183128697122, + "grad_norm": 0.4757481515407562, + "learning_rate": 0.00014423385423169575, + "loss": 0.2584, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 7190, + "tokens_per_second_per_gpu": 302.88 + }, + { + "epoch": 0.7158124968931749, + "grad_norm": 0.4964188039302826, + "learning_rate": 0.00014409304287537906, + "loss": 0.2386, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 7200, + "tokens_per_second_per_gpu": 325.69 + }, + { + "epoch": 0.7168066809166377, + "grad_norm": 0.5026222467422485, + "learning_rate": 0.0001439521229085785, + "loss": 0.2161, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 7210, + "tokens_per_second_per_gpu": 332.92 + }, + { + "epoch": 0.7178008649401004, + "grad_norm": 0.41850724816322327, + "learning_rate": 0.00014381109467840976, + "loss": 0.2157, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 7220, + "tokens_per_second_per_gpu": 328.35 + }, + { + "epoch": 0.7187950489635632, + "grad_norm": 0.3922070264816284, + "learning_rate": 0.00014366995853225514, + "loss": 0.2112, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 7230, + "tokens_per_second_per_gpu": 263.36 + }, + { + "epoch": 0.7197892329870259, + "grad_norm": 0.5679214000701904, + "learning_rate": 0.0001435287148177628, + "loss": 0.2715, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7240, + "tokens_per_second_per_gpu": 434.11 + }, + { + "epoch": 0.7207834170104886, + "grad_norm": 0.5302831530570984, + "learning_rate": 0.0001433873638828458, + "loss": 0.252, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 7250, + "tokens_per_second_per_gpu": 358.75 + }, + { + "epoch": 0.7217776010339514, + "grad_norm": 0.49475687742233276, + "learning_rate": 0.00014324590607568149, + "loss": 0.2613, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 7260, + "tokens_per_second_per_gpu": 377.3 + }, + { + "epoch": 0.7227717850574141, + "grad_norm": 0.4263441264629364, + "learning_rate": 0.00014310434174471024, + "loss": 0.288, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7270, + "tokens_per_second_per_gpu": 378.27 + }, + { + "epoch": 0.7237659690808769, + "grad_norm": 0.4663153886795044, + "learning_rate": 0.000142962671238635, + "loss": 0.2572, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 7280, + "tokens_per_second_per_gpu": 390.5 + }, + { + "epoch": 0.7247601531043396, + "grad_norm": 0.3563691973686218, + "learning_rate": 0.0001428208949064201, + "loss": 0.2024, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 7290, + "tokens_per_second_per_gpu": 330.99 + }, + { + "epoch": 0.7257543371278024, + "grad_norm": 0.2805791199207306, + "learning_rate": 0.00014267901309729066, + "loss": 0.2371, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 7300, + "tokens_per_second_per_gpu": 410.4 + }, + { + "epoch": 0.7267485211512651, + "grad_norm": 0.30967897176742554, + "learning_rate": 0.00014253702616073155, + "loss": 0.231, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 7310, + "tokens_per_second_per_gpu": 302.94 + }, + { + "epoch": 0.7277427051747278, + "grad_norm": 0.353834867477417, + "learning_rate": 0.00014239493444648658, + "loss": 0.1885, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 7320, + "tokens_per_second_per_gpu": 296.75 + }, + { + "epoch": 0.7287368891981906, + "grad_norm": 0.32480210065841675, + "learning_rate": 0.00014225273830455773, + "loss": 0.2713, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7330, + "tokens_per_second_per_gpu": 389.69 + }, + { + "epoch": 0.7297310732216533, + "grad_norm": 0.6818671226501465, + "learning_rate": 0.00014211043808520405, + "loss": 0.3248, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 7340, + "tokens_per_second_per_gpu": 355.32 + }, + { + "epoch": 0.730725257245116, + "grad_norm": 0.5786187648773193, + "learning_rate": 0.0001419680341389412, + "loss": 0.2262, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7350, + "tokens_per_second_per_gpu": 321.29 + }, + { + "epoch": 0.7317194412685788, + "grad_norm": 0.5133084058761597, + "learning_rate": 0.0001418255268165401, + "loss": 0.2653, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 7360, + "tokens_per_second_per_gpu": 348.43 + }, + { + "epoch": 0.7327136252920415, + "grad_norm": 0.4247760474681854, + "learning_rate": 0.0001416829164690264, + "loss": 0.2312, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 7370, + "tokens_per_second_per_gpu": 360.14 + }, + { + "epoch": 0.7337078093155043, + "grad_norm": 0.32232165336608887, + "learning_rate": 0.00014154020344767955, + "loss": 0.2825, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 7380, + "tokens_per_second_per_gpu": 379.26 + }, + { + "epoch": 0.734701993338967, + "grad_norm": 0.4452918767929077, + "learning_rate": 0.0001413973881040319, + "loss": 0.2205, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7390, + "tokens_per_second_per_gpu": 359.25 + }, + { + "epoch": 0.7356961773624298, + "grad_norm": 0.3855791985988617, + "learning_rate": 0.0001412544707898678, + "loss": 0.2868, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 7400, + "tokens_per_second_per_gpu": 410.81 + }, + { + "epoch": 0.7366903613858925, + "grad_norm": 0.42609113454818726, + "learning_rate": 0.00014111145185722283, + "loss": 0.2523, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 7410, + "tokens_per_second_per_gpu": 331.0 + }, + { + "epoch": 0.7376845454093552, + "grad_norm": 0.47836732864379883, + "learning_rate": 0.00014096833165838283, + "loss": 0.2962, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.37, + "memory/max_allocated (GiB)": 21.37, + "step": 7420, + "tokens_per_second_per_gpu": 344.4 + }, + { + "epoch": 0.738678729432818, + "grad_norm": 0.508818507194519, + "learning_rate": 0.0001408251105458831, + "loss": 0.3254, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7430, + "tokens_per_second_per_gpu": 389.2 + }, + { + "epoch": 0.7396729134562807, + "grad_norm": 0.3887844681739807, + "learning_rate": 0.00014068178887250752, + "loss": 0.2353, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 7440, + "tokens_per_second_per_gpu": 380.02 + }, + { + "epoch": 0.7406670974797435, + "grad_norm": 0.41547468304634094, + "learning_rate": 0.00014053836699128765, + "loss": 0.2424, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7450, + "tokens_per_second_per_gpu": 376.26 + }, + { + "epoch": 0.7416612815032062, + "grad_norm": 0.5015019178390503, + "learning_rate": 0.00014039484525550186, + "loss": 0.2329, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 7460, + "tokens_per_second_per_gpu": 351.38 + }, + { + "epoch": 0.742655465526669, + "grad_norm": 0.43546929955482483, + "learning_rate": 0.0001402512240186746, + "loss": 0.2186, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 7470, + "tokens_per_second_per_gpu": 303.33 + }, + { + "epoch": 0.7436496495501317, + "grad_norm": 0.5051418542861938, + "learning_rate": 0.0001401075036345753, + "loss": 0.2439, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.19, + "memory/max_allocated (GiB)": 19.19, + "step": 7480, + "tokens_per_second_per_gpu": 375.31 + }, + { + "epoch": 0.7446438335735944, + "grad_norm": 0.35766085982322693, + "learning_rate": 0.0001399636844572176, + "loss": 0.277, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 7490, + "tokens_per_second_per_gpu": 343.63 + }, + { + "epoch": 0.7456380175970572, + "grad_norm": 0.5930467247962952, + "learning_rate": 0.0001398197668408586, + "loss": 0.2474, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 7500, + "tokens_per_second_per_gpu": 396.18 + }, + { + "epoch": 0.7466322016205199, + "grad_norm": 0.4920576810836792, + "learning_rate": 0.00013967575113999777, + "loss": 0.2408, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 7510, + "tokens_per_second_per_gpu": 382.95 + }, + { + "epoch": 0.7476263856439826, + "grad_norm": 0.44312262535095215, + "learning_rate": 0.0001395316377093762, + "loss": 0.2249, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 7520, + "tokens_per_second_per_gpu": 387.85 + }, + { + "epoch": 0.7486205696674454, + "grad_norm": 0.4043440818786621, + "learning_rate": 0.00013938742690397575, + "loss": 0.2141, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 7530, + "tokens_per_second_per_gpu": 360.87 + }, + { + "epoch": 0.7496147536909082, + "grad_norm": 0.3910767138004303, + "learning_rate": 0.00013924311907901813, + "loss": 0.1528, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 7540, + "tokens_per_second_per_gpu": 308.96 + }, + { + "epoch": 0.750608937714371, + "grad_norm": 0.3407839238643646, + "learning_rate": 0.00013909871458996399, + "loss": 0.2192, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 7550, + "tokens_per_second_per_gpu": 316.74 + }, + { + "epoch": 0.7516031217378337, + "grad_norm": 0.316240519285202, + "learning_rate": 0.00013895421379251207, + "loss": 0.2317, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 7560, + "tokens_per_second_per_gpu": 332.89 + }, + { + "epoch": 0.7525973057612965, + "grad_norm": 0.49255135655403137, + "learning_rate": 0.00013880961704259846, + "loss": 0.2413, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7570, + "tokens_per_second_per_gpu": 352.94 + }, + { + "epoch": 0.7535914897847592, + "grad_norm": 0.4979618489742279, + "learning_rate": 0.0001386649246963955, + "loss": 0.2434, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 7580, + "tokens_per_second_per_gpu": 307.59 + }, + { + "epoch": 0.754585673808222, + "grad_norm": 0.2949107885360718, + "learning_rate": 0.00013852013711031095, + "loss": 0.2112, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 7590, + "tokens_per_second_per_gpu": 363.41 + }, + { + "epoch": 0.7555798578316847, + "grad_norm": 0.3708727955818176, + "learning_rate": 0.0001383752546409873, + "loss": 0.2232, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 7600, + "tokens_per_second_per_gpu": 386.01 + }, + { + "epoch": 0.7565740418551474, + "grad_norm": 0.6432907581329346, + "learning_rate": 0.00013823027764530067, + "loss": 0.2707, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 7610, + "tokens_per_second_per_gpu": 313.62 + }, + { + "epoch": 0.7575682258786102, + "grad_norm": 0.3710288405418396, + "learning_rate": 0.00013808520648036005, + "loss": 0.2353, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 7620, + "tokens_per_second_per_gpu": 432.88 + }, + { + "epoch": 0.7585624099020729, + "grad_norm": 0.3577297031879425, + "learning_rate": 0.00013794004150350636, + "loss": 0.212, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 7630, + "tokens_per_second_per_gpu": 323.27 + }, + { + "epoch": 0.7595565939255357, + "grad_norm": 0.4883553385734558, + "learning_rate": 0.00013779478307231164, + "loss": 0.2747, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 7640, + "tokens_per_second_per_gpu": 292.24 + }, + { + "epoch": 0.7605507779489984, + "grad_norm": 0.19372917711734772, + "learning_rate": 0.00013764943154457812, + "loss": 0.233, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 7650, + "tokens_per_second_per_gpu": 375.4 + }, + { + "epoch": 0.7615449619724611, + "grad_norm": 0.46450668573379517, + "learning_rate": 0.00013750398727833735, + "loss": 0.219, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 7660, + "tokens_per_second_per_gpu": 351.49 + }, + { + "epoch": 0.7625391459959239, + "grad_norm": 0.3964915871620178, + "learning_rate": 0.00013735845063184921, + "loss": 0.2376, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7670, + "tokens_per_second_per_gpu": 309.02 + }, + { + "epoch": 0.7635333300193866, + "grad_norm": 0.6207079887390137, + "learning_rate": 0.00013721282196360127, + "loss": 0.2547, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 7680, + "tokens_per_second_per_gpu": 338.05 + }, + { + "epoch": 0.7645275140428494, + "grad_norm": 0.2084685117006302, + "learning_rate": 0.00013706710163230773, + "loss": 0.2504, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 7690, + "tokens_per_second_per_gpu": 294.32 + }, + { + "epoch": 0.7655216980663121, + "grad_norm": 0.4136933386325836, + "learning_rate": 0.0001369212899969086, + "loss": 0.1809, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7700, + "tokens_per_second_per_gpu": 314.54 + }, + { + "epoch": 0.7665158820897748, + "grad_norm": 0.529629111289978, + "learning_rate": 0.0001367753874165687, + "loss": 0.255, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7710, + "tokens_per_second_per_gpu": 357.08 + }, + { + "epoch": 0.7675100661132376, + "grad_norm": 0.36684682965278625, + "learning_rate": 0.0001366293942506769, + "loss": 0.2128, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 7720, + "tokens_per_second_per_gpu": 318.04 + }, + { + "epoch": 0.7685042501367003, + "grad_norm": 0.40612316131591797, + "learning_rate": 0.00013648331085884527, + "loss": 0.2159, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 7730, + "tokens_per_second_per_gpu": 364.09 + }, + { + "epoch": 0.7694984341601631, + "grad_norm": 0.13119497895240784, + "learning_rate": 0.0001363371376009081, + "loss": 0.2255, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 7740, + "tokens_per_second_per_gpu": 336.24 + }, + { + "epoch": 0.7704926181836258, + "grad_norm": 0.5006715655326843, + "learning_rate": 0.00013619087483692099, + "loss": 0.2595, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7750, + "tokens_per_second_per_gpu": 338.82 + }, + { + "epoch": 0.7714868022070885, + "grad_norm": 0.3994678258895874, + "learning_rate": 0.00013604452292716003, + "loss": 0.203, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 7760, + "tokens_per_second_per_gpu": 327.9 + }, + { + "epoch": 0.7724809862305513, + "grad_norm": 0.17447052896022797, + "learning_rate": 0.00013589808223212087, + "loss": 0.2537, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 7770, + "tokens_per_second_per_gpu": 376.09 + }, + { + "epoch": 0.773475170254014, + "grad_norm": 0.5262983441352844, + "learning_rate": 0.000135751553112518, + "loss": 0.2112, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 7780, + "tokens_per_second_per_gpu": 338.13 + }, + { + "epoch": 0.7744693542774768, + "grad_norm": 0.32633262872695923, + "learning_rate": 0.00013560493592928356, + "loss": 0.2235, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 7790, + "tokens_per_second_per_gpu": 331.86 + }, + { + "epoch": 0.7754635383009395, + "grad_norm": 0.4296337068080902, + "learning_rate": 0.00013545823104356663, + "loss": 0.297, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7800, + "tokens_per_second_per_gpu": 370.8 + }, + { + "epoch": 0.7764577223244022, + "grad_norm": 0.5057851672172546, + "learning_rate": 0.00013531143881673237, + "loss": 0.1952, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 7810, + "tokens_per_second_per_gpu": 321.52 + }, + { + "epoch": 0.777451906347865, + "grad_norm": 0.49617013335227966, + "learning_rate": 0.00013516455961036104, + "loss": 0.2589, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 7820, + "tokens_per_second_per_gpu": 299.57 + }, + { + "epoch": 0.7784460903713277, + "grad_norm": 0.3173094689846039, + "learning_rate": 0.00013501759378624722, + "loss": 0.2328, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 7830, + "tokens_per_second_per_gpu": 349.07 + }, + { + "epoch": 0.7794402743947905, + "grad_norm": 0.4631012976169586, + "learning_rate": 0.00013487054170639877, + "loss": 0.2472, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 7840, + "tokens_per_second_per_gpu": 361.13 + }, + { + "epoch": 0.7804344584182532, + "grad_norm": 0.3672430217266083, + "learning_rate": 0.000134723403733036, + "loss": 0.2183, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 7850, + "tokens_per_second_per_gpu": 324.51 + }, + { + "epoch": 0.781428642441716, + "grad_norm": 0.5141401886940002, + "learning_rate": 0.00013457618022859092, + "loss": 0.3104, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.47, + "memory/max_allocated (GiB)": 21.47, + "step": 7860, + "tokens_per_second_per_gpu": 384.61 + }, + { + "epoch": 0.7824228264651787, + "grad_norm": 0.43661215901374817, + "learning_rate": 0.00013442887155570607, + "loss": 0.2228, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.53, + "memory/max_allocated (GiB)": 20.53, + "step": 7870, + "tokens_per_second_per_gpu": 359.28 + }, + { + "epoch": 0.7834170104886414, + "grad_norm": 0.375987708568573, + "learning_rate": 0.00013428147807723387, + "loss": 0.2215, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 7880, + "tokens_per_second_per_gpu": 327.47 + }, + { + "epoch": 0.7844111945121042, + "grad_norm": 0.2800423204898834, + "learning_rate": 0.00013413400015623562, + "loss": 0.263, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 7890, + "tokens_per_second_per_gpu": 367.65 + }, + { + "epoch": 0.7854053785355669, + "grad_norm": 0.44610151648521423, + "learning_rate": 0.00013398643815598063, + "loss": 0.2533, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 7900, + "tokens_per_second_per_gpu": 364.29 + }, + { + "epoch": 0.7863995625590297, + "grad_norm": 0.6232859492301941, + "learning_rate": 0.0001338387924399452, + "loss": 0.2273, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 7910, + "tokens_per_second_per_gpu": 335.13 + }, + { + "epoch": 0.7873937465824924, + "grad_norm": 0.3155955374240875, + "learning_rate": 0.00013369106337181202, + "loss": 0.2007, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 7920, + "tokens_per_second_per_gpu": 357.92 + }, + { + "epoch": 0.7883879306059551, + "grad_norm": 0.47753843665122986, + "learning_rate": 0.00013354325131546902, + "loss": 0.1722, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7930, + "tokens_per_second_per_gpu": 350.28 + }, + { + "epoch": 0.7893821146294179, + "grad_norm": 0.6098092198371887, + "learning_rate": 0.0001333953566350085, + "loss": 0.2172, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 7940, + "tokens_per_second_per_gpu": 328.34 + }, + { + "epoch": 0.7903762986528806, + "grad_norm": 0.40892454981803894, + "learning_rate": 0.00013324737969472628, + "loss": 0.2365, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 7950, + "tokens_per_second_per_gpu": 376.03 + }, + { + "epoch": 0.7913704826763434, + "grad_norm": 0.6622501015663147, + "learning_rate": 0.00013309932085912092, + "loss": 0.265, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 7960, + "tokens_per_second_per_gpu": 401.88 + }, + { + "epoch": 0.7923646666998061, + "grad_norm": 0.5111701488494873, + "learning_rate": 0.00013295118049289255, + "loss": 0.2164, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 7970, + "tokens_per_second_per_gpu": 359.69 + }, + { + "epoch": 0.7933588507232688, + "grad_norm": 0.5144445300102234, + "learning_rate": 0.00013280295896094224, + "loss": 0.2567, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 7980, + "tokens_per_second_per_gpu": 301.89 + }, + { + "epoch": 0.7943530347467316, + "grad_norm": 0.5570478439331055, + "learning_rate": 0.00013265465662837093, + "loss": 0.1934, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 7990, + "tokens_per_second_per_gpu": 336.55 + }, + { + "epoch": 0.7953472187701943, + "grad_norm": 0.33518052101135254, + "learning_rate": 0.00013250627386047866, + "loss": 0.2247, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 8000, + "tokens_per_second_per_gpu": 335.15 + }, + { + "epoch": 0.7963414027936571, + "grad_norm": 0.508229672908783, + "learning_rate": 0.0001323578110227635, + "loss": 0.1587, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 8010, + "tokens_per_second_per_gpu": 351.3 + }, + { + "epoch": 0.7973355868171198, + "grad_norm": 0.3688840866088867, + "learning_rate": 0.0001322092684809208, + "loss": 0.1929, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 8020, + "tokens_per_second_per_gpu": 289.68 + }, + { + "epoch": 0.7983297708405825, + "grad_norm": 0.31160444021224976, + "learning_rate": 0.00013206064660084227, + "loss": 0.2318, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 8030, + "tokens_per_second_per_gpu": 351.62 + }, + { + "epoch": 0.7993239548640453, + "grad_norm": 0.46203359961509705, + "learning_rate": 0.000131911945748615, + "loss": 0.2808, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 8040, + "tokens_per_second_per_gpu": 360.06 + }, + { + "epoch": 0.800318138887508, + "grad_norm": 0.4416126608848572, + "learning_rate": 0.00013176316629052054, + "loss": 0.2065, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 8050, + "tokens_per_second_per_gpu": 312.23 + }, + { + "epoch": 0.8013123229109709, + "grad_norm": 0.16777446866035461, + "learning_rate": 0.00013161430859303427, + "loss": 0.1713, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 8060, + "tokens_per_second_per_gpu": 347.86 + }, + { + "epoch": 0.8023065069344336, + "grad_norm": 0.3447447121143341, + "learning_rate": 0.0001314653730228241, + "loss": 0.1954, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 8070, + "tokens_per_second_per_gpu": 372.63 + }, + { + "epoch": 0.8033006909578964, + "grad_norm": 0.4045270085334778, + "learning_rate": 0.0001313163599467498, + "loss": 0.1929, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 8080, + "tokens_per_second_per_gpu": 355.89 + }, + { + "epoch": 0.8042948749813591, + "grad_norm": 0.462365984916687, + "learning_rate": 0.00013116726973186208, + "loss": 0.2551, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.45, + "memory/max_allocated (GiB)": 20.45, + "step": 8090, + "tokens_per_second_per_gpu": 361.83 + }, + { + "epoch": 0.8052890590048218, + "grad_norm": 0.5786636471748352, + "learning_rate": 0.00013101810274540168, + "loss": 0.2499, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 8100, + "tokens_per_second_per_gpu": 300.27 + }, + { + "epoch": 0.8062832430282846, + "grad_norm": 0.3487481474876404, + "learning_rate": 0.0001308688593547984, + "loss": 0.2186, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 17.11, + "memory/max_allocated (GiB)": 17.11, + "step": 8110, + "tokens_per_second_per_gpu": 312.52 + }, + { + "epoch": 0.8072774270517473, + "grad_norm": 0.3610248863697052, + "learning_rate": 0.00013071953992767015, + "loss": 0.2167, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 8120, + "tokens_per_second_per_gpu": 328.39 + }, + { + "epoch": 0.8082716110752101, + "grad_norm": 0.37153443694114685, + "learning_rate": 0.00013057014483182242, + "loss": 0.241, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 8130, + "tokens_per_second_per_gpu": 344.6 + }, + { + "epoch": 0.8092657950986728, + "grad_norm": 0.3705120086669922, + "learning_rate": 0.00013042067443524681, + "loss": 0.2749, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 8140, + "tokens_per_second_per_gpu": 332.87 + }, + { + "epoch": 0.8102599791221355, + "grad_norm": 0.3014324903488159, + "learning_rate": 0.00013027112910612052, + "loss": 0.1438, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 8150, + "tokens_per_second_per_gpu": 308.05 + }, + { + "epoch": 0.8112541631455983, + "grad_norm": 0.5011573433876038, + "learning_rate": 0.00013012150921280527, + "loss": 0.2032, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 8160, + "tokens_per_second_per_gpu": 296.89 + }, + { + "epoch": 0.812248347169061, + "grad_norm": 0.3287215530872345, + "learning_rate": 0.00012997181512384653, + "loss": 0.2055, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 8170, + "tokens_per_second_per_gpu": 405.75 + }, + { + "epoch": 0.8132425311925238, + "grad_norm": 0.7308348417282104, + "learning_rate": 0.00012982204720797245, + "loss": 0.2805, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 8180, + "tokens_per_second_per_gpu": 443.15 + }, + { + "epoch": 0.8142367152159865, + "grad_norm": 0.5808447599411011, + "learning_rate": 0.00012967220583409304, + "loss": 0.2066, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 8190, + "tokens_per_second_per_gpu": 401.43 + }, + { + "epoch": 0.8152308992394492, + "grad_norm": 0.31127744913101196, + "learning_rate": 0.0001295222913712993, + "loss": 0.2464, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 8200, + "tokens_per_second_per_gpu": 343.59 + }, + { + "epoch": 0.816225083262912, + "grad_norm": 0.2803351581096649, + "learning_rate": 0.00012937230418886224, + "loss": 0.1986, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 8210, + "tokens_per_second_per_gpu": 401.58 + }, + { + "epoch": 0.8172192672863747, + "grad_norm": 0.45312055945396423, + "learning_rate": 0.000129222244656232, + "loss": 0.1904, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.37, + "memory/max_allocated (GiB)": 21.37, + "step": 8220, + "tokens_per_second_per_gpu": 290.19 + }, + { + "epoch": 0.8182134513098375, + "grad_norm": 0.4169121980667114, + "learning_rate": 0.0001290721131430369, + "loss": 0.2074, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 8230, + "tokens_per_second_per_gpu": 320.72 + }, + { + "epoch": 0.8192076353333002, + "grad_norm": 0.5725305080413818, + "learning_rate": 0.0001289219100190826, + "loss": 0.2809, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 8240, + "tokens_per_second_per_gpu": 334.56 + }, + { + "epoch": 0.820201819356763, + "grad_norm": 0.4698241055011749, + "learning_rate": 0.00012877163565435114, + "loss": 0.1873, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.66, + "memory/max_allocated (GiB)": 19.66, + "step": 8250, + "tokens_per_second_per_gpu": 319.13 + }, + { + "epoch": 0.8211960033802257, + "grad_norm": 0.33453720808029175, + "learning_rate": 0.000128621290419, + "loss": 0.2172, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 8260, + "tokens_per_second_per_gpu": 306.65 + }, + { + "epoch": 0.8221901874036884, + "grad_norm": 0.20304298400878906, + "learning_rate": 0.00012847087468336135, + "loss": 0.2102, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 8270, + "tokens_per_second_per_gpu": 311.33 + }, + { + "epoch": 0.8231843714271512, + "grad_norm": 0.27367445826530457, + "learning_rate": 0.00012832038881794086, + "loss": 0.2437, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 8280, + "tokens_per_second_per_gpu": 417.4 + }, + { + "epoch": 0.8241785554506139, + "grad_norm": 0.3914351463317871, + "learning_rate": 0.00012816983319341712, + "loss": 0.2692, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 8290, + "tokens_per_second_per_gpu": 342.88 + }, + { + "epoch": 0.8251727394740767, + "grad_norm": 0.5103800296783447, + "learning_rate": 0.00012801920818064034, + "loss": 0.2341, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 8300, + "tokens_per_second_per_gpu": 362.66 + }, + { + "epoch": 0.8261669234975394, + "grad_norm": 0.1988827884197235, + "learning_rate": 0.00012786851415063185, + "loss": 0.2141, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.37, + "memory/max_allocated (GiB)": 21.37, + "step": 8310, + "tokens_per_second_per_gpu": 404.61 + }, + { + "epoch": 0.8271611075210021, + "grad_norm": 0.482526570558548, + "learning_rate": 0.00012771775147458288, + "loss": 0.2341, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 8320, + "tokens_per_second_per_gpu": 366.94 + }, + { + "epoch": 0.8281552915444649, + "grad_norm": 0.5179364085197449, + "learning_rate": 0.0001275669205238537, + "loss": 0.2458, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 8330, + "tokens_per_second_per_gpu": 308.79 + }, + { + "epoch": 0.8291494755679276, + "grad_norm": 0.4961225390434265, + "learning_rate": 0.00012741602166997288, + "loss": 0.2324, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 8340, + "tokens_per_second_per_gpu": 312.63 + }, + { + "epoch": 0.8301436595913904, + "grad_norm": 0.6281317472457886, + "learning_rate": 0.0001272650552846362, + "loss": 0.2808, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 8350, + "tokens_per_second_per_gpu": 365.38 + }, + { + "epoch": 0.8311378436148531, + "grad_norm": 0.3268338739871979, + "learning_rate": 0.00012711402173970574, + "loss": 0.2125, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 8360, + "tokens_per_second_per_gpu": 381.45 + }, + { + "epoch": 0.8321320276383158, + "grad_norm": 0.5050214529037476, + "learning_rate": 0.00012696292140720907, + "loss": 0.3039, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.67, + "memory/max_allocated (GiB)": 19.67, + "step": 8370, + "tokens_per_second_per_gpu": 394.88 + }, + { + "epoch": 0.8331262116617786, + "grad_norm": 0.3680170178413391, + "learning_rate": 0.00012681175465933822, + "loss": 0.1876, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 8380, + "tokens_per_second_per_gpu": 310.52 + }, + { + "epoch": 0.8341203956852413, + "grad_norm": 0.6084402799606323, + "learning_rate": 0.00012666052186844883, + "loss": 0.2137, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 8390, + "tokens_per_second_per_gpu": 308.84 + }, + { + "epoch": 0.8351145797087041, + "grad_norm": 0.41174978017807007, + "learning_rate": 0.00012650922340705925, + "loss": 0.2423, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 8400, + "tokens_per_second_per_gpu": 389.04 + }, + { + "epoch": 0.8361087637321668, + "grad_norm": 0.3895050883293152, + "learning_rate": 0.0001263578596478496, + "loss": 0.2144, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.99, + "memory/max_allocated (GiB)": 19.99, + "step": 8410, + "tokens_per_second_per_gpu": 325.56 + }, + { + "epoch": 0.8371029477556295, + "grad_norm": 0.2492019683122635, + "learning_rate": 0.00012620643096366077, + "loss": 0.2292, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 8420, + "tokens_per_second_per_gpu": 312.76 + }, + { + "epoch": 0.8380971317790923, + "grad_norm": 0.5276951789855957, + "learning_rate": 0.0001260549377274936, + "loss": 0.2566, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 8430, + "tokens_per_second_per_gpu": 341.75 + }, + { + "epoch": 0.839091315802555, + "grad_norm": 0.3949599266052246, + "learning_rate": 0.00012590338031250796, + "loss": 0.2108, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 8440, + "tokens_per_second_per_gpu": 370.88 + }, + { + "epoch": 0.8400854998260178, + "grad_norm": 0.3476475179195404, + "learning_rate": 0.00012575175909202186, + "loss": 0.1811, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 8450, + "tokens_per_second_per_gpu": 336.59 + }, + { + "epoch": 0.8410796838494805, + "grad_norm": 0.26099368929862976, + "learning_rate": 0.00012560007443951032, + "loss": 0.2144, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 8460, + "tokens_per_second_per_gpu": 353.23 + }, + { + "epoch": 0.8420738678729432, + "grad_norm": 0.4799298346042633, + "learning_rate": 0.00012544832672860474, + "loss": 0.1781, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 8470, + "tokens_per_second_per_gpu": 360.11 + }, + { + "epoch": 0.843068051896406, + "grad_norm": 0.46986958384513855, + "learning_rate": 0.0001252965163330918, + "loss": 0.209, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 8480, + "tokens_per_second_per_gpu": 393.35 + }, + { + "epoch": 0.8440622359198687, + "grad_norm": 1.179408311843872, + "learning_rate": 0.00012514464362691258, + "loss": 0.2061, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 8490, + "tokens_per_second_per_gpu": 377.29 + }, + { + "epoch": 0.8450564199433315, + "grad_norm": 0.2782179117202759, + "learning_rate": 0.0001249927089841617, + "loss": 0.2643, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 8500, + "tokens_per_second_per_gpu": 352.65 + }, + { + "epoch": 0.8460506039667942, + "grad_norm": 0.5103908181190491, + "learning_rate": 0.00012484071277908622, + "loss": 0.2086, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 8510, + "tokens_per_second_per_gpu": 313.7 + }, + { + "epoch": 0.847044787990257, + "grad_norm": 0.34362590312957764, + "learning_rate": 0.000124688655386085, + "loss": 0.2545, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 8520, + "tokens_per_second_per_gpu": 357.23 + }, + { + "epoch": 0.8480389720137197, + "grad_norm": 0.5372098088264465, + "learning_rate": 0.00012453653717970747, + "loss": 0.2191, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.62, + "memory/max_allocated (GiB)": 20.62, + "step": 8530, + "tokens_per_second_per_gpu": 275.82 + }, + { + "epoch": 0.8490331560371824, + "grad_norm": 0.21182872354984283, + "learning_rate": 0.00012438435853465296, + "loss": 0.2291, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 8540, + "tokens_per_second_per_gpu": 394.59 + }, + { + "epoch": 0.8500273400606452, + "grad_norm": 0.4921363890171051, + "learning_rate": 0.0001242321198257696, + "loss": 0.2142, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 8550, + "tokens_per_second_per_gpu": 342.56 + }, + { + "epoch": 0.8510215240841079, + "grad_norm": 0.3340144157409668, + "learning_rate": 0.00012407982142805356, + "loss": 0.2034, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 8560, + "tokens_per_second_per_gpu": 381.5 + }, + { + "epoch": 0.8520157081075707, + "grad_norm": 0.45434701442718506, + "learning_rate": 0.00012392746371664797, + "loss": 0.2031, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 8570, + "tokens_per_second_per_gpu": 337.22 + }, + { + "epoch": 0.8530098921310335, + "grad_norm": 0.2646021544933319, + "learning_rate": 0.00012377504706684206, + "loss": 0.1807, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 8580, + "tokens_per_second_per_gpu": 369.87 + }, + { + "epoch": 0.8540040761544962, + "grad_norm": 0.4125272333621979, + "learning_rate": 0.00012362257185407022, + "loss": 0.2258, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 8590, + "tokens_per_second_per_gpu": 374.23 + }, + { + "epoch": 0.854998260177959, + "grad_norm": 0.3133401572704315, + "learning_rate": 0.00012347003845391118, + "loss": 0.1624, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.98, + "memory/max_allocated (GiB)": 19.98, + "step": 8600, + "tokens_per_second_per_gpu": 336.61 + }, + { + "epoch": 0.8559924442014217, + "grad_norm": 0.23082919418811798, + "learning_rate": 0.00012331744724208694, + "loss": 0.207, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 8610, + "tokens_per_second_per_gpu": 316.78 + }, + { + "epoch": 0.8569866282248845, + "grad_norm": 0.45513761043548584, + "learning_rate": 0.00012316479859446187, + "loss": 0.2465, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 8620, + "tokens_per_second_per_gpu": 355.67 + }, + { + "epoch": 0.8579808122483472, + "grad_norm": 0.4129338562488556, + "learning_rate": 0.00012301209288704184, + "loss": 0.2563, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 8630, + "tokens_per_second_per_gpu": 323.02 + }, + { + "epoch": 0.85897499627181, + "grad_norm": 0.37343230843544006, + "learning_rate": 0.00012285933049597335, + "loss": 0.154, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 8640, + "tokens_per_second_per_gpu": 326.66 + }, + { + "epoch": 0.8599691802952727, + "grad_norm": 0.32739633321762085, + "learning_rate": 0.00012270651179754243, + "loss": 0.2135, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 8650, + "tokens_per_second_per_gpu": 340.36 + }, + { + "epoch": 0.8609633643187354, + "grad_norm": 0.5008605718612671, + "learning_rate": 0.0001225536371681738, + "loss": 0.2221, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 8660, + "tokens_per_second_per_gpu": 295.75 + }, + { + "epoch": 0.8619575483421982, + "grad_norm": 2.8313498497009277, + "learning_rate": 0.00012240070698443, + "loss": 0.2402, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 8670, + "tokens_per_second_per_gpu": 343.07 + }, + { + "epoch": 0.8629517323656609, + "grad_norm": 3.499013662338257, + "learning_rate": 0.00012224772162301042, + "loss": 0.2588, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.58, + "memory/max_allocated (GiB)": 21.58, + "step": 8680, + "tokens_per_second_per_gpu": 348.94 + }, + { + "epoch": 0.8639459163891237, + "grad_norm": 0.4069509208202362, + "learning_rate": 0.0001220946814607503, + "loss": 0.2485, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 8690, + "tokens_per_second_per_gpu": 365.05 + }, + { + "epoch": 0.8649401004125864, + "grad_norm": 0.5977867841720581, + "learning_rate": 0.00012194158687461992, + "loss": 0.2119, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 8700, + "tokens_per_second_per_gpu": 326.01 + }, + { + "epoch": 0.8659342844360491, + "grad_norm": 6.747653961181641, + "learning_rate": 0.00012178843824172361, + "loss": 0.2719, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.53, + "memory/max_allocated (GiB)": 20.53, + "step": 8710, + "tokens_per_second_per_gpu": 390.15 + }, + { + "epoch": 0.8669284684595119, + "grad_norm": 0.46154770255088806, + "learning_rate": 0.00012163523593929884, + "loss": 0.1836, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 8720, + "tokens_per_second_per_gpu": 343.33 + }, + { + "epoch": 0.8679226524829746, + "grad_norm": 0.272504985332489, + "learning_rate": 0.00012148198034471524, + "loss": 0.2419, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 8730, + "tokens_per_second_per_gpu": 347.62 + }, + { + "epoch": 0.8689168365064374, + "grad_norm": 0.4848499596118927, + "learning_rate": 0.00012132867183547372, + "loss": 0.2379, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 8740, + "tokens_per_second_per_gpu": 315.42 + }, + { + "epoch": 0.8699110205299001, + "grad_norm": 0.42282259464263916, + "learning_rate": 0.00012117531078920556, + "loss": 0.2358, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 8750, + "tokens_per_second_per_gpu": 404.19 + }, + { + "epoch": 0.8709052045533628, + "grad_norm": 0.5138429999351501, + "learning_rate": 0.00012102189758367142, + "loss": 0.2602, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 8760, + "tokens_per_second_per_gpu": 350.33 + }, + { + "epoch": 0.8718993885768256, + "grad_norm": 0.2951951324939728, + "learning_rate": 0.00012086843259676041, + "loss": 0.303, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 8770, + "tokens_per_second_per_gpu": 396.42 + }, + { + "epoch": 0.8728935726002883, + "grad_norm": 0.4597817361354828, + "learning_rate": 0.00012071491620648934, + "loss": 0.2519, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.68, + "memory/max_allocated (GiB)": 19.68, + "step": 8780, + "tokens_per_second_per_gpu": 338.59 + }, + { + "epoch": 0.8738877566237511, + "grad_norm": 0.13618378341197968, + "learning_rate": 0.00012056134879100138, + "loss": 0.2235, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.43, + "memory/max_allocated (GiB)": 20.43, + "step": 8790, + "tokens_per_second_per_gpu": 325.39 + }, + { + "epoch": 0.8748819406472138, + "grad_norm": 0.5435792207717896, + "learning_rate": 0.00012040773072856566, + "loss": 0.2563, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 8800, + "tokens_per_second_per_gpu": 346.95 + }, + { + "epoch": 0.8758761246706765, + "grad_norm": 0.24607907235622406, + "learning_rate": 0.00012025406239757588, + "loss": 0.1721, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 8810, + "tokens_per_second_per_gpu": 305.91 + }, + { + "epoch": 0.8768703086941393, + "grad_norm": 0.3879210650920868, + "learning_rate": 0.00012010034417654962, + "loss": 0.2026, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 8820, + "tokens_per_second_per_gpu": 305.29 + }, + { + "epoch": 0.877864492717602, + "grad_norm": 0.4256599545478821, + "learning_rate": 0.00011994657644412734, + "loss": 0.1985, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 8830, + "tokens_per_second_per_gpu": 330.52 + }, + { + "epoch": 0.8788586767410648, + "grad_norm": 0.2512848973274231, + "learning_rate": 0.00011979275957907146, + "loss": 0.2153, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 8840, + "tokens_per_second_per_gpu": 321.37 + }, + { + "epoch": 0.8798528607645275, + "grad_norm": 0.38337549567222595, + "learning_rate": 0.00011963889396026547, + "loss": 0.2383, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 8850, + "tokens_per_second_per_gpu": 359.62 + }, + { + "epoch": 0.8808470447879903, + "grad_norm": 0.3447147309780121, + "learning_rate": 0.00011948497996671286, + "loss": 0.2304, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 8860, + "tokens_per_second_per_gpu": 332.81 + }, + { + "epoch": 0.881841228811453, + "grad_norm": 0.40846845507621765, + "learning_rate": 0.00011933101797753637, + "loss": 0.2297, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 8870, + "tokens_per_second_per_gpu": 384.1 + }, + { + "epoch": 0.8828354128349157, + "grad_norm": 0.5454410910606384, + "learning_rate": 0.0001191770083719769, + "loss": 0.2242, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 8880, + "tokens_per_second_per_gpu": 387.99 + }, + { + "epoch": 0.8838295968583785, + "grad_norm": 0.4706827700138092, + "learning_rate": 0.00011902295152939262, + "loss": 0.2381, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 8890, + "tokens_per_second_per_gpu": 304.64 + }, + { + "epoch": 0.8848237808818412, + "grad_norm": 0.6073552370071411, + "learning_rate": 0.00011886884782925816, + "loss": 0.2417, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 8900, + "tokens_per_second_per_gpu": 315.52 + }, + { + "epoch": 0.885817964905304, + "grad_norm": 0.3200027644634247, + "learning_rate": 0.00011871469765116346, + "loss": 0.2117, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 8910, + "tokens_per_second_per_gpu": 306.02 + }, + { + "epoch": 0.8868121489287667, + "grad_norm": 0.42333486676216125, + "learning_rate": 0.00011856050137481301, + "loss": 0.2552, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 8920, + "tokens_per_second_per_gpu": 355.23 + }, + { + "epoch": 0.8878063329522294, + "grad_norm": 0.2578692138195038, + "learning_rate": 0.00011840625938002481, + "loss": 0.1743, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 8930, + "tokens_per_second_per_gpu": 356.78 + }, + { + "epoch": 0.8888005169756922, + "grad_norm": 0.3321487605571747, + "learning_rate": 0.00011825197204672952, + "loss": 0.2637, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 8940, + "tokens_per_second_per_gpu": 424.07 + }, + { + "epoch": 0.8897947009991549, + "grad_norm": 0.3663140833377838, + "learning_rate": 0.00011809763975496944, + "loss": 0.2272, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 8950, + "tokens_per_second_per_gpu": 360.15 + }, + { + "epoch": 0.8907888850226177, + "grad_norm": 0.2619111239910126, + "learning_rate": 0.00011794326288489761, + "loss": 0.2723, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.03, + "memory/max_allocated (GiB)": 20.03, + "step": 8960, + "tokens_per_second_per_gpu": 357.64 + }, + { + "epoch": 0.8917830690460804, + "grad_norm": 0.4359273612499237, + "learning_rate": 0.0001177888418167769, + "loss": 0.2633, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 8970, + "tokens_per_second_per_gpu": 406.63 + }, + { + "epoch": 0.8927772530695431, + "grad_norm": 0.29716983437538147, + "learning_rate": 0.00011763437693097903, + "loss": 0.2789, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 8980, + "tokens_per_second_per_gpu": 386.97 + }, + { + "epoch": 0.8937714370930059, + "grad_norm": 0.6009801626205444, + "learning_rate": 0.00011747986860798368, + "loss": 0.2057, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.43, + "memory/max_allocated (GiB)": 20.43, + "step": 8990, + "tokens_per_second_per_gpu": 318.77 + }, + { + "epoch": 0.8947656211164686, + "grad_norm": 0.4606180191040039, + "learning_rate": 0.0001173253172283775, + "loss": 0.2232, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 9000, + "tokens_per_second_per_gpu": 283.3 + }, + { + "epoch": 0.8957598051399314, + "grad_norm": 0.3887878358364105, + "learning_rate": 0.00011717072317285318, + "loss": 0.2175, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 9010, + "tokens_per_second_per_gpu": 313.78 + }, + { + "epoch": 0.8967539891633941, + "grad_norm": 0.3322821855545044, + "learning_rate": 0.0001170160868222086, + "loss": 0.2491, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 9020, + "tokens_per_second_per_gpu": 339.47 + }, + { + "epoch": 0.8977481731868568, + "grad_norm": 0.5119565725326538, + "learning_rate": 0.00011686140855734571, + "loss": 0.2568, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.66, + "memory/max_allocated (GiB)": 19.66, + "step": 9030, + "tokens_per_second_per_gpu": 322.62 + }, + { + "epoch": 0.8987423572103196, + "grad_norm": 0.32013964653015137, + "learning_rate": 0.00011670668875926982, + "loss": 0.3019, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9040, + "tokens_per_second_per_gpu": 356.77 + }, + { + "epoch": 0.8997365412337823, + "grad_norm": 0.19132547080516815, + "learning_rate": 0.00011655192780908849, + "loss": 0.1927, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.16, + "memory/max_allocated (GiB)": 18.16, + "step": 9050, + "tokens_per_second_per_gpu": 306.79 + }, + { + "epoch": 0.9007307252572451, + "grad_norm": 0.4459245502948761, + "learning_rate": 0.00011639712608801059, + "loss": 0.2013, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 9060, + "tokens_per_second_per_gpu": 369.24 + }, + { + "epoch": 0.9017249092807078, + "grad_norm": 0.3540112376213074, + "learning_rate": 0.00011624228397734556, + "loss": 0.1513, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 9070, + "tokens_per_second_per_gpu": 323.78 + }, + { + "epoch": 0.9027190933041706, + "grad_norm": 0.3951474130153656, + "learning_rate": 0.00011608740185850219, + "loss": 0.2055, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 9080, + "tokens_per_second_per_gpu": 358.92 + }, + { + "epoch": 0.9037132773276333, + "grad_norm": 0.3915760815143585, + "learning_rate": 0.00011593248011298791, + "loss": 0.2148, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 9090, + "tokens_per_second_per_gpu": 397.61 + }, + { + "epoch": 0.9047074613510961, + "grad_norm": 0.49453213810920715, + "learning_rate": 0.00011577751912240771, + "loss": 0.187, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9100, + "tokens_per_second_per_gpu": 379.24 + }, + { + "epoch": 0.9057016453745589, + "grad_norm": 0.5185866951942444, + "learning_rate": 0.00011562251926846326, + "loss": 0.219, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 9110, + "tokens_per_second_per_gpu": 307.18 + }, + { + "epoch": 0.9066958293980216, + "grad_norm": 0.44589415192604065, + "learning_rate": 0.00011546748093295195, + "loss": 0.2127, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 9120, + "tokens_per_second_per_gpu": 360.35 + }, + { + "epoch": 0.9076900134214844, + "grad_norm": 0.4594690501689911, + "learning_rate": 0.00011531240449776594, + "loss": 0.2057, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 9130, + "tokens_per_second_per_gpu": 351.45 + }, + { + "epoch": 0.9086841974449471, + "grad_norm": 0.4069642722606659, + "learning_rate": 0.00011515729034489133, + "loss": 0.2213, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 9140, + "tokens_per_second_per_gpu": 331.15 + }, + { + "epoch": 0.9096783814684098, + "grad_norm": 0.5103911757469177, + "learning_rate": 0.00011500213885640705, + "loss": 0.2258, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 9150, + "tokens_per_second_per_gpu": 323.85 + }, + { + "epoch": 0.9106725654918726, + "grad_norm": 0.5270497798919678, + "learning_rate": 0.00011484695041448399, + "loss": 0.2709, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 9160, + "tokens_per_second_per_gpu": 363.69 + }, + { + "epoch": 0.9116667495153353, + "grad_norm": 0.48765021562576294, + "learning_rate": 0.00011469172540138407, + "loss": 0.1935, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.39, + "memory/max_allocated (GiB)": 21.39, + "step": 9170, + "tokens_per_second_per_gpu": 329.09 + }, + { + "epoch": 0.9126609335387981, + "grad_norm": 0.4249947667121887, + "learning_rate": 0.00011453646419945934, + "loss": 0.2296, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.77, + "memory/max_allocated (GiB)": 19.77, + "step": 9180, + "tokens_per_second_per_gpu": 412.86 + }, + { + "epoch": 0.9136551175622608, + "grad_norm": 0.36251431703567505, + "learning_rate": 0.00011438116719115089, + "loss": 0.2361, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 9190, + "tokens_per_second_per_gpu": 343.29 + }, + { + "epoch": 0.9146493015857236, + "grad_norm": 0.3858913481235504, + "learning_rate": 0.00011422583475898814, + "loss": 0.2446, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 9200, + "tokens_per_second_per_gpu": 331.61 + }, + { + "epoch": 0.9156434856091863, + "grad_norm": 0.43422338366508484, + "learning_rate": 0.00011407046728558768, + "loss": 0.1683, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 9210, + "tokens_per_second_per_gpu": 303.63 + }, + { + "epoch": 0.916637669632649, + "grad_norm": 0.43746358156204224, + "learning_rate": 0.00011391506515365245, + "loss": 0.1423, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 9220, + "tokens_per_second_per_gpu": 337.04 + }, + { + "epoch": 0.9176318536561118, + "grad_norm": 0.30726879835128784, + "learning_rate": 0.00011375962874597073, + "loss": 0.179, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9230, + "tokens_per_second_per_gpu": 312.89 + }, + { + "epoch": 0.9186260376795745, + "grad_norm": 0.5088397264480591, + "learning_rate": 0.00011360415844541523, + "loss": 0.2571, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9240, + "tokens_per_second_per_gpu": 359.77 + }, + { + "epoch": 0.9196202217030373, + "grad_norm": 0.545219898223877, + "learning_rate": 0.00011344865463494219, + "loss": 0.2228, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 9250, + "tokens_per_second_per_gpu": 337.28 + }, + { + "epoch": 0.9206144057265, + "grad_norm": 0.5700411796569824, + "learning_rate": 0.00011329311769759035, + "loss": 0.2236, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 9260, + "tokens_per_second_per_gpu": 340.23 + }, + { + "epoch": 0.9216085897499627, + "grad_norm": 0.3975035548210144, + "learning_rate": 0.00011313754801648003, + "loss": 0.2487, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 9270, + "tokens_per_second_per_gpu": 345.97 + }, + { + "epoch": 0.9226027737734255, + "grad_norm": 0.5616829991340637, + "learning_rate": 0.00011298194597481226, + "loss": 0.2511, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9280, + "tokens_per_second_per_gpu": 331.41 + }, + { + "epoch": 0.9235969577968882, + "grad_norm": 0.38982534408569336, + "learning_rate": 0.00011282631195586777, + "loss": 0.2809, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 9290, + "tokens_per_second_per_gpu": 361.76 + }, + { + "epoch": 0.924591141820351, + "grad_norm": 0.531318187713623, + "learning_rate": 0.00011267064634300603, + "loss": 0.2608, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9300, + "tokens_per_second_per_gpu": 393.15 + }, + { + "epoch": 0.9255853258438137, + "grad_norm": 0.5956375002861023, + "learning_rate": 0.00011251494951966437, + "loss": 0.2229, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 9310, + "tokens_per_second_per_gpu": 284.92 + }, + { + "epoch": 0.9265795098672764, + "grad_norm": 0.42666998505592346, + "learning_rate": 0.0001123592218693569, + "loss": 0.1985, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 9320, + "tokens_per_second_per_gpu": 362.3 + }, + { + "epoch": 0.9275736938907392, + "grad_norm": 0.4182765781879425, + "learning_rate": 0.00011220346377567381, + "loss": 0.2535, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 9330, + "tokens_per_second_per_gpu": 348.83 + }, + { + "epoch": 0.9285678779142019, + "grad_norm": 0.5867879390716553, + "learning_rate": 0.00011204767562228017, + "loss": 0.2309, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 9340, + "tokens_per_second_per_gpu": 334.46 + }, + { + "epoch": 0.9295620619376647, + "grad_norm": 0.27041056752204895, + "learning_rate": 0.00011189185779291515, + "loss": 0.232, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 9350, + "tokens_per_second_per_gpu": 331.56 + }, + { + "epoch": 0.9305562459611274, + "grad_norm": 0.5081501603126526, + "learning_rate": 0.00011173601067139099, + "loss": 0.2399, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 9360, + "tokens_per_second_per_gpu": 336.84 + }, + { + "epoch": 0.9315504299845901, + "grad_norm": 0.7966908812522888, + "learning_rate": 0.00011158013464159208, + "loss": 0.2606, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.39, + "memory/max_allocated (GiB)": 21.39, + "step": 9370, + "tokens_per_second_per_gpu": 313.66 + }, + { + "epoch": 0.9325446140080529, + "grad_norm": 0.45302364230155945, + "learning_rate": 0.00011142423008747403, + "loss": 0.1581, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 9380, + "tokens_per_second_per_gpu": 322.72 + }, + { + "epoch": 0.9335387980315156, + "grad_norm": 0.5959784984588623, + "learning_rate": 0.00011126829739306271, + "loss": 0.2115, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.1, + "memory/max_allocated (GiB)": 19.1, + "step": 9390, + "tokens_per_second_per_gpu": 304.85 + }, + { + "epoch": 0.9345329820549784, + "grad_norm": 0.5530646443367004, + "learning_rate": 0.00011111233694245328, + "loss": 0.1854, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 9400, + "tokens_per_second_per_gpu": 347.73 + }, + { + "epoch": 0.9355271660784411, + "grad_norm": 0.4127480387687683, + "learning_rate": 0.00011095634911980933, + "loss": 0.2307, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 9410, + "tokens_per_second_per_gpu": 325.27 + }, + { + "epoch": 0.9365213501019038, + "grad_norm": 0.4038192629814148, + "learning_rate": 0.0001108003343093618, + "loss": 0.1953, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 9420, + "tokens_per_second_per_gpu": 327.3 + }, + { + "epoch": 0.9375155341253666, + "grad_norm": 0.4245215654373169, + "learning_rate": 0.00011064429289540821, + "loss": 0.2505, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9430, + "tokens_per_second_per_gpu": 337.4 + }, + { + "epoch": 0.9385097181488293, + "grad_norm": 0.4768611192703247, + "learning_rate": 0.00011048822526231148, + "loss": 0.1584, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 9440, + "tokens_per_second_per_gpu": 314.23 + }, + { + "epoch": 0.9395039021722921, + "grad_norm": 0.3262840807437897, + "learning_rate": 0.00011033213179449917, + "loss": 0.2287, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 9450, + "tokens_per_second_per_gpu": 358.18 + }, + { + "epoch": 0.9404980861957548, + "grad_norm": 0.5882810354232788, + "learning_rate": 0.00011017601287646251, + "loss": 0.206, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9460, + "tokens_per_second_per_gpu": 405.83 + }, + { + "epoch": 0.9414922702192176, + "grad_norm": 0.4905533492565155, + "learning_rate": 0.0001100198688927554, + "loss": 0.2209, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 9470, + "tokens_per_second_per_gpu": 327.29 + }, + { + "epoch": 0.9424864542426803, + "grad_norm": 0.5608656406402588, + "learning_rate": 0.00010986370022799346, + "loss": 0.2418, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 9480, + "tokens_per_second_per_gpu": 335.39 + }, + { + "epoch": 0.943480638266143, + "grad_norm": 0.4696904718875885, + "learning_rate": 0.00010970750726685309, + "loss": 0.2742, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 9490, + "tokens_per_second_per_gpu": 399.98 + }, + { + "epoch": 0.9444748222896058, + "grad_norm": 0.423969030380249, + "learning_rate": 0.00010955129039407062, + "loss": 0.2259, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 9500, + "tokens_per_second_per_gpu": 336.84 + }, + { + "epoch": 0.9454690063130685, + "grad_norm": 0.438812792301178, + "learning_rate": 0.0001093950499944412, + "loss": 0.1855, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 9510, + "tokens_per_second_per_gpu": 318.59 + }, + { + "epoch": 0.9464631903365313, + "grad_norm": 0.5361111164093018, + "learning_rate": 0.00010923878645281794, + "loss": 0.2713, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9520, + "tokens_per_second_per_gpu": 354.0 + }, + { + "epoch": 0.947457374359994, + "grad_norm": 0.39556553959846497, + "learning_rate": 0.000109082500154111, + "loss": 0.2539, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.44, + "memory/max_allocated (GiB)": 20.44, + "step": 9530, + "tokens_per_second_per_gpu": 342.96 + }, + { + "epoch": 0.9484515583834567, + "grad_norm": 0.42129939794540405, + "learning_rate": 0.00010892619148328654, + "loss": 0.2282, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9540, + "tokens_per_second_per_gpu": 375.18 + }, + { + "epoch": 0.9494457424069195, + "grad_norm": 0.49444761872291565, + "learning_rate": 0.00010876986082536584, + "loss": 0.2342, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.68, + "memory/max_allocated (GiB)": 19.68, + "step": 9550, + "tokens_per_second_per_gpu": 394.82 + }, + { + "epoch": 0.9504399264303822, + "grad_norm": 0.3940083682537079, + "learning_rate": 0.0001086135085654244, + "loss": 0.1993, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 9560, + "tokens_per_second_per_gpu": 346.47 + }, + { + "epoch": 0.951434110453845, + "grad_norm": 0.40422961115837097, + "learning_rate": 0.00010845713508859088, + "loss": 0.2479, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 9570, + "tokens_per_second_per_gpu": 338.94 + }, + { + "epoch": 0.9524282944773077, + "grad_norm": 0.5008521676063538, + "learning_rate": 0.00010830074078004615, + "loss": 0.2217, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 9580, + "tokens_per_second_per_gpu": 390.26 + }, + { + "epoch": 0.9534224785007704, + "grad_norm": 0.37146544456481934, + "learning_rate": 0.00010814432602502246, + "loss": 0.2178, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.17, + "memory/max_allocated (GiB)": 18.17, + "step": 9590, + "tokens_per_second_per_gpu": 347.55 + }, + { + "epoch": 0.9544166625242332, + "grad_norm": 0.4229485094547272, + "learning_rate": 0.00010798789120880246, + "loss": 0.2183, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 9600, + "tokens_per_second_per_gpu": 311.66 + }, + { + "epoch": 0.9554108465476959, + "grad_norm": 0.38394778966903687, + "learning_rate": 0.00010783143671671813, + "loss": 0.2663, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 9610, + "tokens_per_second_per_gpu": 329.51 + }, + { + "epoch": 0.9564050305711588, + "grad_norm": 0.4587366580963135, + "learning_rate": 0.00010767496293414996, + "loss": 0.2776, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9620, + "tokens_per_second_per_gpu": 464.41 + }, + { + "epoch": 0.9573992145946215, + "grad_norm": 0.5589081645011902, + "learning_rate": 0.0001075184702465259, + "loss": 0.2387, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.47, + "memory/max_allocated (GiB)": 21.47, + "step": 9630, + "tokens_per_second_per_gpu": 358.44 + }, + { + "epoch": 0.9583933986180843, + "grad_norm": 0.6210941672325134, + "learning_rate": 0.0001073619590393206, + "loss": 0.2587, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9640, + "tokens_per_second_per_gpu": 373.16 + }, + { + "epoch": 0.959387582641547, + "grad_norm": 0.6241095066070557, + "learning_rate": 0.0001072054296980542, + "loss": 0.2378, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 9650, + "tokens_per_second_per_gpu": 385.04 + }, + { + "epoch": 0.9603817666650097, + "grad_norm": 0.40504932403564453, + "learning_rate": 0.00010704888260829156, + "loss": 0.2767, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 9660, + "tokens_per_second_per_gpu": 384.99 + }, + { + "epoch": 0.9613759506884725, + "grad_norm": 0.4956580400466919, + "learning_rate": 0.0001068923181556412, + "loss": 0.2388, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 9670, + "tokens_per_second_per_gpu": 378.96 + }, + { + "epoch": 0.9623701347119352, + "grad_norm": 0.2848256230354309, + "learning_rate": 0.00010673573672575454, + "loss": 0.2238, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 9680, + "tokens_per_second_per_gpu": 359.23 + }, + { + "epoch": 0.963364318735398, + "grad_norm": 0.38930168747901917, + "learning_rate": 0.00010657913870432468, + "loss": 0.2305, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 9690, + "tokens_per_second_per_gpu": 309.48 + }, + { + "epoch": 0.9643585027588607, + "grad_norm": 0.4527674913406372, + "learning_rate": 0.00010642252447708563, + "loss": 0.2731, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 9700, + "tokens_per_second_per_gpu": 349.69 + }, + { + "epoch": 0.9653526867823234, + "grad_norm": 0.4257194697856903, + "learning_rate": 0.00010626589442981138, + "loss": 0.2635, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 9710, + "tokens_per_second_per_gpu": 414.72 + }, + { + "epoch": 0.9663468708057862, + "grad_norm": 0.3962489068508148, + "learning_rate": 0.00010610924894831483, + "loss": 0.2862, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 9720, + "tokens_per_second_per_gpu": 365.4 + }, + { + "epoch": 0.9673410548292489, + "grad_norm": 0.29012176394462585, + "learning_rate": 0.00010595258841844688, + "loss": 0.2353, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 9730, + "tokens_per_second_per_gpu": 364.81 + }, + { + "epoch": 0.9683352388527117, + "grad_norm": 0.5301884412765503, + "learning_rate": 0.00010579591322609559, + "loss": 0.1947, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.61, + "memory/max_allocated (GiB)": 20.61, + "step": 9740, + "tokens_per_second_per_gpu": 369.66 + }, + { + "epoch": 0.9693294228761744, + "grad_norm": 0.41399112343788147, + "learning_rate": 0.000105639223757185, + "loss": 0.1859, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.98, + "memory/max_allocated (GiB)": 19.98, + "step": 9750, + "tokens_per_second_per_gpu": 314.55 + }, + { + "epoch": 0.9703236068996371, + "grad_norm": 0.3453201353549957, + "learning_rate": 0.00010548252039767443, + "loss": 0.1971, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 9760, + "tokens_per_second_per_gpu": 368.71 + }, + { + "epoch": 0.9713177909230999, + "grad_norm": 0.5453227162361145, + "learning_rate": 0.00010532580353355734, + "loss": 0.3006, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.47, + "memory/max_allocated (GiB)": 21.47, + "step": 9770, + "tokens_per_second_per_gpu": 420.65 + }, + { + "epoch": 0.9723119749465626, + "grad_norm": 0.42438754439353943, + "learning_rate": 0.00010516907355086055, + "loss": 0.2027, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 9780, + "tokens_per_second_per_gpu": 396.2 + }, + { + "epoch": 0.9733061589700254, + "grad_norm": 0.5116370916366577, + "learning_rate": 0.00010501233083564306, + "loss": 0.1968, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 9790, + "tokens_per_second_per_gpu": 345.38 + }, + { + "epoch": 0.9743003429934881, + "grad_norm": 0.23473972082138062, + "learning_rate": 0.00010485557577399536, + "loss": 0.1676, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 9800, + "tokens_per_second_per_gpu": 386.72 + }, + { + "epoch": 0.9752945270169509, + "grad_norm": 0.5427513718605042, + "learning_rate": 0.00010469880875203827, + "loss": 0.2066, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 9810, + "tokens_per_second_per_gpu": 372.28 + }, + { + "epoch": 0.9762887110404136, + "grad_norm": 0.4717273712158203, + "learning_rate": 0.00010454203015592214, + "loss": 0.2097, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 9820, + "tokens_per_second_per_gpu": 317.9 + }, + { + "epoch": 0.9772828950638763, + "grad_norm": 0.2797994017601013, + "learning_rate": 0.00010438524037182573, + "loss": 0.3073, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9830, + "tokens_per_second_per_gpu": 371.67 + }, + { + "epoch": 0.9782770790873391, + "grad_norm": 0.3694850504398346, + "learning_rate": 0.00010422843978595542, + "loss": 0.2309, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 9840, + "tokens_per_second_per_gpu": 426.94 + }, + { + "epoch": 0.9792712631108018, + "grad_norm": 0.3863958716392517, + "learning_rate": 0.00010407162878454423, + "loss": 0.2697, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 9850, + "tokens_per_second_per_gpu": 338.41 + }, + { + "epoch": 0.9802654471342646, + "grad_norm": 0.22021318972110748, + "learning_rate": 0.00010391480775385078, + "loss": 0.1866, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 9860, + "tokens_per_second_per_gpu": 331.0 + }, + { + "epoch": 0.9812596311577273, + "grad_norm": 0.3698722720146179, + "learning_rate": 0.00010375797708015844, + "loss": 0.1992, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 9870, + "tokens_per_second_per_gpu": 328.05 + }, + { + "epoch": 0.98225381518119, + "grad_norm": 0.45455843210220337, + "learning_rate": 0.00010360113714977428, + "loss": 0.2508, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.39, + "memory/max_allocated (GiB)": 21.39, + "step": 9880, + "tokens_per_second_per_gpu": 409.92 + }, + { + "epoch": 0.9832479992046528, + "grad_norm": 0.3562302887439728, + "learning_rate": 0.00010344428834902822, + "loss": 0.187, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 9890, + "tokens_per_second_per_gpu": 262.76 + }, + { + "epoch": 0.9842421832281155, + "grad_norm": 0.5735921263694763, + "learning_rate": 0.00010328743106427197, + "loss": 0.2517, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.62, + "memory/max_allocated (GiB)": 20.62, + "step": 9900, + "tokens_per_second_per_gpu": 338.79 + }, + { + "epoch": 0.9852363672515783, + "grad_norm": 0.47819098830223083, + "learning_rate": 0.00010313056568187818, + "loss": 0.2298, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 9910, + "tokens_per_second_per_gpu": 351.88 + }, + { + "epoch": 0.986230551275041, + "grad_norm": 0.29087749123573303, + "learning_rate": 0.00010297369258823948, + "loss": 0.1617, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 9920, + "tokens_per_second_per_gpu": 311.57 + }, + { + "epoch": 0.9872247352985037, + "grad_norm": 0.5746156573295593, + "learning_rate": 0.00010281681216976742, + "loss": 0.2215, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 9930, + "tokens_per_second_per_gpu": 360.68 + }, + { + "epoch": 0.9882189193219665, + "grad_norm": 0.5306881666183472, + "learning_rate": 0.00010265992481289164, + "loss": 0.2165, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 9940, + "tokens_per_second_per_gpu": 350.01 + }, + { + "epoch": 0.9892131033454292, + "grad_norm": 0.5628572106361389, + "learning_rate": 0.00010250303090405886, + "loss": 0.2246, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9950, + "tokens_per_second_per_gpu": 331.08 + }, + { + "epoch": 0.990207287368892, + "grad_norm": 0.38296106457710266, + "learning_rate": 0.00010234613082973195, + "loss": 0.2294, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 9960, + "tokens_per_second_per_gpu": 310.12 + }, + { + "epoch": 0.9912014713923547, + "grad_norm": 0.5048671960830688, + "learning_rate": 0.00010218922497638893, + "loss": 0.2263, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 9970, + "tokens_per_second_per_gpu": 360.03 + }, + { + "epoch": 0.9921956554158174, + "grad_norm": 0.2620450258255005, + "learning_rate": 0.00010203231373052205, + "loss": 0.2391, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.37, + "memory/max_allocated (GiB)": 21.37, + "step": 9980, + "tokens_per_second_per_gpu": 397.95 + }, + { + "epoch": 0.9931898394392802, + "grad_norm": 0.37807098031044006, + "learning_rate": 0.00010187539747863693, + "loss": 0.2654, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 9990, + "tokens_per_second_per_gpu": 413.64 + }, + { + "epoch": 0.9941840234627429, + "grad_norm": 0.3059203624725342, + "learning_rate": 0.00010171847660725147, + "loss": 0.2081, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 10000, + "tokens_per_second_per_gpu": 317.3 + }, + { + "epoch": 0.9951782074862057, + "grad_norm": 0.38892289996147156, + "learning_rate": 0.0001015615515028949, + "loss": 0.2826, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 10010, + "tokens_per_second_per_gpu": 379.68 + }, + { + "epoch": 0.9961723915096684, + "grad_norm": 0.22751818597316742, + "learning_rate": 0.00010140462255210696, + "loss": 0.2674, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 10020, + "tokens_per_second_per_gpu": 362.08 + }, + { + "epoch": 0.9971665755331311, + "grad_norm": 0.3578786551952362, + "learning_rate": 0.00010124769014143678, + "loss": 0.2125, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.77, + "memory/max_allocated (GiB)": 19.77, + "step": 10030, + "tokens_per_second_per_gpu": 367.81 + }, + { + "epoch": 0.9981607595565939, + "grad_norm": 0.32971230149269104, + "learning_rate": 0.00010109075465744208, + "loss": 0.1599, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 10040, + "tokens_per_second_per_gpu": 336.7 + }, + { + "epoch": 0.9991549435800566, + "grad_norm": 0.4897179901599884, + "learning_rate": 0.00010093381648668813, + "loss": 0.2485, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 10050, + "tokens_per_second_per_gpu": 303.41 + }, + { + "epoch": 1.0000994184023462, + "grad_norm": 0.23543600738048553, + "learning_rate": 0.00010077687601574678, + "loss": 0.2974, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 10060, + "tokens_per_second_per_gpu": 335.91 + }, + { + "epoch": 1.001093602425809, + "grad_norm": 0.30785781145095825, + "learning_rate": 0.00010061993363119566, + "loss": 0.155, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 10070, + "tokens_per_second_per_gpu": 313.67 + }, + { + "epoch": 1.0020877864492717, + "grad_norm": 0.36587706208229065, + "learning_rate": 0.00010046298971961695, + "loss": 0.2054, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 10080, + "tokens_per_second_per_gpu": 388.65 + }, + { + "epoch": 1.0030819704727345, + "grad_norm": 0.5172699093818665, + "learning_rate": 0.0001003060446675967, + "loss": 0.1678, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 10090, + "tokens_per_second_per_gpu": 356.25 + }, + { + "epoch": 1.0040761544961971, + "grad_norm": 0.3362785279750824, + "learning_rate": 0.00010014909886172377, + "loss": 0.1414, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 10100, + "tokens_per_second_per_gpu": 317.06 + }, + { + "epoch": 1.00507033851966, + "grad_norm": 0.5255736112594604, + "learning_rate": 9.99921526885888e-05, + "loss": 0.2086, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 10110, + "tokens_per_second_per_gpu": 428.98 + }, + { + "epoch": 1.0060645225431228, + "grad_norm": 0.4546601474285126, + "learning_rate": 9.983520653478343e-05, + "loss": 0.1895, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.64, + "memory/max_allocated (GiB)": 20.64, + "step": 10120, + "tokens_per_second_per_gpu": 304.97 + }, + { + "epoch": 1.0070587065665855, + "grad_norm": 0.33736681938171387, + "learning_rate": 9.967826078689919e-05, + "loss": 0.1643, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.67, + "memory/max_allocated (GiB)": 19.67, + "step": 10130, + "tokens_per_second_per_gpu": 337.84 + }, + { + "epoch": 1.0080528905900483, + "grad_norm": 0.2856805622577667, + "learning_rate": 9.952131583152665e-05, + "loss": 0.1171, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 10140, + "tokens_per_second_per_gpu": 320.63 + }, + { + "epoch": 1.009047074613511, + "grad_norm": 0.35713866353034973, + "learning_rate": 9.936437205525437e-05, + "loss": 0.1597, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 10150, + "tokens_per_second_per_gpu": 365.04 + }, + { + "epoch": 1.0100412586369738, + "grad_norm": 0.4092197120189667, + "learning_rate": 9.920742984466809e-05, + "loss": 0.2034, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 10160, + "tokens_per_second_per_gpu": 331.18 + }, + { + "epoch": 1.0110354426604364, + "grad_norm": 0.36163678765296936, + "learning_rate": 9.905048958634958e-05, + "loss": 0.1466, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 10170, + "tokens_per_second_per_gpu": 416.08 + }, + { + "epoch": 1.0120296266838993, + "grad_norm": 0.5861209630966187, + "learning_rate": 9.889355166687593e-05, + "loss": 0.1191, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 10180, + "tokens_per_second_per_gpu": 317.4 + }, + { + "epoch": 1.013023810707362, + "grad_norm": 0.4099638760089874, + "learning_rate": 9.873661647281836e-05, + "loss": 0.1707, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 10190, + "tokens_per_second_per_gpu": 380.48 + }, + { + "epoch": 1.0140179947308248, + "grad_norm": 0.5533301830291748, + "learning_rate": 9.857968439074142e-05, + "loss": 0.2481, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 10200, + "tokens_per_second_per_gpu": 355.99 + }, + { + "epoch": 1.0150121787542874, + "grad_norm": 0.4641655385494232, + "learning_rate": 9.842275580720205e-05, + "loss": 0.1997, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 10210, + "tokens_per_second_per_gpu": 392.6 + }, + { + "epoch": 1.0160063627777502, + "grad_norm": 0.5175334215164185, + "learning_rate": 9.826583110874847e-05, + "loss": 0.1672, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 10220, + "tokens_per_second_per_gpu": 338.18 + }, + { + "epoch": 1.0170005468012129, + "grad_norm": 0.2824496924877167, + "learning_rate": 9.810891068191942e-05, + "loss": 0.152, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 10230, + "tokens_per_second_per_gpu": 384.17 + }, + { + "epoch": 1.0179947308246757, + "grad_norm": 0.5829957127571106, + "learning_rate": 9.795199491324302e-05, + "loss": 0.1287, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 10240, + "tokens_per_second_per_gpu": 306.4 + }, + { + "epoch": 1.0189889148481384, + "grad_norm": 0.33510005474090576, + "learning_rate": 9.779508418923604e-05, + "loss": 0.1509, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.17, + "memory/max_allocated (GiB)": 18.17, + "step": 10250, + "tokens_per_second_per_gpu": 368.11 + }, + { + "epoch": 1.0199830988716012, + "grad_norm": 0.4509134888648987, + "learning_rate": 9.763817889640267e-05, + "loss": 0.1762, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 10260, + "tokens_per_second_per_gpu": 370.17 + }, + { + "epoch": 1.0209772828950638, + "grad_norm": 0.5804489254951477, + "learning_rate": 9.74812794212339e-05, + "loss": 0.1569, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 10270, + "tokens_per_second_per_gpu": 321.78 + }, + { + "epoch": 1.0219714669185267, + "grad_norm": 0.5096241235733032, + "learning_rate": 9.732438615020623e-05, + "loss": 0.1669, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 10280, + "tokens_per_second_per_gpu": 333.99 + }, + { + "epoch": 1.0229656509419893, + "grad_norm": 0.6617136001586914, + "learning_rate": 9.716749946978102e-05, + "loss": 0.1857, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 10290, + "tokens_per_second_per_gpu": 260.5 + }, + { + "epoch": 1.0239598349654522, + "grad_norm": 0.5293213725090027, + "learning_rate": 9.701061976640323e-05, + "loss": 0.1787, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 10300, + "tokens_per_second_per_gpu": 382.76 + }, + { + "epoch": 1.0249540189889148, + "grad_norm": 0.6056047081947327, + "learning_rate": 9.685374742650083e-05, + "loss": 0.1412, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.62, + "memory/max_allocated (GiB)": 20.62, + "step": 10310, + "tokens_per_second_per_gpu": 350.37 + }, + { + "epoch": 1.0259482030123777, + "grad_norm": 0.2792462110519409, + "learning_rate": 9.669688283648344e-05, + "loss": 0.1149, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 10320, + "tokens_per_second_per_gpu": 269.87 + }, + { + "epoch": 1.0269423870358403, + "grad_norm": 0.4464641511440277, + "learning_rate": 9.654002638274176e-05, + "loss": 0.1829, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 10330, + "tokens_per_second_per_gpu": 411.8 + }, + { + "epoch": 1.0279365710593031, + "grad_norm": 0.4715180993080139, + "learning_rate": 9.638317845164639e-05, + "loss": 0.1695, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 10340, + "tokens_per_second_per_gpu": 271.86 + }, + { + "epoch": 1.0289307550827658, + "grad_norm": 0.43288466334342957, + "learning_rate": 9.622633942954693e-05, + "loss": 0.1405, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 10350, + "tokens_per_second_per_gpu": 371.24 + }, + { + "epoch": 1.0299249391062286, + "grad_norm": 0.4133097529411316, + "learning_rate": 9.606950970277106e-05, + "loss": 0.1613, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 10360, + "tokens_per_second_per_gpu": 335.26 + }, + { + "epoch": 1.0309191231296913, + "grad_norm": 0.36935463547706604, + "learning_rate": 9.591268965762348e-05, + "loss": 0.1809, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 10370, + "tokens_per_second_per_gpu": 363.08 + }, + { + "epoch": 1.031913307153154, + "grad_norm": 0.6092913746833801, + "learning_rate": 9.57558796803852e-05, + "loss": 0.1507, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 10380, + "tokens_per_second_per_gpu": 296.26 + }, + { + "epoch": 1.0329074911766167, + "grad_norm": 0.24895969033241272, + "learning_rate": 9.559908015731223e-05, + "loss": 0.1674, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 10390, + "tokens_per_second_per_gpu": 306.56 + }, + { + "epoch": 1.0339016752000796, + "grad_norm": 0.3319539725780487, + "learning_rate": 9.544229147463502e-05, + "loss": 0.1683, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 10400, + "tokens_per_second_per_gpu": 365.18 + }, + { + "epoch": 1.0348958592235422, + "grad_norm": 0.3731245994567871, + "learning_rate": 9.528551401855718e-05, + "loss": 0.1347, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 10410, + "tokens_per_second_per_gpu": 304.91 + }, + { + "epoch": 1.035890043247005, + "grad_norm": 0.42913174629211426, + "learning_rate": 9.512874817525474e-05, + "loss": 0.1558, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.4, + "memory/max_allocated (GiB)": 21.4, + "step": 10420, + "tokens_per_second_per_gpu": 368.64 + }, + { + "epoch": 1.0368842272704677, + "grad_norm": 0.5695579648017883, + "learning_rate": 9.49719943308751e-05, + "loss": 0.1816, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 10430, + "tokens_per_second_per_gpu": 382.9 + }, + { + "epoch": 1.0378784112939305, + "grad_norm": 0.659853458404541, + "learning_rate": 9.481525287153616e-05, + "loss": 0.1885, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 10440, + "tokens_per_second_per_gpu": 362.22 + }, + { + "epoch": 1.0388725953173932, + "grad_norm": 0.5355104207992554, + "learning_rate": 9.465852418332518e-05, + "loss": 0.192, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 10450, + "tokens_per_second_per_gpu": 422.56 + }, + { + "epoch": 1.039866779340856, + "grad_norm": 0.6618303060531616, + "learning_rate": 9.450180865229807e-05, + "loss": 0.1468, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 10460, + "tokens_per_second_per_gpu": 331.93 + }, + { + "epoch": 1.0408609633643187, + "grad_norm": 0.4398798644542694, + "learning_rate": 9.434510666447838e-05, + "loss": 0.1703, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 10470, + "tokens_per_second_per_gpu": 332.46 + }, + { + "epoch": 1.0418551473877815, + "grad_norm": 0.25756803154945374, + "learning_rate": 9.41884186058561e-05, + "loss": 0.1676, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 10480, + "tokens_per_second_per_gpu": 332.3 + }, + { + "epoch": 1.0428493314112441, + "grad_norm": 0.2897964417934418, + "learning_rate": 9.403174486238714e-05, + "loss": 0.1372, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 10490, + "tokens_per_second_per_gpu": 315.04 + }, + { + "epoch": 1.043843515434707, + "grad_norm": 0.2930709719657898, + "learning_rate": 9.387508581999197e-05, + "loss": 0.1843, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 10500, + "tokens_per_second_per_gpu": 364.97 + }, + { + "epoch": 1.0448376994581696, + "grad_norm": 0.3903422951698303, + "learning_rate": 9.371844186455501e-05, + "loss": 0.2168, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 10510, + "tokens_per_second_per_gpu": 377.16 + }, + { + "epoch": 1.0458318834816325, + "grad_norm": 0.3957577049732208, + "learning_rate": 9.356181338192332e-05, + "loss": 0.1676, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 10520, + "tokens_per_second_per_gpu": 306.27 + }, + { + "epoch": 1.046826067505095, + "grad_norm": 0.5264449119567871, + "learning_rate": 9.340520075790606e-05, + "loss": 0.1574, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.37, + "memory/max_allocated (GiB)": 21.37, + "step": 10530, + "tokens_per_second_per_gpu": 322.3 + }, + { + "epoch": 1.047820251528558, + "grad_norm": 0.5075907111167908, + "learning_rate": 9.324860437827312e-05, + "loss": 0.1808, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 10540, + "tokens_per_second_per_gpu": 375.27 + }, + { + "epoch": 1.0488144355520206, + "grad_norm": 0.28377336263656616, + "learning_rate": 9.309202462875457e-05, + "loss": 0.1528, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 10550, + "tokens_per_second_per_gpu": 356.22 + }, + { + "epoch": 1.0498086195754834, + "grad_norm": 0.4088289439678192, + "learning_rate": 9.293546189503938e-05, + "loss": 0.1717, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 10560, + "tokens_per_second_per_gpu": 355.64 + }, + { + "epoch": 1.050802803598946, + "grad_norm": 0.1233486533164978, + "learning_rate": 9.27789165627747e-05, + "loss": 0.1417, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 10570, + "tokens_per_second_per_gpu": 378.24 + }, + { + "epoch": 1.051796987622409, + "grad_norm": 0.54539555311203, + "learning_rate": 9.26223890175647e-05, + "loss": 0.131, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 10580, + "tokens_per_second_per_gpu": 321.17 + }, + { + "epoch": 1.0527911716458715, + "grad_norm": 0.3036106526851654, + "learning_rate": 9.246587964496984e-05, + "loss": 0.1601, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 10590, + "tokens_per_second_per_gpu": 327.08 + }, + { + "epoch": 1.0537853556693344, + "grad_norm": 0.27965182065963745, + "learning_rate": 9.230938883050581e-05, + "loss": 0.1604, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.84, + "memory/max_allocated (GiB)": 18.84, + "step": 10600, + "tokens_per_second_per_gpu": 350.41 + }, + { + "epoch": 1.054779539692797, + "grad_norm": 0.33622997999191284, + "learning_rate": 9.215291695964252e-05, + "loss": 0.1158, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 10610, + "tokens_per_second_per_gpu": 317.77 + }, + { + "epoch": 1.0557737237162599, + "grad_norm": 0.35641202330589294, + "learning_rate": 9.199646441780332e-05, + "loss": 0.1452, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.17, + "memory/max_allocated (GiB)": 18.17, + "step": 10620, + "tokens_per_second_per_gpu": 362.54 + }, + { + "epoch": 1.0567679077397227, + "grad_norm": 0.612156331539154, + "learning_rate": 9.184003159036379e-05, + "loss": 0.2526, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.77, + "memory/max_allocated (GiB)": 20.77, + "step": 10630, + "tokens_per_second_per_gpu": 406.46 + }, + { + "epoch": 1.0577620917631854, + "grad_norm": 0.38296809792518616, + "learning_rate": 9.168361886265113e-05, + "loss": 0.162, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 10640, + "tokens_per_second_per_gpu": 319.6 + }, + { + "epoch": 1.0587562757866482, + "grad_norm": 0.35586997866630554, + "learning_rate": 9.15272266199429e-05, + "loss": 0.1507, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.4, + "memory/max_allocated (GiB)": 21.4, + "step": 10650, + "tokens_per_second_per_gpu": 336.67 + }, + { + "epoch": 1.0597504598101108, + "grad_norm": 0.2866521179676056, + "learning_rate": 9.13708552474663e-05, + "loss": 0.1771, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 10660, + "tokens_per_second_per_gpu": 376.96 + }, + { + "epoch": 1.0607446438335737, + "grad_norm": 0.3502759635448456, + "learning_rate": 9.1214505130397e-05, + "loss": 0.1399, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.64, + "memory/max_allocated (GiB)": 20.64, + "step": 10670, + "tokens_per_second_per_gpu": 341.29 + }, + { + "epoch": 1.0617388278570363, + "grad_norm": 0.3619881570339203, + "learning_rate": 9.105817665385846e-05, + "loss": 0.1584, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 10680, + "tokens_per_second_per_gpu": 336.33 + }, + { + "epoch": 1.0627330118804992, + "grad_norm": 0.6374160051345825, + "learning_rate": 9.090187020292068e-05, + "loss": 0.2043, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 10690, + "tokens_per_second_per_gpu": 381.46 + }, + { + "epoch": 1.0637271959039618, + "grad_norm": 0.486200749874115, + "learning_rate": 9.074558616259954e-05, + "loss": 0.1564, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 10700, + "tokens_per_second_per_gpu": 337.08 + }, + { + "epoch": 1.0647213799274247, + "grad_norm": 0.8452271223068237, + "learning_rate": 9.058932491785564e-05, + "loss": 0.1719, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 10710, + "tokens_per_second_per_gpu": 342.56 + }, + { + "epoch": 1.0657155639508873, + "grad_norm": 0.5657733082771301, + "learning_rate": 9.043308685359344e-05, + "loss": 0.184, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 10720, + "tokens_per_second_per_gpu": 331.5 + }, + { + "epoch": 1.0667097479743501, + "grad_norm": 0.3643028438091278, + "learning_rate": 9.027687235466038e-05, + "loss": 0.1483, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 10730, + "tokens_per_second_per_gpu": 335.73 + }, + { + "epoch": 1.0677039319978128, + "grad_norm": 0.6992392539978027, + "learning_rate": 9.012068180584569e-05, + "loss": 0.1743, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 10740, + "tokens_per_second_per_gpu": 370.01 + }, + { + "epoch": 1.0686981160212756, + "grad_norm": 0.5826465487480164, + "learning_rate": 8.996451559187981e-05, + "loss": 0.1825, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 10750, + "tokens_per_second_per_gpu": 345.47 + }, + { + "epoch": 1.0696923000447383, + "grad_norm": 0.3841560184955597, + "learning_rate": 8.980837409743304e-05, + "loss": 0.205, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 10760, + "tokens_per_second_per_gpu": 396.05 + }, + { + "epoch": 1.070686484068201, + "grad_norm": 0.6366297006607056, + "learning_rate": 8.965225770711493e-05, + "loss": 0.1274, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 10770, + "tokens_per_second_per_gpu": 318.68 + }, + { + "epoch": 1.0716806680916637, + "grad_norm": 0.3577064275741577, + "learning_rate": 8.94961668054731e-05, + "loss": 0.1986, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 10780, + "tokens_per_second_per_gpu": 380.85 + }, + { + "epoch": 1.0726748521151266, + "grad_norm": 0.4104318618774414, + "learning_rate": 8.934010177699252e-05, + "loss": 0.1747, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 10790, + "tokens_per_second_per_gpu": 301.88 + }, + { + "epoch": 1.0736690361385892, + "grad_norm": 0.6288489103317261, + "learning_rate": 8.918406300609424e-05, + "loss": 0.1995, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 10800, + "tokens_per_second_per_gpu": 334.51 + }, + { + "epoch": 1.074663220162052, + "grad_norm": 0.4445495009422302, + "learning_rate": 8.902805087713482e-05, + "loss": 0.1503, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 10810, + "tokens_per_second_per_gpu": 300.06 + }, + { + "epoch": 1.0756574041855147, + "grad_norm": 0.4280208945274353, + "learning_rate": 8.887206577440502e-05, + "loss": 0.1843, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 10820, + "tokens_per_second_per_gpu": 371.92 + }, + { + "epoch": 1.0766515882089776, + "grad_norm": 0.5971205830574036, + "learning_rate": 8.871610808212918e-05, + "loss": 0.202, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 10830, + "tokens_per_second_per_gpu": 361.5 + }, + { + "epoch": 1.0776457722324402, + "grad_norm": 0.20293696224689484, + "learning_rate": 8.856017818446402e-05, + "loss": 0.191, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.46, + "memory/max_allocated (GiB)": 20.46, + "step": 10840, + "tokens_per_second_per_gpu": 292.31 + }, + { + "epoch": 1.078639956255903, + "grad_norm": 0.5575738549232483, + "learning_rate": 8.840427646549788e-05, + "loss": 0.1699, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 10850, + "tokens_per_second_per_gpu": 313.54 + }, + { + "epoch": 1.0796341402793657, + "grad_norm": 0.5824712514877319, + "learning_rate": 8.824840330924959e-05, + "loss": 0.191, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 10860, + "tokens_per_second_per_gpu": 344.34 + }, + { + "epoch": 1.0806283243028285, + "grad_norm": 0.6753024458885193, + "learning_rate": 8.809255909966771e-05, + "loss": 0.1255, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 10870, + "tokens_per_second_per_gpu": 298.62 + }, + { + "epoch": 1.0816225083262911, + "grad_norm": 0.6008898019790649, + "learning_rate": 8.793674422062949e-05, + "loss": 0.1856, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 10880, + "tokens_per_second_per_gpu": 399.03 + }, + { + "epoch": 1.082616692349754, + "grad_norm": 0.507643461227417, + "learning_rate": 8.778095905593986e-05, + "loss": 0.1631, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 10890, + "tokens_per_second_per_gpu": 359.61 + }, + { + "epoch": 1.0836108763732166, + "grad_norm": 0.43057945370674133, + "learning_rate": 8.762520398933065e-05, + "loss": 0.1861, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 10900, + "tokens_per_second_per_gpu": 359.02 + }, + { + "epoch": 1.0846050603966795, + "grad_norm": 0.20500341057777405, + "learning_rate": 8.746947940445946e-05, + "loss": 0.1294, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 10910, + "tokens_per_second_per_gpu": 333.37 + }, + { + "epoch": 1.085599244420142, + "grad_norm": 0.5553747415542603, + "learning_rate": 8.73137856849089e-05, + "loss": 0.1633, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 10920, + "tokens_per_second_per_gpu": 269.33 + }, + { + "epoch": 1.086593428443605, + "grad_norm": 0.4564213156700134, + "learning_rate": 8.715812321418546e-05, + "loss": 0.1754, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 10930, + "tokens_per_second_per_gpu": 367.99 + }, + { + "epoch": 1.0875876124670676, + "grad_norm": 0.3952537178993225, + "learning_rate": 8.700249237571879e-05, + "loss": 0.1829, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 10940, + "tokens_per_second_per_gpu": 370.04 + }, + { + "epoch": 1.0885817964905304, + "grad_norm": 0.45536085963249207, + "learning_rate": 8.684689355286045e-05, + "loss": 0.1488, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.4, + "memory/max_allocated (GiB)": 21.4, + "step": 10950, + "tokens_per_second_per_gpu": 357.96 + }, + { + "epoch": 1.089575980513993, + "grad_norm": 0.45513296127319336, + "learning_rate": 8.669132712888328e-05, + "loss": 0.187, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 10960, + "tokens_per_second_per_gpu": 376.08 + }, + { + "epoch": 1.090570164537456, + "grad_norm": 0.44525572657585144, + "learning_rate": 8.653579348698021e-05, + "loss": 0.142, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 10970, + "tokens_per_second_per_gpu": 307.76 + }, + { + "epoch": 1.0915643485609186, + "grad_norm": 0.5557878613471985, + "learning_rate": 8.638029301026351e-05, + "loss": 0.2135, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.82, + "memory/max_allocated (GiB)": 19.82, + "step": 10980, + "tokens_per_second_per_gpu": 408.3 + }, + { + "epoch": 1.0925585325843814, + "grad_norm": 0.3728674054145813, + "learning_rate": 8.622482608176374e-05, + "loss": 0.174, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 10990, + "tokens_per_second_per_gpu": 309.34 + }, + { + "epoch": 1.093552716607844, + "grad_norm": 0.38084620237350464, + "learning_rate": 8.606939308442877e-05, + "loss": 0.1752, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.24, + "memory/max_allocated (GiB)": 19.24, + "step": 11000, + "tokens_per_second_per_gpu": 280.4 + }, + { + "epoch": 1.0945469006313069, + "grad_norm": 0.47729748487472534, + "learning_rate": 8.591399440112296e-05, + "loss": 0.1787, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 11010, + "tokens_per_second_per_gpu": 344.64 + }, + { + "epoch": 1.0955410846547695, + "grad_norm": 0.567870020866394, + "learning_rate": 8.575863041462603e-05, + "loss": 0.176, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 11020, + "tokens_per_second_per_gpu": 353.03 + }, + { + "epoch": 1.0965352686782324, + "grad_norm": 0.37033653259277344, + "learning_rate": 8.560330150763243e-05, + "loss": 0.1765, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 11030, + "tokens_per_second_per_gpu": 329.79 + }, + { + "epoch": 1.097529452701695, + "grad_norm": 0.6800894141197205, + "learning_rate": 8.544800806274998e-05, + "loss": 0.2017, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 11040, + "tokens_per_second_per_gpu": 303.81 + }, + { + "epoch": 1.0985236367251578, + "grad_norm": 0.6861316561698914, + "learning_rate": 8.529275046249934e-05, + "loss": 0.245, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.53, + "memory/max_allocated (GiB)": 20.53, + "step": 11050, + "tokens_per_second_per_gpu": 400.94 + }, + { + "epoch": 1.0995178207486205, + "grad_norm": 0.5849536657333374, + "learning_rate": 8.513752908931273e-05, + "loss": 0.1653, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 11060, + "tokens_per_second_per_gpu": 383.81 + }, + { + "epoch": 1.1005120047720833, + "grad_norm": 0.43626952171325684, + "learning_rate": 8.498234432553328e-05, + "loss": 0.1368, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 11070, + "tokens_per_second_per_gpu": 332.97 + }, + { + "epoch": 1.101506188795546, + "grad_norm": 0.5107852816581726, + "learning_rate": 8.482719655341374e-05, + "loss": 0.1739, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.54, + "memory/max_allocated (GiB)": 21.54, + "step": 11080, + "tokens_per_second_per_gpu": 333.13 + }, + { + "epoch": 1.1025003728190088, + "grad_norm": 0.4791623651981354, + "learning_rate": 8.467208615511599e-05, + "loss": 0.1172, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 11090, + "tokens_per_second_per_gpu": 310.57 + }, + { + "epoch": 1.1034945568424714, + "grad_norm": 0.4793543219566345, + "learning_rate": 8.451701351270965e-05, + "loss": 0.168, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 11100, + "tokens_per_second_per_gpu": 280.35 + }, + { + "epoch": 1.1044887408659343, + "grad_norm": 0.41162610054016113, + "learning_rate": 8.436197900817145e-05, + "loss": 0.1585, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 11110, + "tokens_per_second_per_gpu": 368.7 + }, + { + "epoch": 1.105482924889397, + "grad_norm": 0.7751170992851257, + "learning_rate": 8.420698302338407e-05, + "loss": 0.165, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 11120, + "tokens_per_second_per_gpu": 345.65 + }, + { + "epoch": 1.1064771089128598, + "grad_norm": 0.2921695113182068, + "learning_rate": 8.405202594013546e-05, + "loss": 0.1529, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.81, + "memory/max_allocated (GiB)": 19.81, + "step": 11130, + "tokens_per_second_per_gpu": 338.72 + }, + { + "epoch": 1.1074712929363226, + "grad_norm": 0.5489593744277954, + "learning_rate": 8.389710814011764e-05, + "loss": 0.201, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 11140, + "tokens_per_second_per_gpu": 401.46 + }, + { + "epoch": 1.1084654769597853, + "grad_norm": 0.5984140634536743, + "learning_rate": 8.37422300049259e-05, + "loss": 0.1832, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.46, + "memory/max_allocated (GiB)": 20.46, + "step": 11150, + "tokens_per_second_per_gpu": 314.29 + }, + { + "epoch": 1.1094596609832479, + "grad_norm": 0.4428146779537201, + "learning_rate": 8.358739191605783e-05, + "loss": 0.1515, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 11160, + "tokens_per_second_per_gpu": 295.44 + }, + { + "epoch": 1.1104538450067107, + "grad_norm": 0.4830903112888336, + "learning_rate": 8.343259425491234e-05, + "loss": 0.1363, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 11170, + "tokens_per_second_per_gpu": 313.11 + }, + { + "epoch": 1.1114480290301736, + "grad_norm": 0.8339301943778992, + "learning_rate": 8.327783740278882e-05, + "loss": 0.1691, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 11180, + "tokens_per_second_per_gpu": 309.19 + }, + { + "epoch": 1.1124422130536362, + "grad_norm": 0.30678462982177734, + "learning_rate": 8.312312174088606e-05, + "loss": 0.1616, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.18, + "memory/max_allocated (GiB)": 18.18, + "step": 11190, + "tokens_per_second_per_gpu": 349.77 + }, + { + "epoch": 1.113436397077099, + "grad_norm": 0.1882571429014206, + "learning_rate": 8.296844765030147e-05, + "loss": 0.1807, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 11200, + "tokens_per_second_per_gpu": 331.03 + }, + { + "epoch": 1.1144305811005617, + "grad_norm": 0.5480899214744568, + "learning_rate": 8.281381551203e-05, + "loss": 0.1784, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 11210, + "tokens_per_second_per_gpu": 367.42 + }, + { + "epoch": 1.1154247651240246, + "grad_norm": 0.3690171241760254, + "learning_rate": 8.265922570696336e-05, + "loss": 0.1421, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 11220, + "tokens_per_second_per_gpu": 338.8 + }, + { + "epoch": 1.1164189491474872, + "grad_norm": 0.3193921744823456, + "learning_rate": 8.250467861588879e-05, + "loss": 0.1304, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 11230, + "tokens_per_second_per_gpu": 316.67 + }, + { + "epoch": 1.11741313317095, + "grad_norm": 0.4590331017971039, + "learning_rate": 8.235017461948858e-05, + "loss": 0.1901, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 11240, + "tokens_per_second_per_gpu": 371.35 + }, + { + "epoch": 1.1184073171944127, + "grad_norm": 0.5001057386398315, + "learning_rate": 8.219571409833862e-05, + "loss": 0.1895, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 11250, + "tokens_per_second_per_gpu": 379.59 + }, + { + "epoch": 1.1194015012178755, + "grad_norm": 0.5432204008102417, + "learning_rate": 8.204129743290783e-05, + "loss": 0.1579, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 11260, + "tokens_per_second_per_gpu": 294.03 + }, + { + "epoch": 1.1203956852413381, + "grad_norm": 0.4477890431880951, + "learning_rate": 8.188692500355716e-05, + "loss": 0.23, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 11270, + "tokens_per_second_per_gpu": 348.24 + }, + { + "epoch": 1.121389869264801, + "grad_norm": 0.5133992433547974, + "learning_rate": 8.173259719053847e-05, + "loss": 0.1724, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 11280, + "tokens_per_second_per_gpu": 379.58 + }, + { + "epoch": 1.1223840532882636, + "grad_norm": 0.369495153427124, + "learning_rate": 8.157831437399383e-05, + "loss": 0.1225, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 11290, + "tokens_per_second_per_gpu": 341.16 + }, + { + "epoch": 1.1233782373117265, + "grad_norm": 0.30792126059532166, + "learning_rate": 8.142407693395438e-05, + "loss": 0.1445, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 11300, + "tokens_per_second_per_gpu": 301.05 + }, + { + "epoch": 1.124372421335189, + "grad_norm": 0.3198167085647583, + "learning_rate": 8.126988525033958e-05, + "loss": 0.1675, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 11310, + "tokens_per_second_per_gpu": 310.65 + }, + { + "epoch": 1.125366605358652, + "grad_norm": 0.6837435960769653, + "learning_rate": 8.111573970295607e-05, + "loss": 0.2455, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 11320, + "tokens_per_second_per_gpu": 354.62 + }, + { + "epoch": 1.1263607893821146, + "grad_norm": 0.5170619487762451, + "learning_rate": 8.096164067149701e-05, + "loss": 0.1768, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 11330, + "tokens_per_second_per_gpu": 287.06 + }, + { + "epoch": 1.1273549734055774, + "grad_norm": 0.4704976975917816, + "learning_rate": 8.080758853554075e-05, + "loss": 0.1772, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 11340, + "tokens_per_second_per_gpu": 308.38 + }, + { + "epoch": 1.12834915742904, + "grad_norm": 0.5932011008262634, + "learning_rate": 8.065358367455038e-05, + "loss": 0.1593, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 11350, + "tokens_per_second_per_gpu": 348.84 + }, + { + "epoch": 1.129343341452503, + "grad_norm": 0.6632476449012756, + "learning_rate": 8.049962646787235e-05, + "loss": 0.1394, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 11360, + "tokens_per_second_per_gpu": 250.63 + }, + { + "epoch": 1.1303375254759656, + "grad_norm": 0.5370444655418396, + "learning_rate": 8.034571729473587e-05, + "loss": 0.1832, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 11370, + "tokens_per_second_per_gpu": 326.76 + }, + { + "epoch": 1.1313317094994284, + "grad_norm": 0.42281845211982727, + "learning_rate": 8.019185653425168e-05, + "loss": 0.1373, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 11380, + "tokens_per_second_per_gpu": 307.3 + }, + { + "epoch": 1.132325893522891, + "grad_norm": 0.27983081340789795, + "learning_rate": 8.00380445654114e-05, + "loss": 0.1607, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 11390, + "tokens_per_second_per_gpu": 330.95 + }, + { + "epoch": 1.1333200775463539, + "grad_norm": 0.6440351009368896, + "learning_rate": 7.988428176708644e-05, + "loss": 0.2021, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 11400, + "tokens_per_second_per_gpu": 397.31 + }, + { + "epoch": 1.1343142615698165, + "grad_norm": 0.4090474843978882, + "learning_rate": 7.9730568518027e-05, + "loss": 0.2105, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 11410, + "tokens_per_second_per_gpu": 342.86 + }, + { + "epoch": 1.1353084455932794, + "grad_norm": 0.2903016209602356, + "learning_rate": 7.957690519686137e-05, + "loss": 0.1956, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 11420, + "tokens_per_second_per_gpu": 362.76 + }, + { + "epoch": 1.136302629616742, + "grad_norm": 0.5340930223464966, + "learning_rate": 7.942329218209474e-05, + "loss": 0.1982, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 11430, + "tokens_per_second_per_gpu": 312.98 + }, + { + "epoch": 1.1372968136402049, + "grad_norm": 0.515466570854187, + "learning_rate": 7.926972985210848e-05, + "loss": 0.1499, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 11440, + "tokens_per_second_per_gpu": 289.34 + }, + { + "epoch": 1.1382909976636675, + "grad_norm": 0.29830217361450195, + "learning_rate": 7.911621858515901e-05, + "loss": 0.1676, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 11450, + "tokens_per_second_per_gpu": 360.42 + }, + { + "epoch": 1.1392851816871303, + "grad_norm": 0.4327114224433899, + "learning_rate": 7.896275875937709e-05, + "loss": 0.1715, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.68, + "memory/max_allocated (GiB)": 19.68, + "step": 11460, + "tokens_per_second_per_gpu": 334.12 + }, + { + "epoch": 1.140279365710593, + "grad_norm": 0.5234288573265076, + "learning_rate": 7.880935075276663e-05, + "loss": 0.1645, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 11470, + "tokens_per_second_per_gpu": 263.98 + }, + { + "epoch": 1.1412735497340558, + "grad_norm": 0.35436221957206726, + "learning_rate": 7.865599494320402e-05, + "loss": 0.1548, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 11480, + "tokens_per_second_per_gpu": 328.71 + }, + { + "epoch": 1.1422677337575184, + "grad_norm": 0.5308985710144043, + "learning_rate": 7.850269170843702e-05, + "loss": 0.1843, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.23, + "memory/max_allocated (GiB)": 19.23, + "step": 11490, + "tokens_per_second_per_gpu": 341.98 + }, + { + "epoch": 1.1432619177809813, + "grad_norm": 0.4634507894515991, + "learning_rate": 7.834944142608394e-05, + "loss": 0.2258, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 11500, + "tokens_per_second_per_gpu": 409.58 + }, + { + "epoch": 1.144256101804444, + "grad_norm": 0.5410236716270447, + "learning_rate": 7.819624447363252e-05, + "loss": 0.1471, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 11510, + "tokens_per_second_per_gpu": 252.15 + }, + { + "epoch": 1.1452502858279068, + "grad_norm": 0.3912404477596283, + "learning_rate": 7.80431012284393e-05, + "loss": 0.1358, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 11520, + "tokens_per_second_per_gpu": 310.18 + }, + { + "epoch": 1.1462444698513694, + "grad_norm": 0.2692343592643738, + "learning_rate": 7.789001206772849e-05, + "loss": 0.1441, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 11530, + "tokens_per_second_per_gpu": 230.07 + }, + { + "epoch": 1.1472386538748323, + "grad_norm": 0.44046950340270996, + "learning_rate": 7.773697736859098e-05, + "loss": 0.1424, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 11540, + "tokens_per_second_per_gpu": 351.3 + }, + { + "epoch": 1.148232837898295, + "grad_norm": 0.48867931962013245, + "learning_rate": 7.758399750798364e-05, + "loss": 0.194, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.77, + "memory/max_allocated (GiB)": 19.77, + "step": 11550, + "tokens_per_second_per_gpu": 280.59 + }, + { + "epoch": 1.1492270219217577, + "grad_norm": 0.3554067313671112, + "learning_rate": 7.743107286272812e-05, + "loss": 0.1481, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 11560, + "tokens_per_second_per_gpu": 367.0 + }, + { + "epoch": 1.1502212059452204, + "grad_norm": 0.4472406506538391, + "learning_rate": 7.727820380951022e-05, + "loss": 0.1755, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 11570, + "tokens_per_second_per_gpu": 291.81 + }, + { + "epoch": 1.1512153899686832, + "grad_norm": 0.47527438402175903, + "learning_rate": 7.712539072487867e-05, + "loss": 0.18, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.99, + "memory/max_allocated (GiB)": 19.99, + "step": 11580, + "tokens_per_second_per_gpu": 312.04 + }, + { + "epoch": 1.1522095739921459, + "grad_norm": 0.4945676624774933, + "learning_rate": 7.697263398524448e-05, + "loss": 0.1479, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 11590, + "tokens_per_second_per_gpu": 271.43 + }, + { + "epoch": 1.1532037580156087, + "grad_norm": 0.3966761529445648, + "learning_rate": 7.681993396687968e-05, + "loss": 0.1455, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 11600, + "tokens_per_second_per_gpu": 324.63 + }, + { + "epoch": 1.1541979420390716, + "grad_norm": 0.34350907802581787, + "learning_rate": 7.666729104591678e-05, + "loss": 0.1397, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 11610, + "tokens_per_second_per_gpu": 292.42 + }, + { + "epoch": 1.1551921260625342, + "grad_norm": 0.5656160712242126, + "learning_rate": 7.651470559834747e-05, + "loss": 0.1825, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 11620, + "tokens_per_second_per_gpu": 367.89 + }, + { + "epoch": 1.1561863100859968, + "grad_norm": 0.4410739541053772, + "learning_rate": 7.636217800002203e-05, + "loss": 0.1718, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 11630, + "tokens_per_second_per_gpu": 365.13 + }, + { + "epoch": 1.1571804941094597, + "grad_norm": 0.5351715087890625, + "learning_rate": 7.620970862664811e-05, + "loss": 0.1638, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.53, + "memory/max_allocated (GiB)": 21.53, + "step": 11640, + "tokens_per_second_per_gpu": 301.44 + }, + { + "epoch": 1.1581746781329225, + "grad_norm": 0.5122233033180237, + "learning_rate": 7.605729785379005e-05, + "loss": 0.2124, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.46, + "memory/max_allocated (GiB)": 20.46, + "step": 11650, + "tokens_per_second_per_gpu": 339.63 + }, + { + "epoch": 1.1591688621563851, + "grad_norm": 0.49694183468818665, + "learning_rate": 7.590494605686781e-05, + "loss": 0.1758, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 11660, + "tokens_per_second_per_gpu": 332.59 + }, + { + "epoch": 1.1601630461798478, + "grad_norm": 0.5573239922523499, + "learning_rate": 7.5752653611156e-05, + "loss": 0.1506, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 11670, + "tokens_per_second_per_gpu": 326.32 + }, + { + "epoch": 1.1611572302033106, + "grad_norm": 0.4641178250312805, + "learning_rate": 7.560042089178319e-05, + "loss": 0.1212, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 11680, + "tokens_per_second_per_gpu": 253.53 + }, + { + "epoch": 1.1621514142267735, + "grad_norm": 0.2381938248872757, + "learning_rate": 7.544824827373064e-05, + "loss": 0.1489, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 11690, + "tokens_per_second_per_gpu": 326.27 + }, + { + "epoch": 1.1631455982502361, + "grad_norm": 0.5554631948471069, + "learning_rate": 7.529613613183174e-05, + "loss": 0.2159, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 11700, + "tokens_per_second_per_gpu": 373.55 + }, + { + "epoch": 1.1641397822736987, + "grad_norm": 0.3771873414516449, + "learning_rate": 7.514408484077081e-05, + "loss": 0.1905, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 11710, + "tokens_per_second_per_gpu": 313.67 + }, + { + "epoch": 1.1651339662971616, + "grad_norm": 0.5581318736076355, + "learning_rate": 7.499209477508238e-05, + "loss": 0.1409, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 11720, + "tokens_per_second_per_gpu": 259.67 + }, + { + "epoch": 1.1661281503206244, + "grad_norm": 0.4785107970237732, + "learning_rate": 7.484016630915003e-05, + "loss": 0.1236, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 11730, + "tokens_per_second_per_gpu": 309.21 + }, + { + "epoch": 1.167122334344087, + "grad_norm": 0.48833346366882324, + "learning_rate": 7.468829981720574e-05, + "loss": 0.1259, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.41, + "memory/max_allocated (GiB)": 21.41, + "step": 11740, + "tokens_per_second_per_gpu": 348.37 + }, + { + "epoch": 1.16811651836755, + "grad_norm": 0.3307853639125824, + "learning_rate": 7.453649567332871e-05, + "loss": 0.1447, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 11750, + "tokens_per_second_per_gpu": 261.54 + }, + { + "epoch": 1.1691107023910126, + "grad_norm": 0.6457106471061707, + "learning_rate": 7.438475425144469e-05, + "loss": 0.1734, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 11760, + "tokens_per_second_per_gpu": 323.76 + }, + { + "epoch": 1.1701048864144754, + "grad_norm": 0.4975489675998688, + "learning_rate": 7.423307592532484e-05, + "loss": 0.1597, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.18, + "memory/max_allocated (GiB)": 18.18, + "step": 11770, + "tokens_per_second_per_gpu": 333.31 + }, + { + "epoch": 1.171099070437938, + "grad_norm": 0.5595365166664124, + "learning_rate": 7.408146106858496e-05, + "loss": 0.1448, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 11780, + "tokens_per_second_per_gpu": 302.61 + }, + { + "epoch": 1.172093254461401, + "grad_norm": 0.5762649178504944, + "learning_rate": 7.392991005468449e-05, + "loss": 0.1829, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 11790, + "tokens_per_second_per_gpu": 323.36 + }, + { + "epoch": 1.1730874384848635, + "grad_norm": 0.4875398278236389, + "learning_rate": 7.377842325692557e-05, + "loss": 0.1615, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.54, + "memory/max_allocated (GiB)": 21.54, + "step": 11800, + "tokens_per_second_per_gpu": 304.43 + }, + { + "epoch": 1.1740816225083264, + "grad_norm": 0.7832923531532288, + "learning_rate": 7.362700104845226e-05, + "loss": 0.1935, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 11810, + "tokens_per_second_per_gpu": 332.32 + }, + { + "epoch": 1.175075806531789, + "grad_norm": 0.39051079750061035, + "learning_rate": 7.34756438022494e-05, + "loss": 0.1337, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 11820, + "tokens_per_second_per_gpu": 287.23 + }, + { + "epoch": 1.1760699905552519, + "grad_norm": 0.3700825273990631, + "learning_rate": 7.332435189114194e-05, + "loss": 0.1535, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 11830, + "tokens_per_second_per_gpu": 341.88 + }, + { + "epoch": 1.1770641745787145, + "grad_norm": 0.6411652565002441, + "learning_rate": 7.317312568779375e-05, + "loss": 0.1586, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 11840, + "tokens_per_second_per_gpu": 334.43 + }, + { + "epoch": 1.1780583586021773, + "grad_norm": 0.46753886342048645, + "learning_rate": 7.302196556470701e-05, + "loss": 0.1639, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.66, + "memory/max_allocated (GiB)": 19.66, + "step": 11850, + "tokens_per_second_per_gpu": 329.98 + }, + { + "epoch": 1.17905254262564, + "grad_norm": 0.7362424731254578, + "learning_rate": 7.287087189422099e-05, + "loss": 0.2674, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 11860, + "tokens_per_second_per_gpu": 360.74 + }, + { + "epoch": 1.1800467266491028, + "grad_norm": 0.5120696425437927, + "learning_rate": 7.271984504851141e-05, + "loss": 0.166, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.81, + "memory/max_allocated (GiB)": 19.81, + "step": 11870, + "tokens_per_second_per_gpu": 321.01 + }, + { + "epoch": 1.1810409106725654, + "grad_norm": 0.5142634510993958, + "learning_rate": 7.256888539958923e-05, + "loss": 0.1663, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 11880, + "tokens_per_second_per_gpu": 348.0 + }, + { + "epoch": 1.1820350946960283, + "grad_norm": 0.5077565908432007, + "learning_rate": 7.241799331930006e-05, + "loss": 0.2012, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 11890, + "tokens_per_second_per_gpu": 333.88 + }, + { + "epoch": 1.183029278719491, + "grad_norm": 0.5016006231307983, + "learning_rate": 7.226716917932289e-05, + "loss": 0.1986, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 11900, + "tokens_per_second_per_gpu": 349.37 + }, + { + "epoch": 1.1840234627429538, + "grad_norm": 0.5038326382637024, + "learning_rate": 7.21164133511695e-05, + "loss": 0.1379, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 11910, + "tokens_per_second_per_gpu": 331.98 + }, + { + "epoch": 1.1850176467664164, + "grad_norm": 0.44399407505989075, + "learning_rate": 7.196572620618336e-05, + "loss": 0.1479, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.64, + "memory/max_allocated (GiB)": 20.64, + "step": 11920, + "tokens_per_second_per_gpu": 295.3 + }, + { + "epoch": 1.1860118307898793, + "grad_norm": 0.5076953172683716, + "learning_rate": 7.181510811553874e-05, + "loss": 0.2141, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 11930, + "tokens_per_second_per_gpu": 342.74 + }, + { + "epoch": 1.187006014813342, + "grad_norm": 0.39150622487068176, + "learning_rate": 7.166455945023989e-05, + "loss": 0.1896, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 11940, + "tokens_per_second_per_gpu": 341.43 + }, + { + "epoch": 1.1880001988368047, + "grad_norm": 0.305494099855423, + "learning_rate": 7.151408058111991e-05, + "loss": 0.1467, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 11950, + "tokens_per_second_per_gpu": 271.68 + }, + { + "epoch": 1.1889943828602674, + "grad_norm": 0.5669624209403992, + "learning_rate": 7.136367187884014e-05, + "loss": 0.1725, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 11960, + "tokens_per_second_per_gpu": 305.75 + }, + { + "epoch": 1.1899885668837302, + "grad_norm": 0.7032368183135986, + "learning_rate": 7.121333371388889e-05, + "loss": 0.164, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 11970, + "tokens_per_second_per_gpu": 247.59 + }, + { + "epoch": 1.1909827509071929, + "grad_norm": 0.5729434490203857, + "learning_rate": 7.106306645658095e-05, + "loss": 0.1808, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 11980, + "tokens_per_second_per_gpu": 358.52 + }, + { + "epoch": 1.1919769349306557, + "grad_norm": 0.6809588074684143, + "learning_rate": 7.091287047705626e-05, + "loss": 0.1459, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 11990, + "tokens_per_second_per_gpu": 316.28 + }, + { + "epoch": 1.1929711189541183, + "grad_norm": 0.34328803420066833, + "learning_rate": 7.076274614527934e-05, + "loss": 0.1697, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 12000, + "tokens_per_second_per_gpu": 338.71 + }, + { + "epoch": 1.1939653029775812, + "grad_norm": 0.7516032457351685, + "learning_rate": 7.061269383103804e-05, + "loss": 0.1699, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.17, + "memory/max_allocated (GiB)": 18.17, + "step": 12010, + "tokens_per_second_per_gpu": 342.16 + }, + { + "epoch": 1.1949594870010438, + "grad_norm": 0.4296916425228119, + "learning_rate": 7.046271390394303e-05, + "loss": 0.1585, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 12020, + "tokens_per_second_per_gpu": 367.62 + }, + { + "epoch": 1.1959536710245067, + "grad_norm": 0.2905307412147522, + "learning_rate": 7.031280673342648e-05, + "loss": 0.1239, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 12030, + "tokens_per_second_per_gpu": 303.39 + }, + { + "epoch": 1.1969478550479693, + "grad_norm": 0.2632843554019928, + "learning_rate": 7.016297268874152e-05, + "loss": 0.1541, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 12040, + "tokens_per_second_per_gpu": 354.37 + }, + { + "epoch": 1.1979420390714322, + "grad_norm": 0.41677621006965637, + "learning_rate": 7.001321213896099e-05, + "loss": 0.125, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.81, + "memory/max_allocated (GiB)": 19.81, + "step": 12050, + "tokens_per_second_per_gpu": 372.01 + }, + { + "epoch": 1.1989362230948948, + "grad_norm": 0.6199091672897339, + "learning_rate": 6.98635254529768e-05, + "loss": 0.1617, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 12060, + "tokens_per_second_per_gpu": 336.05 + }, + { + "epoch": 1.1999304071183576, + "grad_norm": 0.6814326643943787, + "learning_rate": 6.971391299949895e-05, + "loss": 0.1585, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 12070, + "tokens_per_second_per_gpu": 285.66 + }, + { + "epoch": 1.2009245911418203, + "grad_norm": 0.5412986874580383, + "learning_rate": 6.956437514705447e-05, + "loss": 0.2202, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.07, + "memory/max_allocated (GiB)": 18.07, + "step": 12080, + "tokens_per_second_per_gpu": 333.0 + }, + { + "epoch": 1.2019187751652831, + "grad_norm": 0.6325846314430237, + "learning_rate": 6.941491226398675e-05, + "loss": 0.1899, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 12090, + "tokens_per_second_per_gpu": 333.08 + }, + { + "epoch": 1.2029129591887457, + "grad_norm": 0.5769904255867004, + "learning_rate": 6.926552471845439e-05, + "loss": 0.1916, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 12100, + "tokens_per_second_per_gpu": 341.54 + }, + { + "epoch": 1.2039071432122086, + "grad_norm": 0.5062241554260254, + "learning_rate": 6.911621287843058e-05, + "loss": 0.1523, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 12110, + "tokens_per_second_per_gpu": 317.78 + }, + { + "epoch": 1.2049013272356712, + "grad_norm": 0.6168234348297119, + "learning_rate": 6.896697711170183e-05, + "loss": 0.1967, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 12120, + "tokens_per_second_per_gpu": 339.37 + }, + { + "epoch": 1.205895511259134, + "grad_norm": 0.3159475326538086, + "learning_rate": 6.881781778586745e-05, + "loss": 0.1494, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 12130, + "tokens_per_second_per_gpu": 314.84 + }, + { + "epoch": 1.2068896952825967, + "grad_norm": 0.697409451007843, + "learning_rate": 6.866873526833838e-05, + "loss": 0.1419, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 12140, + "tokens_per_second_per_gpu": 286.26 + }, + { + "epoch": 1.2078838793060596, + "grad_norm": 0.4400809407234192, + "learning_rate": 6.851972992633636e-05, + "loss": 0.1646, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 12150, + "tokens_per_second_per_gpu": 276.17 + }, + { + "epoch": 1.2088780633295224, + "grad_norm": 0.4484960436820984, + "learning_rate": 6.837080212689302e-05, + "loss": 0.1717, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 12160, + "tokens_per_second_per_gpu": 275.23 + }, + { + "epoch": 1.209872247352985, + "grad_norm": 0.24361641705036163, + "learning_rate": 6.822195223684906e-05, + "loss": 0.1457, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 12170, + "tokens_per_second_per_gpu": 309.09 + }, + { + "epoch": 1.2108664313764477, + "grad_norm": 0.4752653241157532, + "learning_rate": 6.807318062285314e-05, + "loss": 0.1669, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 12180, + "tokens_per_second_per_gpu": 310.15 + }, + { + "epoch": 1.2118606153999105, + "grad_norm": 0.437656968832016, + "learning_rate": 6.792448765136124e-05, + "loss": 0.1572, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.84, + "memory/max_allocated (GiB)": 18.84, + "step": 12190, + "tokens_per_second_per_gpu": 283.06 + }, + { + "epoch": 1.2128547994233734, + "grad_norm": 0.8095329999923706, + "learning_rate": 6.777587368863558e-05, + "loss": 0.1974, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 12200, + "tokens_per_second_per_gpu": 345.42 + }, + { + "epoch": 1.213848983446836, + "grad_norm": 0.48734039068222046, + "learning_rate": 6.762733910074372e-05, + "loss": 0.1423, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 12210, + "tokens_per_second_per_gpu": 289.48 + }, + { + "epoch": 1.2148431674702986, + "grad_norm": 0.6078053712844849, + "learning_rate": 6.747888425355783e-05, + "loss": 0.1729, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 12220, + "tokens_per_second_per_gpu": 326.08 + }, + { + "epoch": 1.2158373514937615, + "grad_norm": 0.4170425832271576, + "learning_rate": 6.733050951275347e-05, + "loss": 0.1939, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 12230, + "tokens_per_second_per_gpu": 326.45 + }, + { + "epoch": 1.2168315355172243, + "grad_norm": 0.6094039678573608, + "learning_rate": 6.71822152438091e-05, + "loss": 0.1701, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.77, + "memory/max_allocated (GiB)": 20.77, + "step": 12240, + "tokens_per_second_per_gpu": 344.33 + }, + { + "epoch": 1.217825719540687, + "grad_norm": 0.5931222438812256, + "learning_rate": 6.703400181200472e-05, + "loss": 0.168, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 12250, + "tokens_per_second_per_gpu": 343.43 + }, + { + "epoch": 1.2188199035641498, + "grad_norm": 0.5588046908378601, + "learning_rate": 6.688586958242144e-05, + "loss": 0.1449, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 12260, + "tokens_per_second_per_gpu": 323.04 + }, + { + "epoch": 1.2198140875876125, + "grad_norm": 0.7040362358093262, + "learning_rate": 6.673781891994018e-05, + "loss": 0.1925, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 12270, + "tokens_per_second_per_gpu": 280.89 + }, + { + "epoch": 1.2208082716110753, + "grad_norm": 0.46226373314857483, + "learning_rate": 6.658985018924104e-05, + "loss": 0.1817, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.58, + "memory/max_allocated (GiB)": 20.58, + "step": 12280, + "tokens_per_second_per_gpu": 363.62 + }, + { + "epoch": 1.221802455634538, + "grad_norm": 0.6106900572776794, + "learning_rate": 6.644196375480228e-05, + "loss": 0.1602, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 12290, + "tokens_per_second_per_gpu": 242.36 + }, + { + "epoch": 1.2227966396580008, + "grad_norm": 0.3931345045566559, + "learning_rate": 6.629415998089947e-05, + "loss": 0.2175, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 12300, + "tokens_per_second_per_gpu": 319.03 + }, + { + "epoch": 1.2237908236814634, + "grad_norm": 0.5207247734069824, + "learning_rate": 6.61464392316045e-05, + "loss": 0.1477, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 12310, + "tokens_per_second_per_gpu": 324.44 + }, + { + "epoch": 1.2247850077049263, + "grad_norm": 0.709812343120575, + "learning_rate": 6.599880187078479e-05, + "loss": 0.1891, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 12320, + "tokens_per_second_per_gpu": 357.92 + }, + { + "epoch": 1.225779191728389, + "grad_norm": 0.7825199961662292, + "learning_rate": 6.585124826210245e-05, + "loss": 0.1395, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 12330, + "tokens_per_second_per_gpu": 265.35 + }, + { + "epoch": 1.2267733757518517, + "grad_norm": 0.6053150296211243, + "learning_rate": 6.570377876901311e-05, + "loss": 0.1813, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 12340, + "tokens_per_second_per_gpu": 374.3 + }, + { + "epoch": 1.2277675597753144, + "grad_norm": 0.6844688653945923, + "learning_rate": 6.555639375476532e-05, + "loss": 0.2049, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 12350, + "tokens_per_second_per_gpu": 318.45 + }, + { + "epoch": 1.2287617437987772, + "grad_norm": 0.5875769853591919, + "learning_rate": 6.540909358239954e-05, + "loss": 0.1821, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 12360, + "tokens_per_second_per_gpu": 312.63 + }, + { + "epoch": 1.2297559278222399, + "grad_norm": 0.4794519543647766, + "learning_rate": 6.526187861474727e-05, + "loss": 0.1581, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 12370, + "tokens_per_second_per_gpu": 344.97 + }, + { + "epoch": 1.2307501118457027, + "grad_norm": 0.4545304775238037, + "learning_rate": 6.511474921442997e-05, + "loss": 0.1482, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 12380, + "tokens_per_second_per_gpu": 307.3 + }, + { + "epoch": 1.2317442958691653, + "grad_norm": 0.4457061290740967, + "learning_rate": 6.496770574385858e-05, + "loss": 0.1688, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 12390, + "tokens_per_second_per_gpu": 357.22 + }, + { + "epoch": 1.2327384798926282, + "grad_norm": 0.5178716778755188, + "learning_rate": 6.482074856523215e-05, + "loss": 0.1487, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 12400, + "tokens_per_second_per_gpu": 317.6 + }, + { + "epoch": 1.2337326639160908, + "grad_norm": 0.2689170837402344, + "learning_rate": 6.467387804053731e-05, + "loss": 0.208, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.84, + "memory/max_allocated (GiB)": 18.84, + "step": 12410, + "tokens_per_second_per_gpu": 330.76 + }, + { + "epoch": 1.2347268479395537, + "grad_norm": 0.5642197132110596, + "learning_rate": 6.45270945315472e-05, + "loss": 0.1667, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 12420, + "tokens_per_second_per_gpu": 299.73 + }, + { + "epoch": 1.2357210319630163, + "grad_norm": 0.38734033703804016, + "learning_rate": 6.438039839982066e-05, + "loss": 0.1317, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 12430, + "tokens_per_second_per_gpu": 298.98 + }, + { + "epoch": 1.2367152159864792, + "grad_norm": 0.39504683017730713, + "learning_rate": 6.42337900067012e-05, + "loss": 0.1357, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 12440, + "tokens_per_second_per_gpu": 359.51 + }, + { + "epoch": 1.2377094000099418, + "grad_norm": 0.38826897740364075, + "learning_rate": 6.408726971331631e-05, + "loss": 0.1919, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 12450, + "tokens_per_second_per_gpu": 327.89 + }, + { + "epoch": 1.2387035840334046, + "grad_norm": 0.6274026036262512, + "learning_rate": 6.39408378805765e-05, + "loss": 0.1613, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 12460, + "tokens_per_second_per_gpu": 325.25 + }, + { + "epoch": 1.2396977680568673, + "grad_norm": 0.5444367527961731, + "learning_rate": 6.379449486917421e-05, + "loss": 0.1976, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 12470, + "tokens_per_second_per_gpu": 398.28 + }, + { + "epoch": 1.2406919520803301, + "grad_norm": 0.40487316250801086, + "learning_rate": 6.364824103958331e-05, + "loss": 0.1205, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 12480, + "tokens_per_second_per_gpu": 326.23 + }, + { + "epoch": 1.2416861361037927, + "grad_norm": 0.55726557970047, + "learning_rate": 6.350207675205781e-05, + "loss": 0.1472, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 12490, + "tokens_per_second_per_gpu": 319.36 + }, + { + "epoch": 1.2426803201272556, + "grad_norm": 0.5732585787773132, + "learning_rate": 6.335600236663131e-05, + "loss": 0.173, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 12500, + "tokens_per_second_per_gpu": 323.39 + }, + { + "epoch": 1.2436745041507182, + "grad_norm": 0.47033485770225525, + "learning_rate": 6.321001824311583e-05, + "loss": 0.1712, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 12510, + "tokens_per_second_per_gpu": 300.06 + }, + { + "epoch": 1.244668688174181, + "grad_norm": 0.44385817646980286, + "learning_rate": 6.306412474110122e-05, + "loss": 0.143, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.39, + "memory/max_allocated (GiB)": 21.39, + "step": 12520, + "tokens_per_second_per_gpu": 312.49 + }, + { + "epoch": 1.2456628721976437, + "grad_norm": 0.5871978402137756, + "learning_rate": 6.291832221995388e-05, + "loss": 0.1772, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 12530, + "tokens_per_second_per_gpu": 289.93 + }, + { + "epoch": 1.2466570562211066, + "grad_norm": 0.6150623559951782, + "learning_rate": 6.277261103881638e-05, + "loss": 0.134, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 12540, + "tokens_per_second_per_gpu": 312.62 + }, + { + "epoch": 1.2476512402445692, + "grad_norm": 0.6367026567459106, + "learning_rate": 6.262699155660601e-05, + "loss": 0.2192, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 12550, + "tokens_per_second_per_gpu": 334.54 + }, + { + "epoch": 1.248645424268032, + "grad_norm": 0.29298296570777893, + "learning_rate": 6.248146413201444e-05, + "loss": 0.1669, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.77, + "memory/max_allocated (GiB)": 20.77, + "step": 12560, + "tokens_per_second_per_gpu": 313.23 + }, + { + "epoch": 1.2496396082914947, + "grad_norm": 0.27840226888656616, + "learning_rate": 6.233602912350639e-05, + "loss": 0.1134, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 12570, + "tokens_per_second_per_gpu": 333.22 + }, + { + "epoch": 1.2506337923149575, + "grad_norm": 0.49214980006217957, + "learning_rate": 6.219068688931908e-05, + "loss": 0.1726, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 12580, + "tokens_per_second_per_gpu": 394.03 + }, + { + "epoch": 1.2516279763384204, + "grad_norm": 0.5485401153564453, + "learning_rate": 6.20454377874612e-05, + "loss": 0.1014, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.53, + "memory/max_allocated (GiB)": 21.53, + "step": 12590, + "tokens_per_second_per_gpu": 262.31 + }, + { + "epoch": 1.252622160361883, + "grad_norm": 0.5006467700004578, + "learning_rate": 6.190028217571186e-05, + "loss": 0.1883, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 12600, + "tokens_per_second_per_gpu": 350.5 + }, + { + "epoch": 1.2536163443853456, + "grad_norm": 0.4242802560329437, + "learning_rate": 6.175522041162016e-05, + "loss": 0.1559, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 12610, + "tokens_per_second_per_gpu": 319.2 + }, + { + "epoch": 1.2546105284088085, + "grad_norm": 0.532721996307373, + "learning_rate": 6.161025285250373e-05, + "loss": 0.1551, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 12620, + "tokens_per_second_per_gpu": 342.02 + }, + { + "epoch": 1.2556047124322713, + "grad_norm": 0.6736690998077393, + "learning_rate": 6.146537985544843e-05, + "loss": 0.1887, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 12630, + "tokens_per_second_per_gpu": 312.69 + }, + { + "epoch": 1.256598896455734, + "grad_norm": 0.4985780417919159, + "learning_rate": 6.132060177730698e-05, + "loss": 0.1409, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 12640, + "tokens_per_second_per_gpu": 330.02 + }, + { + "epoch": 1.2575930804791966, + "grad_norm": 49.2108268737793, + "learning_rate": 6.117591897469847e-05, + "loss": 0.1823, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.77, + "memory/max_allocated (GiB)": 20.77, + "step": 12650, + "tokens_per_second_per_gpu": 319.61 + }, + { + "epoch": 1.2585872645026595, + "grad_norm": 0.4028538465499878, + "learning_rate": 6.1031331804007154e-05, + "loss": 0.2102, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 12660, + "tokens_per_second_per_gpu": 355.74 + }, + { + "epoch": 1.2595814485261223, + "grad_norm": 0.47767624258995056, + "learning_rate": 6.0886840621381856e-05, + "loss": 0.153, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 12670, + "tokens_per_second_per_gpu": 381.38 + }, + { + "epoch": 1.260575632549585, + "grad_norm": 0.3719147741794586, + "learning_rate": 6.0742445782734825e-05, + "loss": 0.15, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 12680, + "tokens_per_second_per_gpu": 351.13 + }, + { + "epoch": 1.2615698165730476, + "grad_norm": 0.32970771193504333, + "learning_rate": 6.0598147643741124e-05, + "loss": 0.144, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 12690, + "tokens_per_second_per_gpu": 285.76 + }, + { + "epoch": 1.2625640005965104, + "grad_norm": 0.40551966428756714, + "learning_rate": 6.045394655983753e-05, + "loss": 0.149, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 12700, + "tokens_per_second_per_gpu": 275.71 + }, + { + "epoch": 1.2635581846199733, + "grad_norm": 0.40349385142326355, + "learning_rate": 6.0309842886221826e-05, + "loss": 0.1615, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.16, + "memory/max_allocated (GiB)": 18.16, + "step": 12710, + "tokens_per_second_per_gpu": 374.56 + }, + { + "epoch": 1.264552368643436, + "grad_norm": 0.5158276557922363, + "learning_rate": 6.0165836977851796e-05, + "loss": 0.1828, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 12720, + "tokens_per_second_per_gpu": 404.51 + }, + { + "epoch": 1.2655465526668985, + "grad_norm": 0.42637962102890015, + "learning_rate": 6.0021929189444416e-05, + "loss": 0.1535, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 12730, + "tokens_per_second_per_gpu": 268.86 + }, + { + "epoch": 1.2665407366903614, + "grad_norm": 0.3736035227775574, + "learning_rate": 5.987811987547504e-05, + "loss": 0.1774, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 12740, + "tokens_per_second_per_gpu": 410.2 + }, + { + "epoch": 1.2675349207138242, + "grad_norm": 0.479159414768219, + "learning_rate": 5.9734409390176315e-05, + "loss": 0.1869, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 12750, + "tokens_per_second_per_gpu": 290.14 + }, + { + "epoch": 1.2685291047372869, + "grad_norm": 0.4126437306404114, + "learning_rate": 5.959079808753765e-05, + "loss": 0.1731, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 12760, + "tokens_per_second_per_gpu": 335.34 + }, + { + "epoch": 1.2695232887607495, + "grad_norm": 0.5344352722167969, + "learning_rate": 5.944728632130392e-05, + "loss": 0.1612, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 12770, + "tokens_per_second_per_gpu": 304.8 + }, + { + "epoch": 1.2705174727842123, + "grad_norm": 0.4766976833343506, + "learning_rate": 5.9303874444975005e-05, + "loss": 0.1296, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 12780, + "tokens_per_second_per_gpu": 292.47 + }, + { + "epoch": 1.2715116568076752, + "grad_norm": 0.42096179723739624, + "learning_rate": 5.9160562811804644e-05, + "loss": 0.181, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 12790, + "tokens_per_second_per_gpu": 368.43 + }, + { + "epoch": 1.2725058408311378, + "grad_norm": 0.41255730390548706, + "learning_rate": 5.901735177479972e-05, + "loss": 0.1565, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.23, + "memory/max_allocated (GiB)": 19.23, + "step": 12800, + "tokens_per_second_per_gpu": 291.39 + }, + { + "epoch": 1.2735000248546005, + "grad_norm": 0.4876402020454407, + "learning_rate": 5.8874241686719234e-05, + "loss": 0.1288, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 12810, + "tokens_per_second_per_gpu": 297.94 + }, + { + "epoch": 1.2744942088780633, + "grad_norm": 0.47068750858306885, + "learning_rate": 5.873123290007363e-05, + "loss": 0.1144, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 12820, + "tokens_per_second_per_gpu": 293.08 + }, + { + "epoch": 1.2754883929015262, + "grad_norm": 0.5882211327552795, + "learning_rate": 5.8588325767123694e-05, + "loss": 0.1776, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 12830, + "tokens_per_second_per_gpu": 338.75 + }, + { + "epoch": 1.2764825769249888, + "grad_norm": 0.43064388632774353, + "learning_rate": 5.844552063987997e-05, + "loss": 0.1555, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 12840, + "tokens_per_second_per_gpu": 342.44 + }, + { + "epoch": 1.2774767609484516, + "grad_norm": 0.6673071980476379, + "learning_rate": 5.830281787010166e-05, + "loss": 0.1722, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.58, + "memory/max_allocated (GiB)": 20.58, + "step": 12850, + "tokens_per_second_per_gpu": 338.48 + }, + { + "epoch": 1.2784709449719143, + "grad_norm": 0.6424853801727295, + "learning_rate": 5.8160217809295826e-05, + "loss": 0.1647, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 12860, + "tokens_per_second_per_gpu": 354.7 + }, + { + "epoch": 1.2794651289953771, + "grad_norm": 0.5545419454574585, + "learning_rate": 5.801772080871659e-05, + "loss": 0.1304, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 12870, + "tokens_per_second_per_gpu": 326.79 + }, + { + "epoch": 1.2804593130188398, + "grad_norm": 0.5145922899246216, + "learning_rate": 5.787532721936413e-05, + "loss": 0.1984, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 12880, + "tokens_per_second_per_gpu": 360.98 + }, + { + "epoch": 1.2814534970423026, + "grad_norm": 0.4928934574127197, + "learning_rate": 5.7733037391984024e-05, + "loss": 0.1239, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.18, + "memory/max_allocated (GiB)": 18.18, + "step": 12890, + "tokens_per_second_per_gpu": 291.25 + }, + { + "epoch": 1.2824476810657652, + "grad_norm": 0.5820671916007996, + "learning_rate": 5.759085167706611e-05, + "loss": 0.1584, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 12900, + "tokens_per_second_per_gpu": 324.13 + }, + { + "epoch": 1.283441865089228, + "grad_norm": 0.42122918367385864, + "learning_rate": 5.7448770424843926e-05, + "loss": 0.1653, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 12910, + "tokens_per_second_per_gpu": 316.56 + }, + { + "epoch": 1.2844360491126907, + "grad_norm": 0.30348268151283264, + "learning_rate": 5.730679398529355e-05, + "loss": 0.1152, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 12920, + "tokens_per_second_per_gpu": 338.17 + }, + { + "epoch": 1.2854302331361536, + "grad_norm": 0.4230218529701233, + "learning_rate": 5.716492270813305e-05, + "loss": 0.1603, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 12930, + "tokens_per_second_per_gpu": 371.96 + }, + { + "epoch": 1.2864244171596162, + "grad_norm": 0.4649568200111389, + "learning_rate": 5.7023156942821274e-05, + "loss": 0.1618, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 12940, + "tokens_per_second_per_gpu": 365.8 + }, + { + "epoch": 1.287418601183079, + "grad_norm": 0.7270090579986572, + "learning_rate": 5.688149703855732e-05, + "loss": 0.1442, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.84, + "memory/max_allocated (GiB)": 18.84, + "step": 12950, + "tokens_per_second_per_gpu": 326.17 + }, + { + "epoch": 1.2884127852065417, + "grad_norm": 0.4988739490509033, + "learning_rate": 5.6739943344279455e-05, + "loss": 0.2068, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.77, + "memory/max_allocated (GiB)": 19.77, + "step": 12960, + "tokens_per_second_per_gpu": 347.14 + }, + { + "epoch": 1.2894069692300045, + "grad_norm": 0.5430411100387573, + "learning_rate": 5.65984962086644e-05, + "loss": 0.2245, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 12970, + "tokens_per_second_per_gpu": 428.47 + }, + { + "epoch": 1.2904011532534672, + "grad_norm": 0.6165452003479004, + "learning_rate": 5.645715598012626e-05, + "loss": 0.1454, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 12980, + "tokens_per_second_per_gpu": 316.19 + }, + { + "epoch": 1.29139533727693, + "grad_norm": 0.525731086730957, + "learning_rate": 5.631592300681593e-05, + "loss": 0.186, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 12990, + "tokens_per_second_per_gpu": 324.42 + }, + { + "epoch": 1.2923895213003926, + "grad_norm": 0.843223512172699, + "learning_rate": 5.617479763662011e-05, + "loss": 0.1512, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 13000, + "tokens_per_second_per_gpu": 288.5 + }, + { + "epoch": 1.2933837053238555, + "grad_norm": 0.557815432548523, + "learning_rate": 5.6033780217160346e-05, + "loss": 0.1475, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 13010, + "tokens_per_second_per_gpu": 306.45 + }, + { + "epoch": 1.2943778893473181, + "grad_norm": 0.4504510462284088, + "learning_rate": 5.589287109579242e-05, + "loss": 0.1323, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 13020, + "tokens_per_second_per_gpu": 353.31 + }, + { + "epoch": 1.295372073370781, + "grad_norm": 0.6401469707489014, + "learning_rate": 5.575207061960519e-05, + "loss": 0.1525, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.67, + "memory/max_allocated (GiB)": 19.67, + "step": 13030, + "tokens_per_second_per_gpu": 341.06 + }, + { + "epoch": 1.2963662573942436, + "grad_norm": 0.603769838809967, + "learning_rate": 5.561137913542008e-05, + "loss": 0.2351, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.77, + "memory/max_allocated (GiB)": 20.77, + "step": 13040, + "tokens_per_second_per_gpu": 391.65 + }, + { + "epoch": 1.2973604414177065, + "grad_norm": 0.2735394239425659, + "learning_rate": 5.5470796989789874e-05, + "loss": 0.1351, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.39, + "memory/max_allocated (GiB)": 21.39, + "step": 13050, + "tokens_per_second_per_gpu": 321.21 + }, + { + "epoch": 1.298354625441169, + "grad_norm": 0.46087950468063354, + "learning_rate": 5.533032452899818e-05, + "loss": 0.204, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.62, + "memory/max_allocated (GiB)": 20.62, + "step": 13060, + "tokens_per_second_per_gpu": 353.8 + }, + { + "epoch": 1.299348809464632, + "grad_norm": 0.48193469643592834, + "learning_rate": 5.518996209905829e-05, + "loss": 0.1539, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 13070, + "tokens_per_second_per_gpu": 293.56 + }, + { + "epoch": 1.3003429934880946, + "grad_norm": 0.4664101302623749, + "learning_rate": 5.5049710045712596e-05, + "loss": 0.1775, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 13080, + "tokens_per_second_per_gpu": 283.89 + }, + { + "epoch": 1.3013371775115574, + "grad_norm": 0.4421910047531128, + "learning_rate": 5.490956871443149e-05, + "loss": 0.1428, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 13090, + "tokens_per_second_per_gpu": 332.54 + }, + { + "epoch": 1.3023313615350203, + "grad_norm": 0.5099435448646545, + "learning_rate": 5.4769538450412706e-05, + "loss": 0.2018, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 13100, + "tokens_per_second_per_gpu": 366.63 + }, + { + "epoch": 1.303325545558483, + "grad_norm": 0.4641667604446411, + "learning_rate": 5.462961959858042e-05, + "loss": 0.1445, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 13110, + "tokens_per_second_per_gpu": 363.8 + }, + { + "epoch": 1.3043197295819455, + "grad_norm": 0.3798070251941681, + "learning_rate": 5.448981250358429e-05, + "loss": 0.1464, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 13120, + "tokens_per_second_per_gpu": 301.06 + }, + { + "epoch": 1.3053139136054084, + "grad_norm": 0.49534204602241516, + "learning_rate": 5.435011750979881e-05, + "loss": 0.2212, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 13130, + "tokens_per_second_per_gpu": 337.64 + }, + { + "epoch": 1.3063080976288712, + "grad_norm": 0.586558997631073, + "learning_rate": 5.421053496132218e-05, + "loss": 0.1626, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 13140, + "tokens_per_second_per_gpu": 334.07 + }, + { + "epoch": 1.3073022816523339, + "grad_norm": 0.2836264967918396, + "learning_rate": 5.40710652019758e-05, + "loss": 0.1488, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 13150, + "tokens_per_second_per_gpu": 335.48 + }, + { + "epoch": 1.3082964656757965, + "grad_norm": 0.6480442881584167, + "learning_rate": 5.3931708575303096e-05, + "loss": 0.1667, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 13160, + "tokens_per_second_per_gpu": 322.57 + }, + { + "epoch": 1.3092906496992593, + "grad_norm": 0.5337100625038147, + "learning_rate": 5.379246542456897e-05, + "loss": 0.1342, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 13170, + "tokens_per_second_per_gpu": 349.13 + }, + { + "epoch": 1.3102848337227222, + "grad_norm": 0.4983210265636444, + "learning_rate": 5.365333609275864e-05, + "loss": 0.1469, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 13180, + "tokens_per_second_per_gpu": 329.8 + }, + { + "epoch": 1.3112790177461848, + "grad_norm": 0.4759822487831116, + "learning_rate": 5.351432092257716e-05, + "loss": 0.1507, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 13190, + "tokens_per_second_per_gpu": 282.24 + }, + { + "epoch": 1.3122732017696475, + "grad_norm": 0.2128848135471344, + "learning_rate": 5.3375420256448175e-05, + "loss": 0.1901, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 13200, + "tokens_per_second_per_gpu": 351.2 + }, + { + "epoch": 1.3132673857931103, + "grad_norm": 0.23509669303894043, + "learning_rate": 5.323663443651345e-05, + "loss": 0.1259, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 13210, + "tokens_per_second_per_gpu": 364.3 + }, + { + "epoch": 1.3142615698165732, + "grad_norm": 0.4961312711238861, + "learning_rate": 5.309796380463174e-05, + "loss": 0.1589, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 13220, + "tokens_per_second_per_gpu": 288.96 + }, + { + "epoch": 1.3152557538400358, + "grad_norm": 0.6080997586250305, + "learning_rate": 5.295940870237817e-05, + "loss": 0.2004, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 13230, + "tokens_per_second_per_gpu": 350.85 + }, + { + "epoch": 1.3162499378634984, + "grad_norm": 0.4995987117290497, + "learning_rate": 5.2820969471043204e-05, + "loss": 0.1845, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 13240, + "tokens_per_second_per_gpu": 406.17 + }, + { + "epoch": 1.3172441218869613, + "grad_norm": 0.4047294557094574, + "learning_rate": 5.2682646451631945e-05, + "loss": 0.1338, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 13250, + "tokens_per_second_per_gpu": 330.72 + }, + { + "epoch": 1.3182383059104241, + "grad_norm": 0.6713951826095581, + "learning_rate": 5.254443998486327e-05, + "loss": 0.2004, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 13260, + "tokens_per_second_per_gpu": 366.71 + }, + { + "epoch": 1.3192324899338868, + "grad_norm": 0.3636043667793274, + "learning_rate": 5.240635041116884e-05, + "loss": 0.1612, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 13270, + "tokens_per_second_per_gpu": 379.85 + }, + { + "epoch": 1.3202266739573494, + "grad_norm": 0.6122763156890869, + "learning_rate": 5.226837807069251e-05, + "loss": 0.1599, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 13280, + "tokens_per_second_per_gpu": 301.88 + }, + { + "epoch": 1.3212208579808122, + "grad_norm": 0.3268527686595917, + "learning_rate": 5.213052330328929e-05, + "loss": 0.152, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 13290, + "tokens_per_second_per_gpu": 316.71 + }, + { + "epoch": 1.322215042004275, + "grad_norm": 0.27024024724960327, + "learning_rate": 5.199278644852464e-05, + "loss": 0.2408, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 13300, + "tokens_per_second_per_gpu": 338.21 + }, + { + "epoch": 1.3232092260277377, + "grad_norm": 0.34981921315193176, + "learning_rate": 5.18551678456735e-05, + "loss": 0.1638, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 13310, + "tokens_per_second_per_gpu": 320.81 + }, + { + "epoch": 1.3242034100512003, + "grad_norm": 0.44818684458732605, + "learning_rate": 5.1717667833719627e-05, + "loss": 0.1738, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 13320, + "tokens_per_second_per_gpu": 352.42 + }, + { + "epoch": 1.3251975940746632, + "grad_norm": 0.37409475445747375, + "learning_rate": 5.1580286751354545e-05, + "loss": 0.1738, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.84, + "memory/max_allocated (GiB)": 18.84, + "step": 13330, + "tokens_per_second_per_gpu": 319.43 + }, + { + "epoch": 1.326191778098126, + "grad_norm": 0.48389047384262085, + "learning_rate": 5.144302493697697e-05, + "loss": 0.1665, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 13340, + "tokens_per_second_per_gpu": 402.78 + }, + { + "epoch": 1.3271859621215887, + "grad_norm": 0.3536114990711212, + "learning_rate": 5.13058827286917e-05, + "loss": 0.131, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 13350, + "tokens_per_second_per_gpu": 343.79 + }, + { + "epoch": 1.3281801461450515, + "grad_norm": 0.40804916620254517, + "learning_rate": 5.116886046430903e-05, + "loss": 0.1576, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 13360, + "tokens_per_second_per_gpu": 297.51 + }, + { + "epoch": 1.3291743301685142, + "grad_norm": 0.5419421792030334, + "learning_rate": 5.10319584813437e-05, + "loss": 0.1632, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 13370, + "tokens_per_second_per_gpu": 309.55 + }, + { + "epoch": 1.330168514191977, + "grad_norm": 0.3658435642719269, + "learning_rate": 5.089517711701426e-05, + "loss": 0.1439, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.53, + "memory/max_allocated (GiB)": 21.53, + "step": 13380, + "tokens_per_second_per_gpu": 300.73 + }, + { + "epoch": 1.3311626982154396, + "grad_norm": 0.3239598572254181, + "learning_rate": 5.075851670824212e-05, + "loss": 0.1575, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 13390, + "tokens_per_second_per_gpu": 334.4 + }, + { + "epoch": 1.3321568822389025, + "grad_norm": 0.5001199245452881, + "learning_rate": 5.0621977591650773e-05, + "loss": 0.1593, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.43, + "memory/max_allocated (GiB)": 20.43, + "step": 13400, + "tokens_per_second_per_gpu": 355.87 + }, + { + "epoch": 1.3331510662623651, + "grad_norm": 0.5077064633369446, + "learning_rate": 5.048556010356491e-05, + "loss": 0.1502, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 13410, + "tokens_per_second_per_gpu": 320.65 + }, + { + "epoch": 1.334145250285828, + "grad_norm": 0.573063313961029, + "learning_rate": 5.0349264580009616e-05, + "loss": 0.16, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 13420, + "tokens_per_second_per_gpu": 365.59 + }, + { + "epoch": 1.3351394343092906, + "grad_norm": 0.6947848200798035, + "learning_rate": 5.021309135670959e-05, + "loss": 0.1975, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.69, + "memory/max_allocated (GiB)": 19.69, + "step": 13430, + "tokens_per_second_per_gpu": 366.35 + }, + { + "epoch": 1.3361336183327535, + "grad_norm": 0.6331676244735718, + "learning_rate": 5.007704076908825e-05, + "loss": 0.1626, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 13440, + "tokens_per_second_per_gpu": 377.99 + }, + { + "epoch": 1.337127802356216, + "grad_norm": 0.6537986397743225, + "learning_rate": 4.994111315226697e-05, + "loss": 0.1846, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 13450, + "tokens_per_second_per_gpu": 409.47 + }, + { + "epoch": 1.338121986379679, + "grad_norm": 0.27798891067504883, + "learning_rate": 4.980530884106416e-05, + "loss": 0.1483, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 13460, + "tokens_per_second_per_gpu": 297.57 + }, + { + "epoch": 1.3391161704031416, + "grad_norm": 0.455538272857666, + "learning_rate": 4.9669628169994586e-05, + "loss": 0.1544, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 13470, + "tokens_per_second_per_gpu": 331.23 + }, + { + "epoch": 1.3401103544266044, + "grad_norm": 0.8112931251525879, + "learning_rate": 4.9534071473268375e-05, + "loss": 0.1629, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.43, + "memory/max_allocated (GiB)": 20.43, + "step": 13480, + "tokens_per_second_per_gpu": 326.59 + }, + { + "epoch": 1.341104538450067, + "grad_norm": 0.541522204875946, + "learning_rate": 4.939863908479037e-05, + "loss": 0.1238, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 13490, + "tokens_per_second_per_gpu": 304.61 + }, + { + "epoch": 1.34209872247353, + "grad_norm": 0.574373185634613, + "learning_rate": 4.9263331338159105e-05, + "loss": 0.1664, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 13500, + "tokens_per_second_per_gpu": 341.93 + }, + { + "epoch": 1.3430929064969925, + "grad_norm": 0.35874566435813904, + "learning_rate": 4.9128148566666186e-05, + "loss": 0.1402, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.4, + "memory/max_allocated (GiB)": 21.4, + "step": 13510, + "tokens_per_second_per_gpu": 328.7 + }, + { + "epoch": 1.3440870905204554, + "grad_norm": 0.2937113046646118, + "learning_rate": 4.899309110329541e-05, + "loss": 0.1482, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 13520, + "tokens_per_second_per_gpu": 328.58 + }, + { + "epoch": 1.345081274543918, + "grad_norm": 0.6201068162918091, + "learning_rate": 4.885815928072176e-05, + "loss": 0.14, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.84, + "memory/max_allocated (GiB)": 18.84, + "step": 13530, + "tokens_per_second_per_gpu": 336.28 + }, + { + "epoch": 1.3460754585673809, + "grad_norm": 0.5328193306922913, + "learning_rate": 4.872335343131088e-05, + "loss": 0.1536, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 13540, + "tokens_per_second_per_gpu": 337.94 + }, + { + "epoch": 1.3470696425908435, + "grad_norm": 0.6596990823745728, + "learning_rate": 4.8588673887118054e-05, + "loss": 0.1956, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 13550, + "tokens_per_second_per_gpu": 333.18 + }, + { + "epoch": 1.3480638266143063, + "grad_norm": 0.7473700642585754, + "learning_rate": 4.845412097988752e-05, + "loss": 0.1762, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 13560, + "tokens_per_second_per_gpu": 345.86 + }, + { + "epoch": 1.349058010637769, + "grad_norm": 0.544845700263977, + "learning_rate": 4.831969504105145e-05, + "loss": 0.169, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 13570, + "tokens_per_second_per_gpu": 366.86 + }, + { + "epoch": 1.3500521946612318, + "grad_norm": 0.48342615365982056, + "learning_rate": 4.818539640172941e-05, + "loss": 0.1178, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 13580, + "tokens_per_second_per_gpu": 348.38 + }, + { + "epoch": 1.3510463786846945, + "grad_norm": 0.5576662421226501, + "learning_rate": 4.805122539272725e-05, + "loss": 0.1605, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 13590, + "tokens_per_second_per_gpu": 341.46 + }, + { + "epoch": 1.3520405627081573, + "grad_norm": 0.6246042847633362, + "learning_rate": 4.791718234453663e-05, + "loss": 0.2546, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 13600, + "tokens_per_second_per_gpu": 406.86 + }, + { + "epoch": 1.35303474673162, + "grad_norm": 0.31434324383735657, + "learning_rate": 4.7783267587333794e-05, + "loss": 0.1727, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 13610, + "tokens_per_second_per_gpu": 382.72 + }, + { + "epoch": 1.3540289307550828, + "grad_norm": 0.38281211256980896, + "learning_rate": 4.764948145097919e-05, + "loss": 0.1753, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 13620, + "tokens_per_second_per_gpu": 302.55 + }, + { + "epoch": 1.3550231147785454, + "grad_norm": 0.3556180000305176, + "learning_rate": 4.7515824265016276e-05, + "loss": 0.1443, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.66, + "memory/max_allocated (GiB)": 20.66, + "step": 13630, + "tokens_per_second_per_gpu": 341.41 + }, + { + "epoch": 1.3560172988020083, + "grad_norm": 0.29845160245895386, + "learning_rate": 4.7382296358670976e-05, + "loss": 0.1323, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 13640, + "tokens_per_second_per_gpu": 281.42 + }, + { + "epoch": 1.3570114828254711, + "grad_norm": 0.4179192781448364, + "learning_rate": 4.724889806085079e-05, + "loss": 0.1176, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.68, + "memory/max_allocated (GiB)": 19.68, + "step": 13650, + "tokens_per_second_per_gpu": 337.01 + }, + { + "epoch": 1.3580056668489338, + "grad_norm": 0.46176135540008545, + "learning_rate": 4.711562970014384e-05, + "loss": 0.1159, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 13660, + "tokens_per_second_per_gpu": 366.9 + }, + { + "epoch": 1.3589998508723964, + "grad_norm": 0.34104305505752563, + "learning_rate": 4.6982491604818314e-05, + "loss": 0.1194, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 13670, + "tokens_per_second_per_gpu": 306.48 + }, + { + "epoch": 1.3599940348958592, + "grad_norm": 0.8108484148979187, + "learning_rate": 4.684948410282146e-05, + "loss": 0.2221, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 13680, + "tokens_per_second_per_gpu": 379.12 + }, + { + "epoch": 1.360988218919322, + "grad_norm": 0.5723569393157959, + "learning_rate": 4.671660752177892e-05, + "loss": 0.1656, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 13690, + "tokens_per_second_per_gpu": 380.88 + }, + { + "epoch": 1.3619824029427847, + "grad_norm": 0.5756139755249023, + "learning_rate": 4.658386218899371e-05, + "loss": 0.145, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 13700, + "tokens_per_second_per_gpu": 406.13 + }, + { + "epoch": 1.3629765869662473, + "grad_norm": 0.3543594777584076, + "learning_rate": 4.645124843144574e-05, + "loss": 0.1011, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 13710, + "tokens_per_second_per_gpu": 308.42 + }, + { + "epoch": 1.3639707709897102, + "grad_norm": 0.210982084274292, + "learning_rate": 4.631876657579062e-05, + "loss": 0.1649, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 13720, + "tokens_per_second_per_gpu": 339.26 + }, + { + "epoch": 1.364964955013173, + "grad_norm": 0.4334322214126587, + "learning_rate": 4.6186416948359256e-05, + "loss": 0.1152, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 13730, + "tokens_per_second_per_gpu": 384.14 + }, + { + "epoch": 1.3659591390366357, + "grad_norm": 0.7411201000213623, + "learning_rate": 4.6054199875156665e-05, + "loss": 0.1034, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 13740, + "tokens_per_second_per_gpu": 318.13 + }, + { + "epoch": 1.3669533230600983, + "grad_norm": 0.6421767473220825, + "learning_rate": 4.5922115681861536e-05, + "loss": 0.1494, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 13750, + "tokens_per_second_per_gpu": 343.47 + }, + { + "epoch": 1.3679475070835612, + "grad_norm": 0.6301234364509583, + "learning_rate": 4.579016469382505e-05, + "loss": 0.1698, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 13760, + "tokens_per_second_per_gpu": 351.88 + }, + { + "epoch": 1.368941691107024, + "grad_norm": 0.5952703952789307, + "learning_rate": 4.5658347236070445e-05, + "loss": 0.1712, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 13770, + "tokens_per_second_per_gpu": 380.8 + }, + { + "epoch": 1.3699358751304866, + "grad_norm": 0.37993893027305603, + "learning_rate": 4.5526663633292e-05, + "loss": 0.1618, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 13780, + "tokens_per_second_per_gpu": 278.0 + }, + { + "epoch": 1.3709300591539493, + "grad_norm": 0.5828644633293152, + "learning_rate": 4.5395114209854195e-05, + "loss": 0.1806, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 13790, + "tokens_per_second_per_gpu": 371.91 + }, + { + "epoch": 1.3719242431774121, + "grad_norm": 0.6922226548194885, + "learning_rate": 4.526369928979113e-05, + "loss": 0.1802, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 13800, + "tokens_per_second_per_gpu": 392.71 + }, + { + "epoch": 1.372918427200875, + "grad_norm": 0.5355440378189087, + "learning_rate": 4.513241919680546e-05, + "loss": 0.1622, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 13810, + "tokens_per_second_per_gpu": 361.78 + }, + { + "epoch": 1.3739126112243376, + "grad_norm": 0.3850402534008026, + "learning_rate": 4.500127425426783e-05, + "loss": 0.1239, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 13820, + "tokens_per_second_per_gpu": 314.59 + }, + { + "epoch": 1.3749067952478002, + "grad_norm": 0.5045945644378662, + "learning_rate": 4.4870264785215966e-05, + "loss": 0.1422, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 13830, + "tokens_per_second_per_gpu": 329.11 + }, + { + "epoch": 1.375900979271263, + "grad_norm": 0.552185595035553, + "learning_rate": 4.4739391112353915e-05, + "loss": 0.1577, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 13840, + "tokens_per_second_per_gpu": 368.68 + }, + { + "epoch": 1.376895163294726, + "grad_norm": 0.5017216205596924, + "learning_rate": 4.460865355805109e-05, + "loss": 0.1855, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 13850, + "tokens_per_second_per_gpu": 331.39 + }, + { + "epoch": 1.3778893473181886, + "grad_norm": 0.33180108666419983, + "learning_rate": 4.447805244434184e-05, + "loss": 0.1298, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.41, + "memory/max_allocated (GiB)": 21.41, + "step": 13860, + "tokens_per_second_per_gpu": 410.56 + }, + { + "epoch": 1.3788835313416512, + "grad_norm": 0.5970960855484009, + "learning_rate": 4.4347588092924206e-05, + "loss": 0.1828, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 13870, + "tokens_per_second_per_gpu": 367.8 + }, + { + "epoch": 1.379877715365114, + "grad_norm": 0.588447630405426, + "learning_rate": 4.421726082515953e-05, + "loss": 0.1518, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 13880, + "tokens_per_second_per_gpu": 333.81 + }, + { + "epoch": 1.380871899388577, + "grad_norm": 0.5242084860801697, + "learning_rate": 4.4087070962071377e-05, + "loss": 0.2154, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 13890, + "tokens_per_second_per_gpu": 424.02 + }, + { + "epoch": 1.3818660834120395, + "grad_norm": 0.4128514528274536, + "learning_rate": 4.395701882434493e-05, + "loss": 0.1594, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.39, + "memory/max_allocated (GiB)": 21.39, + "step": 13900, + "tokens_per_second_per_gpu": 361.98 + }, + { + "epoch": 1.3828602674355024, + "grad_norm": 0.3702273964881897, + "learning_rate": 4.3827104732326055e-05, + "loss": 0.1143, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 13910, + "tokens_per_second_per_gpu": 339.1 + }, + { + "epoch": 1.383854451458965, + "grad_norm": 0.5153867602348328, + "learning_rate": 4.3697329006020614e-05, + "loss": 0.1272, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 13920, + "tokens_per_second_per_gpu": 296.93 + }, + { + "epoch": 1.3848486354824279, + "grad_norm": 0.45272836089134216, + "learning_rate": 4.356769196509373e-05, + "loss": 0.1551, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.62, + "memory/max_allocated (GiB)": 20.62, + "step": 13930, + "tokens_per_second_per_gpu": 372.63 + }, + { + "epoch": 1.3858428195058905, + "grad_norm": 0.44970136880874634, + "learning_rate": 4.343819392886873e-05, + "loss": 0.1434, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 13940, + "tokens_per_second_per_gpu": 340.91 + }, + { + "epoch": 1.3868370035293534, + "grad_norm": 0.5090652108192444, + "learning_rate": 4.3308835216326696e-05, + "loss": 0.1548, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 13950, + "tokens_per_second_per_gpu": 350.01 + }, + { + "epoch": 1.387831187552816, + "grad_norm": 0.4184592366218567, + "learning_rate": 4.3179616146105465e-05, + "loss": 0.121, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 13960, + "tokens_per_second_per_gpu": 348.76 + }, + { + "epoch": 1.3888253715762788, + "grad_norm": 0.6222097277641296, + "learning_rate": 4.305053703649897e-05, + "loss": 0.1996, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 13970, + "tokens_per_second_per_gpu": 380.55 + }, + { + "epoch": 1.3898195555997415, + "grad_norm": 0.34182408452033997, + "learning_rate": 4.292159820545627e-05, + "loss": 0.1481, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 13980, + "tokens_per_second_per_gpu": 368.88 + }, + { + "epoch": 1.3908137396232043, + "grad_norm": 0.551923930644989, + "learning_rate": 4.279279997058101e-05, + "loss": 0.1954, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 13990, + "tokens_per_second_per_gpu": 327.45 + }, + { + "epoch": 1.391807923646667, + "grad_norm": 0.43246060609817505, + "learning_rate": 4.266414264913041e-05, + "loss": 0.1153, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 14000, + "tokens_per_second_per_gpu": 301.74 + }, + { + "epoch": 1.3928021076701298, + "grad_norm": 0.3509690761566162, + "learning_rate": 4.2535626558014705e-05, + "loss": 0.1414, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 14010, + "tokens_per_second_per_gpu": 367.19 + }, + { + "epoch": 1.3937962916935924, + "grad_norm": 0.5196961164474487, + "learning_rate": 4.240725201379614e-05, + "loss": 0.1538, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 14020, + "tokens_per_second_per_gpu": 339.78 + }, + { + "epoch": 1.3947904757170553, + "grad_norm": 0.46282872557640076, + "learning_rate": 4.22790193326884e-05, + "loss": 0.1549, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 14030, + "tokens_per_second_per_gpu": 375.57 + }, + { + "epoch": 1.395784659740518, + "grad_norm": 0.3771131932735443, + "learning_rate": 4.21509288305556e-05, + "loss": 0.1359, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.77, + "memory/max_allocated (GiB)": 20.77, + "step": 14040, + "tokens_per_second_per_gpu": 377.38 + }, + { + "epoch": 1.3967788437639808, + "grad_norm": 0.639408528804779, + "learning_rate": 4.2022980822911786e-05, + "loss": 0.1673, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 14050, + "tokens_per_second_per_gpu": 333.3 + }, + { + "epoch": 1.3977730277874434, + "grad_norm": 0.44624921679496765, + "learning_rate": 4.189517562491996e-05, + "loss": 0.1855, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 14060, + "tokens_per_second_per_gpu": 378.56 + }, + { + "epoch": 1.3987672118109062, + "grad_norm": 0.5301280617713928, + "learning_rate": 4.176751355139126e-05, + "loss": 0.1335, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 14070, + "tokens_per_second_per_gpu": 256.61 + }, + { + "epoch": 1.3997613958343689, + "grad_norm": 0.6008658409118652, + "learning_rate": 4.163999491678444e-05, + "loss": 0.1674, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 14080, + "tokens_per_second_per_gpu": 375.31 + }, + { + "epoch": 1.4007555798578317, + "grad_norm": 0.569706916809082, + "learning_rate": 4.1512620035204784e-05, + "loss": 0.1309, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 14090, + "tokens_per_second_per_gpu": 311.74 + }, + { + "epoch": 1.4017497638812944, + "grad_norm": 0.5985649228096008, + "learning_rate": 4.138538922040356e-05, + "loss": 0.2239, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 14100, + "tokens_per_second_per_gpu": 431.67 + }, + { + "epoch": 1.4027439479047572, + "grad_norm": 0.18843001127243042, + "learning_rate": 4.125830278577717e-05, + "loss": 0.1663, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.45, + "memory/max_allocated (GiB)": 20.45, + "step": 14110, + "tokens_per_second_per_gpu": 388.4 + }, + { + "epoch": 1.4037381319282198, + "grad_norm": 0.4820663332939148, + "learning_rate": 4.113136104436639e-05, + "loss": 0.1385, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 14120, + "tokens_per_second_per_gpu": 375.91 + }, + { + "epoch": 1.4047323159516827, + "grad_norm": 0.47296223044395447, + "learning_rate": 4.10045643088555e-05, + "loss": 0.188, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 14130, + "tokens_per_second_per_gpu": 364.31 + }, + { + "epoch": 1.4057264999751453, + "grad_norm": 0.5604919791221619, + "learning_rate": 4.0877912891571725e-05, + "loss": 0.1479, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.37, + "memory/max_allocated (GiB)": 21.37, + "step": 14140, + "tokens_per_second_per_gpu": 354.18 + }, + { + "epoch": 1.4067206839986082, + "grad_norm": 0.529105544090271, + "learning_rate": 4.075140710448419e-05, + "loss": 0.1566, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 14150, + "tokens_per_second_per_gpu": 364.28 + }, + { + "epoch": 1.407714868022071, + "grad_norm": 0.36099550127983093, + "learning_rate": 4.062504725920347e-05, + "loss": 0.1516, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 14160, + "tokens_per_second_per_gpu": 393.89 + }, + { + "epoch": 1.4087090520455336, + "grad_norm": 0.6695134043693542, + "learning_rate": 4.0498833666980505e-05, + "loss": 0.1371, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 14170, + "tokens_per_second_per_gpu": 394.12 + }, + { + "epoch": 1.4097032360689963, + "grad_norm": 0.4011685252189636, + "learning_rate": 4.037276663870607e-05, + "loss": 0.1888, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 14180, + "tokens_per_second_per_gpu": 401.14 + }, + { + "epoch": 1.4106974200924591, + "grad_norm": 0.5134690403938293, + "learning_rate": 4.024684648490995e-05, + "loss": 0.1467, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 14190, + "tokens_per_second_per_gpu": 320.81 + }, + { + "epoch": 1.411691604115922, + "grad_norm": 0.4153744876384735, + "learning_rate": 4.012107351576001e-05, + "loss": 0.1507, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 14200, + "tokens_per_second_per_gpu": 358.65 + }, + { + "epoch": 1.4126857881393846, + "grad_norm": 0.5884429216384888, + "learning_rate": 3.999544804106174e-05, + "loss": 0.1785, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 14210, + "tokens_per_second_per_gpu": 389.13 + }, + { + "epoch": 1.4136799721628472, + "grad_norm": 0.3641144931316376, + "learning_rate": 3.986997037025716e-05, + "loss": 0.1133, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 14220, + "tokens_per_second_per_gpu": 354.06 + }, + { + "epoch": 1.41467415618631, + "grad_norm": 0.5971251726150513, + "learning_rate": 3.974464081242437e-05, + "loss": 0.18, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 14230, + "tokens_per_second_per_gpu": 388.25 + }, + { + "epoch": 1.415668340209773, + "grad_norm": 0.4704718291759491, + "learning_rate": 3.961945967627648e-05, + "loss": 0.1546, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.85, + "memory/max_allocated (GiB)": 18.85, + "step": 14240, + "tokens_per_second_per_gpu": 358.95 + }, + { + "epoch": 1.4166625242332356, + "grad_norm": 0.44908052682876587, + "learning_rate": 3.9494427270161124e-05, + "loss": 0.1928, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 14250, + "tokens_per_second_per_gpu": 359.69 + }, + { + "epoch": 1.4176567082566982, + "grad_norm": 0.4618172347545624, + "learning_rate": 3.936954390205955e-05, + "loss": 0.1701, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.4, + "memory/max_allocated (GiB)": 21.4, + "step": 14260, + "tokens_per_second_per_gpu": 363.08 + }, + { + "epoch": 1.418650892280161, + "grad_norm": 0.4540565609931946, + "learning_rate": 3.924480987958592e-05, + "loss": 0.1195, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 14270, + "tokens_per_second_per_gpu": 382.37 + }, + { + "epoch": 1.419645076303624, + "grad_norm": 0.45630887150764465, + "learning_rate": 3.912022550998642e-05, + "loss": 0.1577, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 14280, + "tokens_per_second_per_gpu": 373.77 + }, + { + "epoch": 1.4206392603270865, + "grad_norm": 0.5009377002716064, + "learning_rate": 3.8995791100138755e-05, + "loss": 0.2043, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 14290, + "tokens_per_second_per_gpu": 436.57 + }, + { + "epoch": 1.4216334443505492, + "grad_norm": 0.5434289574623108, + "learning_rate": 3.887150695655112e-05, + "loss": 0.1279, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 14300, + "tokens_per_second_per_gpu": 368.81 + }, + { + "epoch": 1.422627628374012, + "grad_norm": 0.4998416602611542, + "learning_rate": 3.874737338536164e-05, + "loss": 0.1604, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 14310, + "tokens_per_second_per_gpu": 332.31 + }, + { + "epoch": 1.4236218123974749, + "grad_norm": 0.7324095964431763, + "learning_rate": 3.862339069233759e-05, + "loss": 0.1572, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 14320, + "tokens_per_second_per_gpu": 381.52 + }, + { + "epoch": 1.4246159964209375, + "grad_norm": 0.5743668675422668, + "learning_rate": 3.8499559182874475e-05, + "loss": 0.1947, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 14330, + "tokens_per_second_per_gpu": 399.2 + }, + { + "epoch": 1.4256101804444001, + "grad_norm": 0.3965088427066803, + "learning_rate": 3.837587916199554e-05, + "loss": 0.0943, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 14340, + "tokens_per_second_per_gpu": 308.93 + }, + { + "epoch": 1.426604364467863, + "grad_norm": 0.4278639256954193, + "learning_rate": 3.825235093435076e-05, + "loss": 0.1645, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 14350, + "tokens_per_second_per_gpu": 375.76 + }, + { + "epoch": 1.4275985484913258, + "grad_norm": 0.3658309876918793, + "learning_rate": 3.812897480421631e-05, + "loss": 0.1323, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 14360, + "tokens_per_second_per_gpu": 370.0 + }, + { + "epoch": 1.4285927325147885, + "grad_norm": 0.7314541339874268, + "learning_rate": 3.800575107549362e-05, + "loss": 0.1764, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 14370, + "tokens_per_second_per_gpu": 318.75 + }, + { + "epoch": 1.429586916538251, + "grad_norm": 0.3194984197616577, + "learning_rate": 3.788268005170883e-05, + "loss": 0.1383, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.77, + "memory/max_allocated (GiB)": 20.77, + "step": 14380, + "tokens_per_second_per_gpu": 367.0 + }, + { + "epoch": 1.430581100561714, + "grad_norm": 0.24363841116428375, + "learning_rate": 3.7759762036011856e-05, + "loss": 0.171, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 14390, + "tokens_per_second_per_gpu": 350.18 + }, + { + "epoch": 1.4315752845851768, + "grad_norm": 0.8045446872711182, + "learning_rate": 3.7636997331175805e-05, + "loss": 0.1639, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 14400, + "tokens_per_second_per_gpu": 338.94 + }, + { + "epoch": 1.4325694686086394, + "grad_norm": 0.45409709215164185, + "learning_rate": 3.751438623959601e-05, + "loss": 0.1665, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 14410, + "tokens_per_second_per_gpu": 350.33 + }, + { + "epoch": 1.4335636526321023, + "grad_norm": 0.5277268886566162, + "learning_rate": 3.739192906328958e-05, + "loss": 0.1444, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 14420, + "tokens_per_second_per_gpu": 332.77 + }, + { + "epoch": 1.434557836655565, + "grad_norm": 0.5323857069015503, + "learning_rate": 3.726962610389435e-05, + "loss": 0.1362, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 14430, + "tokens_per_second_per_gpu": 339.66 + }, + { + "epoch": 1.4355520206790278, + "grad_norm": 0.7022963166236877, + "learning_rate": 3.7147477662668386e-05, + "loss": 0.1252, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.54, + "memory/max_allocated (GiB)": 21.54, + "step": 14440, + "tokens_per_second_per_gpu": 313.51 + }, + { + "epoch": 1.4365462047024904, + "grad_norm": 0.4480128586292267, + "learning_rate": 3.702548404048917e-05, + "loss": 0.1617, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 14450, + "tokens_per_second_per_gpu": 400.86 + }, + { + "epoch": 1.4375403887259532, + "grad_norm": 0.508468747138977, + "learning_rate": 3.690364553785268e-05, + "loss": 0.1598, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.81, + "memory/max_allocated (GiB)": 19.81, + "step": 14460, + "tokens_per_second_per_gpu": 323.65 + }, + { + "epoch": 1.4385345727494159, + "grad_norm": 0.3107985854148865, + "learning_rate": 3.678196245487299e-05, + "loss": 0.1208, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 14470, + "tokens_per_second_per_gpu": 328.38 + }, + { + "epoch": 1.4395287567728787, + "grad_norm": 0.5842998027801514, + "learning_rate": 3.666043509128118e-05, + "loss": 0.1969, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 14480, + "tokens_per_second_per_gpu": 419.05 + }, + { + "epoch": 1.4405229407963414, + "grad_norm": 0.6047186255455017, + "learning_rate": 3.6539063746424884e-05, + "loss": 0.1706, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.82, + "memory/max_allocated (GiB)": 19.82, + "step": 14490, + "tokens_per_second_per_gpu": 352.49 + }, + { + "epoch": 1.4415171248198042, + "grad_norm": 0.37089505791664124, + "learning_rate": 3.641784871926733e-05, + "loss": 0.1393, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.53, + "memory/max_allocated (GiB)": 21.53, + "step": 14500, + "tokens_per_second_per_gpu": 393.16 + }, + { + "epoch": 1.4425113088432668, + "grad_norm": 0.472478985786438, + "learning_rate": 3.629679030838682e-05, + "loss": 0.1796, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 14510, + "tokens_per_second_per_gpu": 386.53 + }, + { + "epoch": 1.4435054928667297, + "grad_norm": 0.3645547330379486, + "learning_rate": 3.617588881197571e-05, + "loss": 0.1152, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 14520, + "tokens_per_second_per_gpu": 360.27 + }, + { + "epoch": 1.4444996768901923, + "grad_norm": 0.16537845134735107, + "learning_rate": 3.605514452784e-05, + "loss": 0.1252, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 14530, + "tokens_per_second_per_gpu": 358.08 + }, + { + "epoch": 1.4454938609136552, + "grad_norm": 0.49066445231437683, + "learning_rate": 3.593455775339837e-05, + "loss": 0.1967, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 14540, + "tokens_per_second_per_gpu": 403.93 + }, + { + "epoch": 1.4464880449371178, + "grad_norm": 0.424955815076828, + "learning_rate": 3.5814128785681554e-05, + "loss": 0.1566, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 14550, + "tokens_per_second_per_gpu": 364.0 + }, + { + "epoch": 1.4474822289605807, + "grad_norm": 0.5189529657363892, + "learning_rate": 3.569385792133151e-05, + "loss": 0.1663, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 14560, + "tokens_per_second_per_gpu": 384.8 + }, + { + "epoch": 1.4484764129840433, + "grad_norm": 0.3756466209888458, + "learning_rate": 3.5573745456600826e-05, + "loss": 0.1533, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 14570, + "tokens_per_second_per_gpu": 368.89 + }, + { + "epoch": 1.4494705970075061, + "grad_norm": 0.5015901923179626, + "learning_rate": 3.54537916873519e-05, + "loss": 0.1883, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 14580, + "tokens_per_second_per_gpu": 330.11 + }, + { + "epoch": 1.4504647810309688, + "grad_norm": 0.49634647369384766, + "learning_rate": 3.5333996909056176e-05, + "loss": 0.1883, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 14590, + "tokens_per_second_per_gpu": 392.27 + }, + { + "epoch": 1.4514589650544316, + "grad_norm": 0.27927589416503906, + "learning_rate": 3.521436141679357e-05, + "loss": 0.1499, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 14600, + "tokens_per_second_per_gpu": 348.04 + }, + { + "epoch": 1.4524531490778942, + "grad_norm": 0.6297810077667236, + "learning_rate": 3.5094885505251515e-05, + "loss": 0.1779, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 14610, + "tokens_per_second_per_gpu": 338.31 + }, + { + "epoch": 1.453447333101357, + "grad_norm": 0.6115005612373352, + "learning_rate": 3.497556946872451e-05, + "loss": 0.1784, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 14620, + "tokens_per_second_per_gpu": 371.67 + }, + { + "epoch": 1.4544415171248197, + "grad_norm": 0.3485526442527771, + "learning_rate": 3.485641360111309e-05, + "loss": 0.1118, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 14630, + "tokens_per_second_per_gpu": 305.03 + }, + { + "epoch": 1.4554357011482826, + "grad_norm": 0.5678386688232422, + "learning_rate": 3.473741819592341e-05, + "loss": 0.1297, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 14640, + "tokens_per_second_per_gpu": 340.8 + }, + { + "epoch": 1.4564298851717452, + "grad_norm": 0.5769549012184143, + "learning_rate": 3.4618583546266246e-05, + "loss": 0.1583, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 14650, + "tokens_per_second_per_gpu": 344.23 + }, + { + "epoch": 1.457424069195208, + "grad_norm": 0.48255455493927, + "learning_rate": 3.449990994485649e-05, + "loss": 0.1803, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.18, + "memory/max_allocated (GiB)": 18.18, + "step": 14660, + "tokens_per_second_per_gpu": 325.5 + }, + { + "epoch": 1.458418253218671, + "grad_norm": 0.5541951060295105, + "learning_rate": 3.4381397684012296e-05, + "loss": 0.1496, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.4, + "memory/max_allocated (GiB)": 21.4, + "step": 14670, + "tokens_per_second_per_gpu": 376.55 + }, + { + "epoch": 1.4594124372421335, + "grad_norm": 0.4175207316875458, + "learning_rate": 3.426304705565445e-05, + "loss": 0.1257, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 14680, + "tokens_per_second_per_gpu": 338.73 + }, + { + "epoch": 1.4604066212655962, + "grad_norm": 0.7478891611099243, + "learning_rate": 3.4144858351305496e-05, + "loss": 0.1461, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 14690, + "tokens_per_second_per_gpu": 313.51 + }, + { + "epoch": 1.461400805289059, + "grad_norm": 0.418258935213089, + "learning_rate": 3.402683186208922e-05, + "loss": 0.1719, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 14700, + "tokens_per_second_per_gpu": 372.01 + }, + { + "epoch": 1.4623949893125219, + "grad_norm": 0.6298655271530151, + "learning_rate": 3.390896787872985e-05, + "loss": 0.1876, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 14710, + "tokens_per_second_per_gpu": 384.18 + }, + { + "epoch": 1.4633891733359845, + "grad_norm": 0.2738591134548187, + "learning_rate": 3.379126669155122e-05, + "loss": 0.1502, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 14720, + "tokens_per_second_per_gpu": 316.12 + }, + { + "epoch": 1.4643833573594471, + "grad_norm": 0.6010544896125793, + "learning_rate": 3.3673728590476296e-05, + "loss": 0.1819, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.44, + "memory/max_allocated (GiB)": 20.44, + "step": 14730, + "tokens_per_second_per_gpu": 376.31 + }, + { + "epoch": 1.46537754138291, + "grad_norm": 0.4839765727519989, + "learning_rate": 3.355635386502619e-05, + "loss": 0.1465, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 14740, + "tokens_per_second_per_gpu": 339.37 + }, + { + "epoch": 1.4663717254063728, + "grad_norm": 0.569148063659668, + "learning_rate": 3.3439142804319743e-05, + "loss": 0.1178, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 14750, + "tokens_per_second_per_gpu": 338.15 + }, + { + "epoch": 1.4673659094298355, + "grad_norm": 0.7138640284538269, + "learning_rate": 3.3322095697072496e-05, + "loss": 0.1648, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 14760, + "tokens_per_second_per_gpu": 334.69 + }, + { + "epoch": 1.468360093453298, + "grad_norm": 0.7108574509620667, + "learning_rate": 3.3205212831596264e-05, + "loss": 0.1368, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 14770, + "tokens_per_second_per_gpu": 386.29 + }, + { + "epoch": 1.469354277476761, + "grad_norm": 0.347858190536499, + "learning_rate": 3.30884944957982e-05, + "loss": 0.164, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 14780, + "tokens_per_second_per_gpu": 400.72 + }, + { + "epoch": 1.4703484615002238, + "grad_norm": 0.4929182827472687, + "learning_rate": 3.29719409771803e-05, + "loss": 0.1748, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 14790, + "tokens_per_second_per_gpu": 369.52 + }, + { + "epoch": 1.4713426455236864, + "grad_norm": 0.4838101863861084, + "learning_rate": 3.2855552562838445e-05, + "loss": 0.1336, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 14800, + "tokens_per_second_per_gpu": 353.25 + }, + { + "epoch": 1.472336829547149, + "grad_norm": 0.6003445386886597, + "learning_rate": 3.273932953946193e-05, + "loss": 0.1646, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 14810, + "tokens_per_second_per_gpu": 359.65 + }, + { + "epoch": 1.473331013570612, + "grad_norm": 0.40864112973213196, + "learning_rate": 3.262327219333262e-05, + "loss": 0.1307, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.4, + "memory/max_allocated (GiB)": 21.4, + "step": 14820, + "tokens_per_second_per_gpu": 283.81 + }, + { + "epoch": 1.4743251975940748, + "grad_norm": 0.5634713768959045, + "learning_rate": 3.250738081032433e-05, + "loss": 0.1409, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 14830, + "tokens_per_second_per_gpu": 370.86 + }, + { + "epoch": 1.4753193816175374, + "grad_norm": 0.421989768743515, + "learning_rate": 3.239165567590197e-05, + "loss": 0.1392, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 14840, + "tokens_per_second_per_gpu": 329.01 + }, + { + "epoch": 1.476313565641, + "grad_norm": 0.8798142671585083, + "learning_rate": 3.2276097075121014e-05, + "loss": 0.2051, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 14850, + "tokens_per_second_per_gpu": 424.3 + }, + { + "epoch": 1.4773077496644629, + "grad_norm": 0.4949728846549988, + "learning_rate": 3.216070529262678e-05, + "loss": 0.156, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.65, + "memory/max_allocated (GiB)": 20.65, + "step": 14860, + "tokens_per_second_per_gpu": 344.69 + }, + { + "epoch": 1.4783019336879257, + "grad_norm": 0.44336196780204773, + "learning_rate": 3.204548061265353e-05, + "loss": 0.1542, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 14870, + "tokens_per_second_per_gpu": 437.39 + }, + { + "epoch": 1.4792961177113884, + "grad_norm": 0.3560582399368286, + "learning_rate": 3.193042331902408e-05, + "loss": 0.1638, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 14880, + "tokens_per_second_per_gpu": 316.98 + }, + { + "epoch": 1.480290301734851, + "grad_norm": 0.5532661080360413, + "learning_rate": 3.181553369514881e-05, + "loss": 0.18, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.77, + "memory/max_allocated (GiB)": 20.77, + "step": 14890, + "tokens_per_second_per_gpu": 356.94 + }, + { + "epoch": 1.4812844857583138, + "grad_norm": 0.5040679574012756, + "learning_rate": 3.170081202402518e-05, + "loss": 0.1776, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 14900, + "tokens_per_second_per_gpu": 296.93 + }, + { + "epoch": 1.4822786697817767, + "grad_norm": 0.7191395163536072, + "learning_rate": 3.158625858823688e-05, + "loss": 0.1919, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 14910, + "tokens_per_second_per_gpu": 390.26 + }, + { + "epoch": 1.4832728538052393, + "grad_norm": 0.6036331653594971, + "learning_rate": 3.1471873669953275e-05, + "loss": 0.1501, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 14920, + "tokens_per_second_per_gpu": 324.83 + }, + { + "epoch": 1.4842670378287022, + "grad_norm": 0.4078894555568695, + "learning_rate": 3.135765755092854e-05, + "loss": 0.1536, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 14930, + "tokens_per_second_per_gpu": 434.52 + }, + { + "epoch": 1.4852612218521648, + "grad_norm": 0.5916171669960022, + "learning_rate": 3.1243610512501175e-05, + "loss": 0.1538, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 14940, + "tokens_per_second_per_gpu": 439.02 + }, + { + "epoch": 1.4862554058756277, + "grad_norm": 0.4087867736816406, + "learning_rate": 3.1129732835593085e-05, + "loss": 0.1858, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 14950, + "tokens_per_second_per_gpu": 284.99 + }, + { + "epoch": 1.4872495898990903, + "grad_norm": 0.371852308511734, + "learning_rate": 3.101602480070909e-05, + "loss": 0.1251, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 14960, + "tokens_per_second_per_gpu": 358.19 + }, + { + "epoch": 1.4882437739225531, + "grad_norm": 0.529620349407196, + "learning_rate": 3.0902486687936097e-05, + "loss": 0.1941, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 14970, + "tokens_per_second_per_gpu": 369.15 + }, + { + "epoch": 1.4892379579460158, + "grad_norm": 0.21512271463871002, + "learning_rate": 3.0789118776942484e-05, + "loss": 0.1448, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 14980, + "tokens_per_second_per_gpu": 350.83 + }, + { + "epoch": 1.4902321419694786, + "grad_norm": 0.6459974050521851, + "learning_rate": 3.067592134697741e-05, + "loss": 0.1853, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.53, + "memory/max_allocated (GiB)": 21.53, + "step": 14990, + "tokens_per_second_per_gpu": 419.24 + }, + { + "epoch": 1.4912263259929412, + "grad_norm": 0.6449222564697266, + "learning_rate": 3.0562894676870014e-05, + "loss": 0.1876, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.81, + "memory/max_allocated (GiB)": 19.81, + "step": 15000, + "tokens_per_second_per_gpu": 426.0 + }, + { + "epoch": 1.492220510016404, + "grad_norm": 0.5701255202293396, + "learning_rate": 3.045003904502891e-05, + "loss": 0.1583, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 15010, + "tokens_per_second_per_gpu": 355.73 + }, + { + "epoch": 1.4932146940398667, + "grad_norm": 0.754385769367218, + "learning_rate": 3.0337354729441338e-05, + "loss": 0.2089, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 15020, + "tokens_per_second_per_gpu": 372.69 + }, + { + "epoch": 1.4942088780633296, + "grad_norm": 0.27737346291542053, + "learning_rate": 3.022484200767264e-05, + "loss": 0.1358, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.39, + "memory/max_allocated (GiB)": 21.39, + "step": 15030, + "tokens_per_second_per_gpu": 445.86 + }, + { + "epoch": 1.4952030620867922, + "grad_norm": 0.35400980710983276, + "learning_rate": 3.0112501156865348e-05, + "loss": 0.1361, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 15040, + "tokens_per_second_per_gpu": 382.91 + }, + { + "epoch": 1.496197246110255, + "grad_norm": 0.703973650932312, + "learning_rate": 3.000033245373881e-05, + "loss": 0.1716, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 15050, + "tokens_per_second_per_gpu": 379.21 + }, + { + "epoch": 1.4971914301337177, + "grad_norm": 0.47651368379592896, + "learning_rate": 2.988833617458816e-05, + "loss": 0.1148, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 15060, + "tokens_per_second_per_gpu": 311.35 + }, + { + "epoch": 1.4981856141571805, + "grad_norm": 0.626930296421051, + "learning_rate": 2.977651259528399e-05, + "loss": 0.1478, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 15070, + "tokens_per_second_per_gpu": 364.12 + }, + { + "epoch": 1.4991797981806432, + "grad_norm": 0.5578542351722717, + "learning_rate": 2.9664861991271343e-05, + "loss": 0.158, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 15080, + "tokens_per_second_per_gpu": 428.74 + }, + { + "epoch": 1.500173982204106, + "grad_norm": 0.3915964961051941, + "learning_rate": 2.9553384637569282e-05, + "loss": 0.1047, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 15090, + "tokens_per_second_per_gpu": 324.43 + }, + { + "epoch": 1.5011681662275689, + "grad_norm": 0.6796404123306274, + "learning_rate": 2.944208080877008e-05, + "loss": 0.1921, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 15100, + "tokens_per_second_per_gpu": 373.42 + }, + { + "epoch": 1.5021623502510315, + "grad_norm": 0.3962237238883972, + "learning_rate": 2.933095077903861e-05, + "loss": 0.1629, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 15110, + "tokens_per_second_per_gpu": 322.99 + }, + { + "epoch": 1.5031565342744941, + "grad_norm": 1.0471795797348022, + "learning_rate": 2.921999482211165e-05, + "loss": 0.1593, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 15120, + "tokens_per_second_per_gpu": 318.74 + }, + { + "epoch": 1.504150718297957, + "grad_norm": 0.3305923640727997, + "learning_rate": 2.9109213211297103e-05, + "loss": 0.1913, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 15130, + "tokens_per_second_per_gpu": 371.82 + }, + { + "epoch": 1.5051449023214198, + "grad_norm": 0.5967475771903992, + "learning_rate": 2.8998606219473555e-05, + "loss": 0.1741, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 15140, + "tokens_per_second_per_gpu": 376.88 + }, + { + "epoch": 1.5061390863448825, + "grad_norm": 0.5528421998023987, + "learning_rate": 2.888817411908935e-05, + "loss": 0.1523, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 15150, + "tokens_per_second_per_gpu": 323.43 + }, + { + "epoch": 1.507133270368345, + "grad_norm": 0.6192378997802734, + "learning_rate": 2.877791718216214e-05, + "loss": 0.1434, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.16, + "memory/max_allocated (GiB)": 18.16, + "step": 15160, + "tokens_per_second_per_gpu": 300.91 + }, + { + "epoch": 1.508127454391808, + "grad_norm": 0.2290807068347931, + "learning_rate": 2.866783568027802e-05, + "loss": 0.1334, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.45, + "memory/max_allocated (GiB)": 20.45, + "step": 15170, + "tokens_per_second_per_gpu": 382.23 + }, + { + "epoch": 1.5091216384152708, + "grad_norm": 0.748935341835022, + "learning_rate": 2.8557929884591038e-05, + "loss": 0.183, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.84, + "memory/max_allocated (GiB)": 18.84, + "step": 15180, + "tokens_per_second_per_gpu": 358.48 + }, + { + "epoch": 1.5101158224387334, + "grad_norm": 0.6596253514289856, + "learning_rate": 2.844820006582235e-05, + "loss": 0.1251, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 15190, + "tokens_per_second_per_gpu": 329.83 + }, + { + "epoch": 1.511110006462196, + "grad_norm": 0.5581555962562561, + "learning_rate": 2.8338646494259746e-05, + "loss": 0.1731, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.58, + "memory/max_allocated (GiB)": 20.58, + "step": 15200, + "tokens_per_second_per_gpu": 382.43 + }, + { + "epoch": 1.512104190485659, + "grad_norm": 0.525514543056488, + "learning_rate": 2.8229269439756768e-05, + "loss": 0.1252, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 15210, + "tokens_per_second_per_gpu": 398.56 + }, + { + "epoch": 1.5130983745091218, + "grad_norm": 0.5555036664009094, + "learning_rate": 2.812006917173229e-05, + "loss": 0.1561, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 15220, + "tokens_per_second_per_gpu": 360.58 + }, + { + "epoch": 1.5140925585325844, + "grad_norm": 0.4960024058818817, + "learning_rate": 2.801104595916957e-05, + "loss": 0.1428, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.4, + "memory/max_allocated (GiB)": 21.4, + "step": 15230, + "tokens_per_second_per_gpu": 378.86 + }, + { + "epoch": 1.515086742556047, + "grad_norm": 0.6800406575202942, + "learning_rate": 2.7902200070615868e-05, + "loss": 0.2009, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.18, + "memory/max_allocated (GiB)": 18.18, + "step": 15240, + "tokens_per_second_per_gpu": 347.33 + }, + { + "epoch": 1.5160809265795099, + "grad_norm": 0.5688208341598511, + "learning_rate": 2.7793531774181614e-05, + "loss": 0.1421, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 15250, + "tokens_per_second_per_gpu": 346.32 + }, + { + "epoch": 1.5170751106029727, + "grad_norm": 0.6293416619300842, + "learning_rate": 2.7685041337539786e-05, + "loss": 0.1442, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 15260, + "tokens_per_second_per_gpu": 373.42 + }, + { + "epoch": 1.5180692946264354, + "grad_norm": 0.5825393795967102, + "learning_rate": 2.7576729027925286e-05, + "loss": 0.1386, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 15270, + "tokens_per_second_per_gpu": 310.81 + }, + { + "epoch": 1.519063478649898, + "grad_norm": 0.37790921330451965, + "learning_rate": 2.7468595112134165e-05, + "loss": 0.1603, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 15280, + "tokens_per_second_per_gpu": 344.02 + }, + { + "epoch": 1.5200576626733608, + "grad_norm": 0.4327569603919983, + "learning_rate": 2.7360639856523172e-05, + "loss": 0.1681, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 15290, + "tokens_per_second_per_gpu": 340.19 + }, + { + "epoch": 1.5210518466968237, + "grad_norm": 0.272372841835022, + "learning_rate": 2.7252863527008867e-05, + "loss": 0.1851, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 15300, + "tokens_per_second_per_gpu": 380.93 + }, + { + "epoch": 1.5220460307202863, + "grad_norm": 0.40148428082466125, + "learning_rate": 2.7145266389067182e-05, + "loss": 0.1504, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 15310, + "tokens_per_second_per_gpu": 331.6 + }, + { + "epoch": 1.523040214743749, + "grad_norm": 0.6262397170066833, + "learning_rate": 2.703784870773255e-05, + "loss": 0.1614, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 15320, + "tokens_per_second_per_gpu": 431.35 + }, + { + "epoch": 1.5240343987672118, + "grad_norm": 0.3884369730949402, + "learning_rate": 2.6930610747597483e-05, + "loss": 0.1573, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.1, + "memory/max_allocated (GiB)": 19.1, + "step": 15330, + "tokens_per_second_per_gpu": 410.18 + }, + { + "epoch": 1.5250285827906747, + "grad_norm": 0.285769522190094, + "learning_rate": 2.682355277281169e-05, + "loss": 0.1303, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 15340, + "tokens_per_second_per_gpu": 317.47 + }, + { + "epoch": 1.5260227668141373, + "grad_norm": 0.8329140543937683, + "learning_rate": 2.671667504708163e-05, + "loss": 0.1356, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 15350, + "tokens_per_second_per_gpu": 341.69 + }, + { + "epoch": 1.5270169508376, + "grad_norm": 0.40556618571281433, + "learning_rate": 2.6609977833669686e-05, + "loss": 0.1665, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 15360, + "tokens_per_second_per_gpu": 382.84 + }, + { + "epoch": 1.5280111348610628, + "grad_norm": 0.23340922594070435, + "learning_rate": 2.650346139539368e-05, + "loss": 0.1838, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 15370, + "tokens_per_second_per_gpu": 369.68 + }, + { + "epoch": 1.5290053188845256, + "grad_norm": 0.4838123917579651, + "learning_rate": 2.6397125994626128e-05, + "loss": 0.1513, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 15380, + "tokens_per_second_per_gpu": 354.9 + }, + { + "epoch": 1.5299995029079883, + "grad_norm": 0.6558499932289124, + "learning_rate": 2.6290971893293547e-05, + "loss": 0.1673, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 15390, + "tokens_per_second_per_gpu": 355.68 + }, + { + "epoch": 1.5309936869314509, + "grad_norm": 0.7235273718833923, + "learning_rate": 2.618499935287595e-05, + "loss": 0.1269, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 15400, + "tokens_per_second_per_gpu": 347.78 + }, + { + "epoch": 1.5319878709549137, + "grad_norm": 2.2355356216430664, + "learning_rate": 2.6079208634406106e-05, + "loss": 0.1649, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.53, + "memory/max_allocated (GiB)": 20.53, + "step": 15410, + "tokens_per_second_per_gpu": 418.05 + }, + { + "epoch": 1.5329820549783766, + "grad_norm": 0.2927922010421753, + "learning_rate": 2.5973599998468935e-05, + "loss": 0.1413, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.53, + "memory/max_allocated (GiB)": 21.53, + "step": 15420, + "tokens_per_second_per_gpu": 348.29 + }, + { + "epoch": 1.5339762390018392, + "grad_norm": 0.3300843834877014, + "learning_rate": 2.586817370520077e-05, + "loss": 0.1356, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 15430, + "tokens_per_second_per_gpu": 323.42 + }, + { + "epoch": 1.5349704230253018, + "grad_norm": 0.5750816464424133, + "learning_rate": 2.5762930014288933e-05, + "loss": 0.1307, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 15440, + "tokens_per_second_per_gpu": 347.56 + }, + { + "epoch": 1.5359646070487647, + "grad_norm": 0.7192912101745605, + "learning_rate": 2.5657869184970795e-05, + "loss": 0.1837, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 15450, + "tokens_per_second_per_gpu": 347.13 + }, + { + "epoch": 1.5369587910722275, + "grad_norm": 0.7183093428611755, + "learning_rate": 2.555299147603345e-05, + "loss": 0.1978, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 15460, + "tokens_per_second_per_gpu": 372.76 + }, + { + "epoch": 1.5379529750956902, + "grad_norm": 0.27620071172714233, + "learning_rate": 2.5448297145812805e-05, + "loss": 0.1732, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.45, + "memory/max_allocated (GiB)": 20.45, + "step": 15470, + "tokens_per_second_per_gpu": 341.71 + }, + { + "epoch": 1.5389471591191528, + "grad_norm": 0.3960123062133789, + "learning_rate": 2.5343786452193185e-05, + "loss": 0.1229, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 15480, + "tokens_per_second_per_gpu": 351.22 + }, + { + "epoch": 1.5399413431426157, + "grad_norm": 0.4011591970920563, + "learning_rate": 2.5239459652606457e-05, + "loss": 0.1504, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 15490, + "tokens_per_second_per_gpu": 421.47 + }, + { + "epoch": 1.5409355271660785, + "grad_norm": 0.40115076303482056, + "learning_rate": 2.51353170040316e-05, + "loss": 0.116, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 15500, + "tokens_per_second_per_gpu": 326.93 + }, + { + "epoch": 1.5419297111895411, + "grad_norm": 0.487507164478302, + "learning_rate": 2.5031358762994005e-05, + "loss": 0.1467, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.85, + "memory/max_allocated (GiB)": 18.85, + "step": 15510, + "tokens_per_second_per_gpu": 347.74 + }, + { + "epoch": 1.5429238952130038, + "grad_norm": 0.4600401222705841, + "learning_rate": 2.492758518556473e-05, + "loss": 0.1632, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 15520, + "tokens_per_second_per_gpu": 389.11 + }, + { + "epoch": 1.5439180792364666, + "grad_norm": 0.5790596604347229, + "learning_rate": 2.482399652736006e-05, + "loss": 0.1396, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 15530, + "tokens_per_second_per_gpu": 329.02 + }, + { + "epoch": 1.5449122632599295, + "grad_norm": 0.5716336369514465, + "learning_rate": 2.4720593043540752e-05, + "loss": 0.201, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 15540, + "tokens_per_second_per_gpu": 375.4 + }, + { + "epoch": 1.545906447283392, + "grad_norm": 0.5810480117797852, + "learning_rate": 2.461737498881148e-05, + "loss": 0.1525, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 15550, + "tokens_per_second_per_gpu": 331.4 + }, + { + "epoch": 1.5469006313068547, + "grad_norm": 0.3276031017303467, + "learning_rate": 2.451434261742005e-05, + "loss": 0.1237, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 15560, + "tokens_per_second_per_gpu": 303.69 + }, + { + "epoch": 1.5478948153303178, + "grad_norm": 0.40953657031059265, + "learning_rate": 2.4411496183157045e-05, + "loss": 0.131, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 15570, + "tokens_per_second_per_gpu": 341.6 + }, + { + "epoch": 1.5488889993537804, + "grad_norm": 0.34955236315727234, + "learning_rate": 2.4308835939354913e-05, + "loss": 0.1647, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.09, + "memory/max_allocated (GiB)": 19.09, + "step": 15580, + "tokens_per_second_per_gpu": 373.13 + }, + { + "epoch": 1.549883183377243, + "grad_norm": 0.5104487538337708, + "learning_rate": 2.4206362138887584e-05, + "loss": 0.1674, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 15590, + "tokens_per_second_per_gpu": 413.19 + }, + { + "epoch": 1.550877367400706, + "grad_norm": 0.3313903510570526, + "learning_rate": 2.4104075034169628e-05, + "loss": 0.1848, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 15600, + "tokens_per_second_per_gpu": 361.41 + }, + { + "epoch": 1.5518715514241688, + "grad_norm": 0.35265353322029114, + "learning_rate": 2.400197487715585e-05, + "loss": 0.1431, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 15610, + "tokens_per_second_per_gpu": 352.88 + }, + { + "epoch": 1.5528657354476314, + "grad_norm": 0.5764623880386353, + "learning_rate": 2.390006191934048e-05, + "loss": 0.145, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 15620, + "tokens_per_second_per_gpu": 337.91 + }, + { + "epoch": 1.553859919471094, + "grad_norm": 0.5069451928138733, + "learning_rate": 2.3798336411756682e-05, + "loss": 0.1489, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 15630, + "tokens_per_second_per_gpu": 411.7 + }, + { + "epoch": 1.5548541034945569, + "grad_norm": 0.3448779881000519, + "learning_rate": 2.3696798604975933e-05, + "loss": 0.1571, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 15640, + "tokens_per_second_per_gpu": 383.87 + }, + { + "epoch": 1.5558482875180197, + "grad_norm": 0.7202053070068359, + "learning_rate": 2.359544874910723e-05, + "loss": 0.1472, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 15650, + "tokens_per_second_per_gpu": 344.25 + }, + { + "epoch": 1.5568424715414824, + "grad_norm": 0.41816800832748413, + "learning_rate": 2.3494287093796763e-05, + "loss": 0.1428, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 15660, + "tokens_per_second_per_gpu": 343.33 + }, + { + "epoch": 1.557836655564945, + "grad_norm": 0.47676244378089905, + "learning_rate": 2.339331388822701e-05, + "loss": 0.1495, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 15670, + "tokens_per_second_per_gpu": 386.13 + }, + { + "epoch": 1.5588308395884078, + "grad_norm": 0.43887218832969666, + "learning_rate": 2.3292529381116336e-05, + "loss": 0.1541, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 15680, + "tokens_per_second_per_gpu": 345.57 + }, + { + "epoch": 1.5598250236118707, + "grad_norm": 0.5106401443481445, + "learning_rate": 2.319193382071829e-05, + "loss": 0.1124, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 15690, + "tokens_per_second_per_gpu": 339.79 + }, + { + "epoch": 1.5608192076353333, + "grad_norm": 0.6262336373329163, + "learning_rate": 2.3091527454821027e-05, + "loss": 0.1248, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.58, + "memory/max_allocated (GiB)": 20.58, + "step": 15700, + "tokens_per_second_per_gpu": 372.97 + }, + { + "epoch": 1.561813391658796, + "grad_norm": 0.48663491010665894, + "learning_rate": 2.299131053074659e-05, + "loss": 0.1214, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.17, + "memory/max_allocated (GiB)": 18.17, + "step": 15710, + "tokens_per_second_per_gpu": 292.22 + }, + { + "epoch": 1.5628075756822588, + "grad_norm": 0.7886651158332825, + "learning_rate": 2.2891283295350508e-05, + "loss": 0.1655, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.41, + "memory/max_allocated (GiB)": 21.41, + "step": 15720, + "tokens_per_second_per_gpu": 390.33 + }, + { + "epoch": 1.5638017597057217, + "grad_norm": 0.2919699251651764, + "learning_rate": 2.2791445995020943e-05, + "loss": 0.1406, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 15730, + "tokens_per_second_per_gpu": 323.88 + }, + { + "epoch": 1.5647959437291843, + "grad_norm": 0.4909593164920807, + "learning_rate": 2.2691798875678304e-05, + "loss": 0.1673, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 15740, + "tokens_per_second_per_gpu": 320.3 + }, + { + "epoch": 1.565790127752647, + "grad_norm": 0.37527504563331604, + "learning_rate": 2.2592342182774482e-05, + "loss": 0.1746, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 15750, + "tokens_per_second_per_gpu": 418.23 + }, + { + "epoch": 1.5667843117761098, + "grad_norm": 0.5083606243133545, + "learning_rate": 2.249307616129237e-05, + "loss": 0.1401, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 15760, + "tokens_per_second_per_gpu": 366.1 + }, + { + "epoch": 1.5677784957995726, + "grad_norm": 0.5692464113235474, + "learning_rate": 2.2394001055745107e-05, + "loss": 0.1795, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.81, + "memory/max_allocated (GiB)": 19.81, + "step": 15770, + "tokens_per_second_per_gpu": 340.36 + }, + { + "epoch": 1.5687726798230353, + "grad_norm": 0.6630048751831055, + "learning_rate": 2.2295117110175645e-05, + "loss": 0.1981, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 15780, + "tokens_per_second_per_gpu": 361.48 + }, + { + "epoch": 1.5697668638464979, + "grad_norm": 0.6706286072731018, + "learning_rate": 2.2196424568156073e-05, + "loss": 0.1255, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 15790, + "tokens_per_second_per_gpu": 367.95 + }, + { + "epoch": 1.5707610478699607, + "grad_norm": 0.6745642423629761, + "learning_rate": 2.2097923672786913e-05, + "loss": 0.167, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 15800, + "tokens_per_second_per_gpu": 420.69 + }, + { + "epoch": 1.5717552318934236, + "grad_norm": 0.4416200518608093, + "learning_rate": 2.1999614666696733e-05, + "loss": 0.1483, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 15810, + "tokens_per_second_per_gpu": 343.55 + }, + { + "epoch": 1.5727494159168862, + "grad_norm": 0.6229557394981384, + "learning_rate": 2.1901497792041392e-05, + "loss": 0.1796, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 15820, + "tokens_per_second_per_gpu": 333.79 + }, + { + "epoch": 1.5737435999403488, + "grad_norm": 0.6999335289001465, + "learning_rate": 2.1803573290503497e-05, + "loss": 0.2508, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 15830, + "tokens_per_second_per_gpu": 421.46 + }, + { + "epoch": 1.5747377839638117, + "grad_norm": 0.4594232141971588, + "learning_rate": 2.170584140329177e-05, + "loss": 0.1617, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 15840, + "tokens_per_second_per_gpu": 342.47 + }, + { + "epoch": 1.5757319679872746, + "grad_norm": 0.18643365800380707, + "learning_rate": 2.1608302371140533e-05, + "loss": 0.142, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 15850, + "tokens_per_second_per_gpu": 302.45 + }, + { + "epoch": 1.5767261520107372, + "grad_norm": 0.42105787992477417, + "learning_rate": 2.1510956434308992e-05, + "loss": 0.1691, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 15860, + "tokens_per_second_per_gpu": 375.79 + }, + { + "epoch": 1.5777203360341998, + "grad_norm": 0.5371260046958923, + "learning_rate": 2.1413803832580813e-05, + "loss": 0.1398, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.77, + "memory/max_allocated (GiB)": 19.77, + "step": 15870, + "tokens_per_second_per_gpu": 314.27 + }, + { + "epoch": 1.5787145200576627, + "grad_norm": 0.5238702297210693, + "learning_rate": 2.1316844805263346e-05, + "loss": 0.2099, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 15880, + "tokens_per_second_per_gpu": 358.19 + }, + { + "epoch": 1.5797087040811255, + "grad_norm": 0.2773045599460602, + "learning_rate": 2.1220079591187214e-05, + "loss": 0.1528, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 15890, + "tokens_per_second_per_gpu": 332.88 + }, + { + "epoch": 1.5807028881045881, + "grad_norm": 0.6425759196281433, + "learning_rate": 2.112350842870553e-05, + "loss": 0.1474, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 15900, + "tokens_per_second_per_gpu": 303.59 + }, + { + "epoch": 1.5816970721280508, + "grad_norm": 0.3497686982154846, + "learning_rate": 2.1027131555693524e-05, + "loss": 0.1381, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.64, + "memory/max_allocated (GiB)": 20.64, + "step": 15910, + "tokens_per_second_per_gpu": 300.82 + }, + { + "epoch": 1.5826912561515136, + "grad_norm": 0.37605515122413635, + "learning_rate": 2.0930949209547813e-05, + "loss": 0.1004, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 15920, + "tokens_per_second_per_gpu": 326.71 + }, + { + "epoch": 1.5836854401749765, + "grad_norm": 0.6107087731361389, + "learning_rate": 2.08349616271858e-05, + "loss": 0.1309, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 15930, + "tokens_per_second_per_gpu": 325.87 + }, + { + "epoch": 1.584679624198439, + "grad_norm": 0.4413928687572479, + "learning_rate": 2.0739169045045237e-05, + "loss": 0.1575, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 15940, + "tokens_per_second_per_gpu": 370.56 + }, + { + "epoch": 1.5856738082219017, + "grad_norm": 0.3646581470966339, + "learning_rate": 2.064357169908345e-05, + "loss": 0.1437, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 15950, + "tokens_per_second_per_gpu": 362.0 + }, + { + "epoch": 1.5866679922453646, + "grad_norm": 0.5044755935668945, + "learning_rate": 2.054816982477693e-05, + "loss": 0.1873, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 15960, + "tokens_per_second_per_gpu": 335.28 + }, + { + "epoch": 1.5876621762688274, + "grad_norm": 0.5462661981582642, + "learning_rate": 2.045296365712066e-05, + "loss": 0.1646, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 15970, + "tokens_per_second_per_gpu": 380.25 + }, + { + "epoch": 1.58865636029229, + "grad_norm": 0.3154492676258087, + "learning_rate": 2.0357953430627575e-05, + "loss": 0.1461, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.82, + "memory/max_allocated (GiB)": 18.82, + "step": 15980, + "tokens_per_second_per_gpu": 362.04 + }, + { + "epoch": 1.5896505443157527, + "grad_norm": 0.4407619833946228, + "learning_rate": 2.02631393793279e-05, + "loss": 0.1701, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 15990, + "tokens_per_second_per_gpu": 371.26 + }, + { + "epoch": 1.5906447283392156, + "grad_norm": 0.3745664358139038, + "learning_rate": 2.0168521736768732e-05, + "loss": 0.1009, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.62, + "memory/max_allocated (GiB)": 20.62, + "step": 16000, + "tokens_per_second_per_gpu": 348.69 + }, + { + "epoch": 1.5916389123626784, + "grad_norm": 0.7978280186653137, + "learning_rate": 2.007410073601326e-05, + "loss": 0.189, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 16010, + "tokens_per_second_per_gpu": 396.26 + }, + { + "epoch": 1.592633096386141, + "grad_norm": 0.5025820136070251, + "learning_rate": 1.9979876609640437e-05, + "loss": 0.1536, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 16020, + "tokens_per_second_per_gpu": 389.34 + }, + { + "epoch": 1.5936272804096037, + "grad_norm": 0.37046942114830017, + "learning_rate": 1.988584958974412e-05, + "loss": 0.0939, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 16030, + "tokens_per_second_per_gpu": 336.01 + }, + { + "epoch": 1.5946214644330665, + "grad_norm": 0.40366023778915405, + "learning_rate": 1.979201990793279e-05, + "loss": 0.131, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 16040, + "tokens_per_second_per_gpu": 366.64 + }, + { + "epoch": 1.5956156484565294, + "grad_norm": 0.2725692689418793, + "learning_rate": 1.9698387795328788e-05, + "loss": 0.1501, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 16050, + "tokens_per_second_per_gpu": 410.98 + }, + { + "epoch": 1.596609832479992, + "grad_norm": 0.5428498983383179, + "learning_rate": 1.9604953482567756e-05, + "loss": 0.1891, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 16060, + "tokens_per_second_per_gpu": 418.32 + }, + { + "epoch": 1.5976040165034546, + "grad_norm": 0.5400422215461731, + "learning_rate": 1.9511717199798208e-05, + "loss": 0.1726, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 16070, + "tokens_per_second_per_gpu": 399.7 + }, + { + "epoch": 1.5985982005269175, + "grad_norm": 0.5957368612289429, + "learning_rate": 1.9418679176680743e-05, + "loss": 0.1893, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 16080, + "tokens_per_second_per_gpu": 388.42 + }, + { + "epoch": 1.5995923845503803, + "grad_norm": 0.8663964867591858, + "learning_rate": 1.9325839642387755e-05, + "loss": 0.1314, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 16090, + "tokens_per_second_per_gpu": 297.89 + }, + { + "epoch": 1.600586568573843, + "grad_norm": 0.46175047755241394, + "learning_rate": 1.9233198825602572e-05, + "loss": 0.1427, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 16100, + "tokens_per_second_per_gpu": 322.74 + }, + { + "epoch": 1.6015807525973058, + "grad_norm": 0.5290225148200989, + "learning_rate": 1.9140756954519136e-05, + "loss": 0.1824, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 16110, + "tokens_per_second_per_gpu": 423.78 + }, + { + "epoch": 1.6025749366207687, + "grad_norm": 0.28334707021713257, + "learning_rate": 1.904851425684131e-05, + "loss": 0.1428, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.58, + "memory/max_allocated (GiB)": 20.58, + "step": 16120, + "tokens_per_second_per_gpu": 358.47 + }, + { + "epoch": 1.6035691206442313, + "grad_norm": 0.4152858555316925, + "learning_rate": 1.895647095978238e-05, + "loss": 0.1508, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 16130, + "tokens_per_second_per_gpu": 368.35 + }, + { + "epoch": 1.604563304667694, + "grad_norm": 0.5704591274261475, + "learning_rate": 1.8864627290064396e-05, + "loss": 0.1618, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 16140, + "tokens_per_second_per_gpu": 371.9 + }, + { + "epoch": 1.6055574886911568, + "grad_norm": 0.5318484902381897, + "learning_rate": 1.877298347391777e-05, + "loss": 0.1667, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.58, + "memory/max_allocated (GiB)": 20.58, + "step": 16150, + "tokens_per_second_per_gpu": 357.18 + }, + { + "epoch": 1.6065516727146196, + "grad_norm": 0.456482470035553, + "learning_rate": 1.8681539737080543e-05, + "loss": 0.152, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 16160, + "tokens_per_second_per_gpu": 340.75 + }, + { + "epoch": 1.6075458567380823, + "grad_norm": 0.6000080108642578, + "learning_rate": 1.8590296304797996e-05, + "loss": 0.1347, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 16170, + "tokens_per_second_per_gpu": 329.23 + }, + { + "epoch": 1.6085400407615449, + "grad_norm": 0.48490649461746216, + "learning_rate": 1.8499253401822004e-05, + "loss": 0.1503, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 16180, + "tokens_per_second_per_gpu": 304.69 + }, + { + "epoch": 1.6095342247850077, + "grad_norm": 0.5004350543022156, + "learning_rate": 1.840841125241044e-05, + "loss": 0.18, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.82, + "memory/max_allocated (GiB)": 19.82, + "step": 16190, + "tokens_per_second_per_gpu": 352.28 + }, + { + "epoch": 1.6105284088084706, + "grad_norm": 0.5725821256637573, + "learning_rate": 1.8317770080326757e-05, + "loss": 0.171, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 16200, + "tokens_per_second_per_gpu": 427.8 + }, + { + "epoch": 1.6115225928319332, + "grad_norm": 0.47481808066368103, + "learning_rate": 1.822733010883928e-05, + "loss": 0.1318, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.65, + "memory/max_allocated (GiB)": 20.65, + "step": 16210, + "tokens_per_second_per_gpu": 371.38 + }, + { + "epoch": 1.6125167768553959, + "grad_norm": 0.9544722437858582, + "learning_rate": 1.813709156072081e-05, + "loss": 0.1429, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 16220, + "tokens_per_second_per_gpu": 353.59 + }, + { + "epoch": 1.6135109608788587, + "grad_norm": 0.5491587519645691, + "learning_rate": 1.804705465824793e-05, + "loss": 0.1672, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.41, + "memory/max_allocated (GiB)": 21.41, + "step": 16230, + "tokens_per_second_per_gpu": 359.87 + }, + { + "epoch": 1.6145051449023216, + "grad_norm": 0.6248442530632019, + "learning_rate": 1.795721962320057e-05, + "loss": 0.1793, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 16240, + "tokens_per_second_per_gpu": 367.54 + }, + { + "epoch": 1.6154993289257842, + "grad_norm": 0.28726670145988464, + "learning_rate": 1.7867586676861416e-05, + "loss": 0.1438, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 16250, + "tokens_per_second_per_gpu": 278.32 + }, + { + "epoch": 1.6164935129492468, + "grad_norm": 0.36886531114578247, + "learning_rate": 1.7778156040015393e-05, + "loss": 0.182, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 16260, + "tokens_per_second_per_gpu": 396.55 + }, + { + "epoch": 1.6174876969727097, + "grad_norm": 0.3643040955066681, + "learning_rate": 1.7688927932948983e-05, + "loss": 0.1251, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 16270, + "tokens_per_second_per_gpu": 315.86 + }, + { + "epoch": 1.6184818809961725, + "grad_norm": 0.4020111560821533, + "learning_rate": 1.7599902575449955e-05, + "loss": 0.1725, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 16280, + "tokens_per_second_per_gpu": 434.3 + }, + { + "epoch": 1.6194760650196351, + "grad_norm": 0.5112940073013306, + "learning_rate": 1.7511080186806518e-05, + "loss": 0.1305, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 16290, + "tokens_per_second_per_gpu": 260.92 + }, + { + "epoch": 1.6204702490430978, + "grad_norm": 0.7122372984886169, + "learning_rate": 1.742246098580701e-05, + "loss": 0.1869, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 16300, + "tokens_per_second_per_gpu": 320.49 + }, + { + "epoch": 1.6214644330665606, + "grad_norm": 0.6411992311477661, + "learning_rate": 1.7334045190739277e-05, + "loss": 0.2079, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 16310, + "tokens_per_second_per_gpu": 406.2 + }, + { + "epoch": 1.6224586170900235, + "grad_norm": 0.673321008682251, + "learning_rate": 1.7245833019390055e-05, + "loss": 0.1607, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 16320, + "tokens_per_second_per_gpu": 325.22 + }, + { + "epoch": 1.623452801113486, + "grad_norm": 0.6042284369468689, + "learning_rate": 1.7157824689044632e-05, + "loss": 0.1823, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 16330, + "tokens_per_second_per_gpu": 340.57 + }, + { + "epoch": 1.6244469851369487, + "grad_norm": 0.3080300986766815, + "learning_rate": 1.7070020416486065e-05, + "loss": 0.125, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 16340, + "tokens_per_second_per_gpu": 342.14 + }, + { + "epoch": 1.6254411691604116, + "grad_norm": 0.33366823196411133, + "learning_rate": 1.6982420417994893e-05, + "loss": 0.1487, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 16350, + "tokens_per_second_per_gpu": 348.66 + }, + { + "epoch": 1.6264353531838744, + "grad_norm": 0.5744956731796265, + "learning_rate": 1.6895024909348367e-05, + "loss": 0.1528, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.19, + "memory/max_allocated (GiB)": 18.19, + "step": 16360, + "tokens_per_second_per_gpu": 371.23 + }, + { + "epoch": 1.627429537207337, + "grad_norm": 0.578666090965271, + "learning_rate": 1.6807834105820163e-05, + "loss": 0.1521, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 16370, + "tokens_per_second_per_gpu": 356.8 + }, + { + "epoch": 1.6284237212307997, + "grad_norm": 0.5622230768203735, + "learning_rate": 1.6720848222179587e-05, + "loss": 0.1405, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 16380, + "tokens_per_second_per_gpu": 351.53 + }, + { + "epoch": 1.6294179052542626, + "grad_norm": 0.562519371509552, + "learning_rate": 1.6634067472691283e-05, + "loss": 0.1543, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 16390, + "tokens_per_second_per_gpu": 360.87 + }, + { + "epoch": 1.6304120892777254, + "grad_norm": 0.5723958611488342, + "learning_rate": 1.65474920711146e-05, + "loss": 0.1881, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 16400, + "tokens_per_second_per_gpu": 366.21 + }, + { + "epoch": 1.631406273301188, + "grad_norm": 0.5237053632736206, + "learning_rate": 1.646112223070305e-05, + "loss": 0.148, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 16410, + "tokens_per_second_per_gpu": 354.74 + }, + { + "epoch": 1.6324004573246507, + "grad_norm": 0.576963484287262, + "learning_rate": 1.6374958164203768e-05, + "loss": 0.1729, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 16420, + "tokens_per_second_per_gpu": 374.69 + }, + { + "epoch": 1.6333946413481135, + "grad_norm": 0.5174708366394043, + "learning_rate": 1.6289000083857088e-05, + "loss": 0.1487, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 16430, + "tokens_per_second_per_gpu": 414.81 + }, + { + "epoch": 1.6343888253715764, + "grad_norm": 0.44571495056152344, + "learning_rate": 1.620324820139595e-05, + "loss": 0.1505, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 16440, + "tokens_per_second_per_gpu": 359.78 + }, + { + "epoch": 1.635383009395039, + "grad_norm": 0.5897732377052307, + "learning_rate": 1.61177027280453e-05, + "loss": 0.1283, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 16450, + "tokens_per_second_per_gpu": 328.51 + }, + { + "epoch": 1.6363771934185016, + "grad_norm": 0.5686812400817871, + "learning_rate": 1.6032363874521804e-05, + "loss": 0.2051, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 16460, + "tokens_per_second_per_gpu": 373.45 + }, + { + "epoch": 1.6373713774419645, + "grad_norm": 0.6155579686164856, + "learning_rate": 1.5947231851033016e-05, + "loss": 0.1437, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 16470, + "tokens_per_second_per_gpu": 329.49 + }, + { + "epoch": 1.6383655614654273, + "grad_norm": 0.4496712386608124, + "learning_rate": 1.5862306867277155e-05, + "loss": 0.1803, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.77, + "memory/max_allocated (GiB)": 20.77, + "step": 16480, + "tokens_per_second_per_gpu": 373.25 + }, + { + "epoch": 1.63935974548889, + "grad_norm": 0.696696937084198, + "learning_rate": 1.5777589132442373e-05, + "loss": 0.1214, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 16490, + "tokens_per_second_per_gpu": 342.62 + }, + { + "epoch": 1.6403539295123526, + "grad_norm": 0.5139226317405701, + "learning_rate": 1.569307885520639e-05, + "loss": 0.1722, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 16500, + "tokens_per_second_per_gpu": 391.42 + }, + { + "epoch": 1.6413481135358154, + "grad_norm": 0.5510913133621216, + "learning_rate": 1.5608776243735834e-05, + "loss": 0.1549, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 16510, + "tokens_per_second_per_gpu": 353.41 + }, + { + "epoch": 1.6423422975592783, + "grad_norm": 0.4640497863292694, + "learning_rate": 1.5524681505685888e-05, + "loss": 0.1431, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.77, + "memory/max_allocated (GiB)": 20.77, + "step": 16520, + "tokens_per_second_per_gpu": 387.36 + }, + { + "epoch": 1.643336481582741, + "grad_norm": 0.59361332654953, + "learning_rate": 1.5440794848199657e-05, + "loss": 0.1749, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 16530, + "tokens_per_second_per_gpu": 407.49 + }, + { + "epoch": 1.6443306656062036, + "grad_norm": 0.552768349647522, + "learning_rate": 1.5357116477907728e-05, + "loss": 0.1278, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 16540, + "tokens_per_second_per_gpu": 381.79 + }, + { + "epoch": 1.6453248496296664, + "grad_norm": 0.7618371248245239, + "learning_rate": 1.5273646600927583e-05, + "loss": 0.163, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 16550, + "tokens_per_second_per_gpu": 364.2 + }, + { + "epoch": 1.6463190336531293, + "grad_norm": 0.5775349140167236, + "learning_rate": 1.5190385422863174e-05, + "loss": 0.1467, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 16560, + "tokens_per_second_per_gpu": 367.94 + }, + { + "epoch": 1.647313217676592, + "grad_norm": 0.5621904134750366, + "learning_rate": 1.5107333148804414e-05, + "loss": 0.1599, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 16570, + "tokens_per_second_per_gpu": 324.44 + }, + { + "epoch": 1.6483074017000545, + "grad_norm": 0.565984845161438, + "learning_rate": 1.5024489983326562e-05, + "loss": 0.2052, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.67, + "memory/max_allocated (GiB)": 19.67, + "step": 16580, + "tokens_per_second_per_gpu": 412.88 + }, + { + "epoch": 1.6493015857235174, + "grad_norm": 0.456301748752594, + "learning_rate": 1.4941856130489884e-05, + "loss": 0.1494, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 16590, + "tokens_per_second_per_gpu": 374.55 + }, + { + "epoch": 1.6502957697469802, + "grad_norm": 0.414070725440979, + "learning_rate": 1.4859431793838995e-05, + "loss": 0.131, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 16600, + "tokens_per_second_per_gpu": 396.68 + }, + { + "epoch": 1.6512899537704429, + "grad_norm": 0.6677629947662354, + "learning_rate": 1.477721717640248e-05, + "loss": 0.1532, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.4, + "memory/max_allocated (GiB)": 21.4, + "step": 16610, + "tokens_per_second_per_gpu": 419.18 + }, + { + "epoch": 1.6522841377939057, + "grad_norm": 0.22176572680473328, + "learning_rate": 1.4695212480692277e-05, + "loss": 0.1519, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.77, + "memory/max_allocated (GiB)": 20.77, + "step": 16620, + "tokens_per_second_per_gpu": 401.78 + }, + { + "epoch": 1.6532783218173686, + "grad_norm": 0.6775956749916077, + "learning_rate": 1.4613417908703342e-05, + "loss": 0.1728, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 16630, + "tokens_per_second_per_gpu": 349.56 + }, + { + "epoch": 1.6542725058408312, + "grad_norm": 0.4293968081474304, + "learning_rate": 1.4531833661912942e-05, + "loss": 0.1076, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 16640, + "tokens_per_second_per_gpu": 314.88 + }, + { + "epoch": 1.6552666898642938, + "grad_norm": 0.5610436797142029, + "learning_rate": 1.445045994128037e-05, + "loss": 0.1682, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 16650, + "tokens_per_second_per_gpu": 363.34 + }, + { + "epoch": 1.6562608738877567, + "grad_norm": 0.616041362285614, + "learning_rate": 1.4369296947246236e-05, + "loss": 0.1587, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 16660, + "tokens_per_second_per_gpu": 362.03 + }, + { + "epoch": 1.6572550579112195, + "grad_norm": 0.3607831299304962, + "learning_rate": 1.4288344879732185e-05, + "loss": 0.1889, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.86, + "memory/max_allocated (GiB)": 18.86, + "step": 16670, + "tokens_per_second_per_gpu": 371.37 + }, + { + "epoch": 1.6582492419346821, + "grad_norm": 0.2895027697086334, + "learning_rate": 1.420760393814028e-05, + "loss": 0.1525, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 16680, + "tokens_per_second_per_gpu": 375.97 + }, + { + "epoch": 1.6592434259581448, + "grad_norm": 0.35571709275245667, + "learning_rate": 1.4127074321352517e-05, + "loss": 0.1653, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.4, + "memory/max_allocated (GiB)": 21.4, + "step": 16690, + "tokens_per_second_per_gpu": 359.31 + }, + { + "epoch": 1.6602376099816076, + "grad_norm": 0.6197568774223328, + "learning_rate": 1.404675622773034e-05, + "loss": 0.1795, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 16700, + "tokens_per_second_per_gpu": 320.06 + }, + { + "epoch": 1.6612317940050705, + "grad_norm": 0.5286340713500977, + "learning_rate": 1.3966649855114211e-05, + "loss": 0.1592, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 16710, + "tokens_per_second_per_gpu": 333.26 + }, + { + "epoch": 1.6622259780285331, + "grad_norm": 0.4165874719619751, + "learning_rate": 1.3886755400823071e-05, + "loss": 0.1282, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.84, + "memory/max_allocated (GiB)": 18.84, + "step": 16720, + "tokens_per_second_per_gpu": 383.56 + }, + { + "epoch": 1.6632201620519957, + "grad_norm": 0.4585205316543579, + "learning_rate": 1.3807073061653809e-05, + "loss": 0.2177, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 16730, + "tokens_per_second_per_gpu": 423.57 + }, + { + "epoch": 1.6642143460754586, + "grad_norm": 0.4411196708679199, + "learning_rate": 1.372760303388091e-05, + "loss": 0.104, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 16740, + "tokens_per_second_per_gpu": 305.5 + }, + { + "epoch": 1.6652085300989214, + "grad_norm": 0.5241126418113708, + "learning_rate": 1.36483455132558e-05, + "loss": 0.1809, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 16750, + "tokens_per_second_per_gpu": 356.92 + }, + { + "epoch": 1.666202714122384, + "grad_norm": 0.6349103450775146, + "learning_rate": 1.3569300695006548e-05, + "loss": 0.1311, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 16760, + "tokens_per_second_per_gpu": 388.79 + }, + { + "epoch": 1.6671968981458467, + "grad_norm": 0.46360382437705994, + "learning_rate": 1.3490468773837217e-05, + "loss": 0.1204, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.64, + "memory/max_allocated (GiB)": 20.64, + "step": 16770, + "tokens_per_second_per_gpu": 317.52 + }, + { + "epoch": 1.6681910821693096, + "grad_norm": 0.49904516339302063, + "learning_rate": 1.3411849943927513e-05, + "loss": 0.1616, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 16780, + "tokens_per_second_per_gpu": 355.09 + }, + { + "epoch": 1.6691852661927724, + "grad_norm": 0.19544100761413574, + "learning_rate": 1.3333444398932205e-05, + "loss": 0.1091, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 16790, + "tokens_per_second_per_gpu": 289.19 + }, + { + "epoch": 1.670179450216235, + "grad_norm": 0.31488218903541565, + "learning_rate": 1.325525233198076e-05, + "loss": 0.1004, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 16800, + "tokens_per_second_per_gpu": 316.73 + }, + { + "epoch": 1.6711736342396977, + "grad_norm": 0.5296607613563538, + "learning_rate": 1.3177273935676715e-05, + "loss": 0.1112, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 16810, + "tokens_per_second_per_gpu": 343.62 + }, + { + "epoch": 1.6721678182631605, + "grad_norm": 0.4741019308567047, + "learning_rate": 1.3099509402097377e-05, + "loss": 0.1181, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.11, + "memory/max_allocated (GiB)": 19.11, + "step": 16820, + "tokens_per_second_per_gpu": 337.04 + }, + { + "epoch": 1.6731620022866234, + "grad_norm": 0.545759916305542, + "learning_rate": 1.3021958922793209e-05, + "loss": 0.1247, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 16830, + "tokens_per_second_per_gpu": 347.83 + }, + { + "epoch": 1.674156186310086, + "grad_norm": 0.5977072715759277, + "learning_rate": 1.2944622688787445e-05, + "loss": 0.1867, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 16840, + "tokens_per_second_per_gpu": 371.62 + }, + { + "epoch": 1.6751503703335486, + "grad_norm": 0.6563873887062073, + "learning_rate": 1.2867500890575601e-05, + "loss": 0.2069, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 16850, + "tokens_per_second_per_gpu": 339.57 + }, + { + "epoch": 1.6761445543570115, + "grad_norm": 0.5459251999855042, + "learning_rate": 1.279059371812491e-05, + "loss": 0.1413, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 16860, + "tokens_per_second_per_gpu": 355.16 + }, + { + "epoch": 1.6771387383804743, + "grad_norm": 0.5088791847229004, + "learning_rate": 1.2713901360874037e-05, + "loss": 0.1136, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 16870, + "tokens_per_second_per_gpu": 327.52 + }, + { + "epoch": 1.678132922403937, + "grad_norm": 0.6554404497146606, + "learning_rate": 1.2637424007732434e-05, + "loss": 0.1708, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 16880, + "tokens_per_second_per_gpu": 335.39 + }, + { + "epoch": 1.6791271064273996, + "grad_norm": 0.49274370074272156, + "learning_rate": 1.2561161847080028e-05, + "loss": 0.1766, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 16890, + "tokens_per_second_per_gpu": 382.39 + }, + { + "epoch": 1.6801212904508624, + "grad_norm": 0.6049704551696777, + "learning_rate": 1.2485115066766584e-05, + "loss": 0.1887, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 16900, + "tokens_per_second_per_gpu": 395.86 + }, + { + "epoch": 1.6811154744743253, + "grad_norm": 0.45951762795448303, + "learning_rate": 1.2409283854111442e-05, + "loss": 0.1627, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 16910, + "tokens_per_second_per_gpu": 374.46 + }, + { + "epoch": 1.682109658497788, + "grad_norm": 0.31577378511428833, + "learning_rate": 1.2333668395902875e-05, + "loss": 0.1635, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.64, + "memory/max_allocated (GiB)": 20.64, + "step": 16920, + "tokens_per_second_per_gpu": 302.79 + }, + { + "epoch": 1.6831038425212506, + "grad_norm": 0.3830484449863434, + "learning_rate": 1.225826887839776e-05, + "loss": 0.0991, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 16930, + "tokens_per_second_per_gpu": 384.62 + }, + { + "epoch": 1.6840980265447134, + "grad_norm": 0.4875403642654419, + "learning_rate": 1.2183085487321022e-05, + "loss": 0.1612, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 16940, + "tokens_per_second_per_gpu": 393.48 + }, + { + "epoch": 1.6850922105681763, + "grad_norm": 0.4757268726825714, + "learning_rate": 1.2108118407865254e-05, + "loss": 0.1823, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.61, + "memory/max_allocated (GiB)": 20.61, + "step": 16950, + "tokens_per_second_per_gpu": 357.56 + }, + { + "epoch": 1.686086394591639, + "grad_norm": 0.3833984136581421, + "learning_rate": 1.2033367824690223e-05, + "loss": 0.1261, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 16960, + "tokens_per_second_per_gpu": 431.85 + }, + { + "epoch": 1.6870805786151015, + "grad_norm": 0.35014206171035767, + "learning_rate": 1.1958833921922418e-05, + "loss": 0.1545, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 16970, + "tokens_per_second_per_gpu": 309.84 + }, + { + "epoch": 1.6880747626385644, + "grad_norm": 0.5870986580848694, + "learning_rate": 1.1884516883154606e-05, + "loss": 0.1116, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 16980, + "tokens_per_second_per_gpu": 330.54 + }, + { + "epoch": 1.6890689466620272, + "grad_norm": 0.3304491639137268, + "learning_rate": 1.1810416891445319e-05, + "loss": 0.1723, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 16990, + "tokens_per_second_per_gpu": 352.93 + }, + { + "epoch": 1.6900631306854899, + "grad_norm": 0.295502632856369, + "learning_rate": 1.1736534129318532e-05, + "loss": 0.1596, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 17000, + "tokens_per_second_per_gpu": 365.23 + }, + { + "epoch": 1.6910573147089525, + "grad_norm": 0.48762744665145874, + "learning_rate": 1.1662868778763092e-05, + "loss": 0.1732, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 17010, + "tokens_per_second_per_gpu": 390.51 + }, + { + "epoch": 1.6920514987324153, + "grad_norm": 0.4020582437515259, + "learning_rate": 1.1589421021232338e-05, + "loss": 0.1351, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 17020, + "tokens_per_second_per_gpu": 345.17 + }, + { + "epoch": 1.6930456827558782, + "grad_norm": 0.5929551124572754, + "learning_rate": 1.1516191037643598e-05, + "loss": 0.1425, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 17030, + "tokens_per_second_per_gpu": 344.59 + }, + { + "epoch": 1.6940398667793408, + "grad_norm": 0.47912150621414185, + "learning_rate": 1.1443179008377825e-05, + "loss": 0.1139, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 17040, + "tokens_per_second_per_gpu": 351.04 + }, + { + "epoch": 1.6950340508028034, + "grad_norm": 0.3503008782863617, + "learning_rate": 1.1370385113279047e-05, + "loss": 0.1163, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 17050, + "tokens_per_second_per_gpu": 329.13 + }, + { + "epoch": 1.6960282348262663, + "grad_norm": 0.3980488181114197, + "learning_rate": 1.1297809531654046e-05, + "loss": 0.1619, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.78, + "memory/max_allocated (GiB)": 20.78, + "step": 17060, + "tokens_per_second_per_gpu": 381.97 + }, + { + "epoch": 1.6970224188497292, + "grad_norm": 0.5639301538467407, + "learning_rate": 1.1225452442271789e-05, + "loss": 0.1308, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 17070, + "tokens_per_second_per_gpu": 352.08 + }, + { + "epoch": 1.6980166028731918, + "grad_norm": 0.20409265160560608, + "learning_rate": 1.1153314023363126e-05, + "loss": 0.1434, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 17080, + "tokens_per_second_per_gpu": 318.47 + }, + { + "epoch": 1.6990107868966544, + "grad_norm": 0.3582451343536377, + "learning_rate": 1.1081394452620164e-05, + "loss": 0.1646, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 17090, + "tokens_per_second_per_gpu": 296.97 + }, + { + "epoch": 1.7000049709201173, + "grad_norm": 0.396576851606369, + "learning_rate": 1.100969390719605e-05, + "loss": 0.1628, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 17100, + "tokens_per_second_per_gpu": 324.04 + }, + { + "epoch": 1.7009991549435801, + "grad_norm": 0.53066086769104, + "learning_rate": 1.0938212563704364e-05, + "loss": 0.1304, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 17110, + "tokens_per_second_per_gpu": 414.9 + }, + { + "epoch": 1.7019933389670427, + "grad_norm": 0.4172995388507843, + "learning_rate": 1.0866950598218772e-05, + "loss": 0.1336, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 17120, + "tokens_per_second_per_gpu": 319.1 + }, + { + "epoch": 1.7029875229905056, + "grad_norm": 0.8753495216369629, + "learning_rate": 1.0795908186272585e-05, + "loss": 0.1354, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 17130, + "tokens_per_second_per_gpu": 303.94 + }, + { + "epoch": 1.7039817070139684, + "grad_norm": 0.5303057432174683, + "learning_rate": 1.0725085502858223e-05, + "loss": 0.1675, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 17140, + "tokens_per_second_per_gpu": 418.9 + }, + { + "epoch": 1.704975891037431, + "grad_norm": 0.3423836827278137, + "learning_rate": 1.0654482722426984e-05, + "loss": 0.1446, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 17150, + "tokens_per_second_per_gpu": 376.63 + }, + { + "epoch": 1.7059700750608937, + "grad_norm": 0.46940287947654724, + "learning_rate": 1.0584100018888376e-05, + "loss": 0.173, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.18, + "memory/max_allocated (GiB)": 18.18, + "step": 17160, + "tokens_per_second_per_gpu": 398.16 + }, + { + "epoch": 1.7069642590843566, + "grad_norm": 0.5457295775413513, + "learning_rate": 1.0513937565609922e-05, + "loss": 0.1594, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.84, + "memory/max_allocated (GiB)": 18.84, + "step": 17170, + "tokens_per_second_per_gpu": 334.42 + }, + { + "epoch": 1.7079584431078194, + "grad_norm": 0.5194307565689087, + "learning_rate": 1.044399553541653e-05, + "loss": 0.0897, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 17180, + "tokens_per_second_per_gpu": 313.63 + }, + { + "epoch": 1.708952627131282, + "grad_norm": 0.6305141448974609, + "learning_rate": 1.0374274100590254e-05, + "loss": 0.157, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 17190, + "tokens_per_second_per_gpu": 356.77 + }, + { + "epoch": 1.7099468111547447, + "grad_norm": 0.7084689140319824, + "learning_rate": 1.0304773432869675e-05, + "loss": 0.2061, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 17200, + "tokens_per_second_per_gpu": 396.04 + }, + { + "epoch": 1.7109409951782075, + "grad_norm": 0.5986453294754028, + "learning_rate": 1.0235493703449673e-05, + "loss": 0.1498, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 17.01, + "memory/max_allocated (GiB)": 17.01, + "step": 17210, + "tokens_per_second_per_gpu": 333.56 + }, + { + "epoch": 1.7119351792016704, + "grad_norm": 0.4625334143638611, + "learning_rate": 1.0166435082980818e-05, + "loss": 0.1473, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 17220, + "tokens_per_second_per_gpu": 344.72 + }, + { + "epoch": 1.712929363225133, + "grad_norm": 0.3249736726284027, + "learning_rate": 1.0097597741569109e-05, + "loss": 0.1247, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 17230, + "tokens_per_second_per_gpu": 387.91 + }, + { + "epoch": 1.7139235472485956, + "grad_norm": 0.395663321018219, + "learning_rate": 1.0028981848775499e-05, + "loss": 0.1227, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 17240, + "tokens_per_second_per_gpu": 400.44 + }, + { + "epoch": 1.7149177312720585, + "grad_norm": 0.3278411030769348, + "learning_rate": 9.960587573615376e-06, + "loss": 0.1161, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 17250, + "tokens_per_second_per_gpu": 419.85 + }, + { + "epoch": 1.7159119152955213, + "grad_norm": 0.27274656295776367, + "learning_rate": 9.892415084558315e-06, + "loss": 0.1541, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 17260, + "tokens_per_second_per_gpu": 357.93 + }, + { + "epoch": 1.716906099318984, + "grad_norm": 0.6081755757331848, + "learning_rate": 9.82446454952759e-06, + "loss": 0.1644, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 17270, + "tokens_per_second_per_gpu": 381.9 + }, + { + "epoch": 1.7179002833424466, + "grad_norm": 0.5921940207481384, + "learning_rate": 9.756736135899724e-06, + "loss": 0.1686, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 17280, + "tokens_per_second_per_gpu": 422.69 + }, + { + "epoch": 1.7188944673659095, + "grad_norm": 0.43674856424331665, + "learning_rate": 9.68923001050408e-06, + "loss": 0.1546, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 17290, + "tokens_per_second_per_gpu": 370.0 + }, + { + "epoch": 1.7198886513893723, + "grad_norm": 0.39371129870414734, + "learning_rate": 9.621946339622567e-06, + "loss": 0.1627, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 17300, + "tokens_per_second_per_gpu": 404.49 + }, + { + "epoch": 1.720882835412835, + "grad_norm": 0.6137861609458923, + "learning_rate": 9.554885288989035e-06, + "loss": 0.1846, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 17310, + "tokens_per_second_per_gpu": 400.34 + }, + { + "epoch": 1.7218770194362976, + "grad_norm": 0.43737614154815674, + "learning_rate": 9.488047023789059e-06, + "loss": 0.1593, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 17320, + "tokens_per_second_per_gpu": 365.64 + }, + { + "epoch": 1.7228712034597604, + "grad_norm": 0.20106801390647888, + "learning_rate": 9.42143170865939e-06, + "loss": 0.1476, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 17330, + "tokens_per_second_per_gpu": 366.76 + }, + { + "epoch": 1.7238653874832233, + "grad_norm": 0.652117133140564, + "learning_rate": 9.355039507687657e-06, + "loss": 0.191, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.37, + "memory/max_allocated (GiB)": 21.37, + "step": 17340, + "tokens_per_second_per_gpu": 433.9 + }, + { + "epoch": 1.724859571506686, + "grad_norm": 0.39825040102005005, + "learning_rate": 9.288870584411835e-06, + "loss": 0.1297, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 17350, + "tokens_per_second_per_gpu": 313.48 + }, + { + "epoch": 1.7258537555301485, + "grad_norm": 0.20952925086021423, + "learning_rate": 9.222925101820012e-06, + "loss": 0.1617, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 17360, + "tokens_per_second_per_gpu": 382.53 + }, + { + "epoch": 1.7268479395536114, + "grad_norm": 0.6603105068206787, + "learning_rate": 9.157203222349853e-06, + "loss": 0.2117, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 17370, + "tokens_per_second_per_gpu": 424.27 + }, + { + "epoch": 1.7278421235770742, + "grad_norm": 0.5097489356994629, + "learning_rate": 9.091705107888204e-06, + "loss": 0.181, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 17380, + "tokens_per_second_per_gpu": 487.99 + }, + { + "epoch": 1.7288363076005369, + "grad_norm": 0.674410879611969, + "learning_rate": 9.026430919770767e-06, + "loss": 0.1491, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 17390, + "tokens_per_second_per_gpu": 354.95 + }, + { + "epoch": 1.7298304916239995, + "grad_norm": 0.370148241519928, + "learning_rate": 8.961380818781695e-06, + "loss": 0.1672, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.54, + "memory/max_allocated (GiB)": 21.54, + "step": 17400, + "tokens_per_second_per_gpu": 397.8 + }, + { + "epoch": 1.7308246756474623, + "grad_norm": 0.5135669112205505, + "learning_rate": 8.896554965153126e-06, + "loss": 0.1515, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.4, + "memory/max_allocated (GiB)": 21.4, + "step": 17410, + "tokens_per_second_per_gpu": 327.7 + }, + { + "epoch": 1.7318188596709252, + "grad_norm": 0.5589896440505981, + "learning_rate": 8.831953518564816e-06, + "loss": 0.0882, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 17420, + "tokens_per_second_per_gpu": 296.23 + }, + { + "epoch": 1.7328130436943878, + "grad_norm": 0.6967418789863586, + "learning_rate": 8.767576638143804e-06, + "loss": 0.1411, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 17430, + "tokens_per_second_per_gpu": 365.21 + }, + { + "epoch": 1.7338072277178505, + "grad_norm": 0.7863910794258118, + "learning_rate": 8.70342448246394e-06, + "loss": 0.1521, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 17440, + "tokens_per_second_per_gpu": 379.01 + }, + { + "epoch": 1.7348014117413133, + "grad_norm": 0.49855732917785645, + "learning_rate": 8.639497209545556e-06, + "loss": 0.1717, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 17450, + "tokens_per_second_per_gpu": 351.33 + }, + { + "epoch": 1.7357955957647762, + "grad_norm": 0.5448564291000366, + "learning_rate": 8.57579497685501e-06, + "loss": 0.1185, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 17460, + "tokens_per_second_per_gpu": 338.51 + }, + { + "epoch": 1.7367897797882388, + "grad_norm": 0.9186548590660095, + "learning_rate": 8.512317941304404e-06, + "loss": 0.1824, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 17470, + "tokens_per_second_per_gpu": 397.66 + }, + { + "epoch": 1.7377839638117014, + "grad_norm": 0.6779747605323792, + "learning_rate": 8.44906625925106e-06, + "loss": 0.1354, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 17480, + "tokens_per_second_per_gpu": 348.68 + }, + { + "epoch": 1.7387781478351643, + "grad_norm": 0.6516660451889038, + "learning_rate": 8.386040086497238e-06, + "loss": 0.1223, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 17490, + "tokens_per_second_per_gpu": 335.11 + }, + { + "epoch": 1.7397723318586271, + "grad_norm": 0.4572422504425049, + "learning_rate": 8.323239578289754e-06, + "loss": 0.1306, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 17500, + "tokens_per_second_per_gpu": 387.29 + }, + { + "epoch": 1.7407665158820897, + "grad_norm": 0.5288609862327576, + "learning_rate": 8.260664889319502e-06, + "loss": 0.1567, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 17510, + "tokens_per_second_per_gpu": 339.2 + }, + { + "epoch": 1.7417606999055524, + "grad_norm": 0.6885465383529663, + "learning_rate": 8.198316173721199e-06, + "loss": 0.1687, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 17520, + "tokens_per_second_per_gpu": 350.42 + }, + { + "epoch": 1.7427548839290152, + "grad_norm": 0.4652736485004425, + "learning_rate": 8.136193585072871e-06, + "loss": 0.1494, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 17530, + "tokens_per_second_per_gpu": 329.48 + }, + { + "epoch": 1.743749067952478, + "grad_norm": 0.2998766303062439, + "learning_rate": 8.074297276395592e-06, + "loss": 0.1341, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 17540, + "tokens_per_second_per_gpu": 343.21 + }, + { + "epoch": 1.7447432519759407, + "grad_norm": 0.5339378118515015, + "learning_rate": 8.012627400153073e-06, + "loss": 0.1889, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 17550, + "tokens_per_second_per_gpu": 366.33 + }, + { + "epoch": 1.7457374359994033, + "grad_norm": 0.2314610630273819, + "learning_rate": 7.951184108251242e-06, + "loss": 0.1306, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 17560, + "tokens_per_second_per_gpu": 265.9 + }, + { + "epoch": 1.7467316200228662, + "grad_norm": 0.33561253547668457, + "learning_rate": 7.889967552037913e-06, + "loss": 0.1782, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.69, + "memory/max_allocated (GiB)": 19.69, + "step": 17570, + "tokens_per_second_per_gpu": 361.12 + }, + { + "epoch": 1.747725804046329, + "grad_norm": 0.4189736843109131, + "learning_rate": 7.828977882302413e-06, + "loss": 0.1693, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 17580, + "tokens_per_second_per_gpu": 394.43 + }, + { + "epoch": 1.7487199880697917, + "grad_norm": 0.3725840747356415, + "learning_rate": 7.768215249275168e-06, + "loss": 0.1826, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 17590, + "tokens_per_second_per_gpu": 345.73 + }, + { + "epoch": 1.7497141720932543, + "grad_norm": 0.40016722679138184, + "learning_rate": 7.707679802627399e-06, + "loss": 0.1428, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 17600, + "tokens_per_second_per_gpu": 304.16 + }, + { + "epoch": 1.7507083561167172, + "grad_norm": 0.274614542722702, + "learning_rate": 7.647371691470706e-06, + "loss": 0.0966, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.68, + "memory/max_allocated (GiB)": 19.68, + "step": 17610, + "tokens_per_second_per_gpu": 348.33 + }, + { + "epoch": 1.75170254014018, + "grad_norm": 0.5691856145858765, + "learning_rate": 7.587291064356716e-06, + "loss": 0.1495, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 17620, + "tokens_per_second_per_gpu": 285.38 + }, + { + "epoch": 1.7526967241636426, + "grad_norm": 0.529352068901062, + "learning_rate": 7.5274380692766825e-06, + "loss": 0.1387, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 17630, + "tokens_per_second_per_gpu": 328.86 + }, + { + "epoch": 1.7536909081871053, + "grad_norm": 0.4417911171913147, + "learning_rate": 7.467812853661216e-06, + "loss": 0.136, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 17640, + "tokens_per_second_per_gpu": 350.36 + }, + { + "epoch": 1.7546850922105683, + "grad_norm": 0.45288828015327454, + "learning_rate": 7.4084155643798335e-06, + "loss": 0.1326, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.4, + "memory/max_allocated (GiB)": 21.4, + "step": 17650, + "tokens_per_second_per_gpu": 329.09 + }, + { + "epoch": 1.755679276234031, + "grad_norm": 0.4209536015987396, + "learning_rate": 7.349246347740568e-06, + "loss": 0.179, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.58, + "memory/max_allocated (GiB)": 20.58, + "step": 17660, + "tokens_per_second_per_gpu": 426.76 + }, + { + "epoch": 1.7566734602574936, + "grad_norm": 0.49393320083618164, + "learning_rate": 7.290305349489734e-06, + "loss": 0.2134, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 17670, + "tokens_per_second_per_gpu": 411.57 + }, + { + "epoch": 1.7576676442809565, + "grad_norm": 0.6331411600112915, + "learning_rate": 7.2315927148114635e-06, + "loss": 0.1676, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 17680, + "tokens_per_second_per_gpu": 306.06 + }, + { + "epoch": 1.7586618283044193, + "grad_norm": 0.383672297000885, + "learning_rate": 7.173108588327415e-06, + "loss": 0.14, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 17690, + "tokens_per_second_per_gpu": 342.56 + }, + { + "epoch": 1.759656012327882, + "grad_norm": 0.4192826449871063, + "learning_rate": 7.1148531140962986e-06, + "loss": 0.1358, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.85, + "memory/max_allocated (GiB)": 18.85, + "step": 17700, + "tokens_per_second_per_gpu": 342.72 + }, + { + "epoch": 1.7606501963513446, + "grad_norm": 0.3609708845615387, + "learning_rate": 7.056826435613706e-06, + "loss": 0.1483, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 17710, + "tokens_per_second_per_gpu": 378.78 + }, + { + "epoch": 1.7616443803748074, + "grad_norm": 0.535681962966919, + "learning_rate": 6.999028695811572e-06, + "loss": 0.1457, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 17720, + "tokens_per_second_per_gpu": 315.36 + }, + { + "epoch": 1.7626385643982703, + "grad_norm": 0.7409544587135315, + "learning_rate": 6.941460037057979e-06, + "loss": 0.187, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 17730, + "tokens_per_second_per_gpu": 339.48 + }, + { + "epoch": 1.763632748421733, + "grad_norm": 0.4647164046764374, + "learning_rate": 6.8841206011566625e-06, + "loss": 0.1178, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 17740, + "tokens_per_second_per_gpu": 318.67 + }, + { + "epoch": 1.7646269324451955, + "grad_norm": 0.37413302063941956, + "learning_rate": 6.827010529346822e-06, + "loss": 0.1156, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 17750, + "tokens_per_second_per_gpu": 361.82 + }, + { + "epoch": 1.7656211164686584, + "grad_norm": 0.7163899540901184, + "learning_rate": 6.7701299623025846e-06, + "loss": 0.1514, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 17760, + "tokens_per_second_per_gpu": 346.76 + }, + { + "epoch": 1.7666153004921212, + "grad_norm": 0.47301217913627625, + "learning_rate": 6.713479040132841e-06, + "loss": 0.1582, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 17770, + "tokens_per_second_per_gpu": 346.61 + }, + { + "epoch": 1.7676094845155839, + "grad_norm": 0.4980860948562622, + "learning_rate": 6.657057902380792e-06, + "loss": 0.1119, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 17780, + "tokens_per_second_per_gpu": 302.2 + }, + { + "epoch": 1.7686036685390465, + "grad_norm": 0.6229726076126099, + "learning_rate": 6.600866688023588e-06, + "loss": 0.1737, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 17790, + "tokens_per_second_per_gpu": 408.97 + }, + { + "epoch": 1.7695978525625093, + "grad_norm": 0.5228248834609985, + "learning_rate": 6.5449055354721125e-06, + "loss": 0.1809, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 17800, + "tokens_per_second_per_gpu": 379.16 + }, + { + "epoch": 1.7705920365859722, + "grad_norm": 0.4032490849494934, + "learning_rate": 6.489174582570467e-06, + "loss": 0.147, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 17810, + "tokens_per_second_per_gpu": 378.57 + }, + { + "epoch": 1.7715862206094348, + "grad_norm": 0.44193387031555176, + "learning_rate": 6.433673966595788e-06, + "loss": 0.1556, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 17820, + "tokens_per_second_per_gpu": 330.37 + }, + { + "epoch": 1.7725804046328975, + "grad_norm": 0.41534683108329773, + "learning_rate": 6.3784038242578285e-06, + "loss": 0.1261, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 17830, + "tokens_per_second_per_gpu": 317.12 + }, + { + "epoch": 1.7735745886563603, + "grad_norm": 0.7784578800201416, + "learning_rate": 6.323364291698642e-06, + "loss": 0.1922, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 17840, + "tokens_per_second_per_gpu": 342.84 + }, + { + "epoch": 1.7745687726798232, + "grad_norm": 0.7484685778617859, + "learning_rate": 6.2685555044921905e-06, + "loss": 0.1375, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 17850, + "tokens_per_second_per_gpu": 351.93 + }, + { + "epoch": 1.7755629567032858, + "grad_norm": 0.5791344046592712, + "learning_rate": 6.213977597644138e-06, + "loss": 0.1087, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 17860, + "tokens_per_second_per_gpu": 337.95 + }, + { + "epoch": 1.7765571407267484, + "grad_norm": 0.22857262194156647, + "learning_rate": 6.159630705591379e-06, + "loss": 0.1257, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 17870, + "tokens_per_second_per_gpu": 365.98 + }, + { + "epoch": 1.7775513247502113, + "grad_norm": 0.43469542264938354, + "learning_rate": 6.10551496220183e-06, + "loss": 0.1652, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 17880, + "tokens_per_second_per_gpu": 371.12 + }, + { + "epoch": 1.7785455087736741, + "grad_norm": 0.7687386274337769, + "learning_rate": 6.0516305007739525e-06, + "loss": 0.1787, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 17890, + "tokens_per_second_per_gpu": 369.52 + }, + { + "epoch": 1.7795396927971368, + "grad_norm": 0.3083856999874115, + "learning_rate": 5.997977454036608e-06, + "loss": 0.1817, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 17900, + "tokens_per_second_per_gpu": 342.94 + }, + { + "epoch": 1.7805338768205994, + "grad_norm": 0.3785597085952759, + "learning_rate": 5.944555954148579e-06, + "loss": 0.1563, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 17910, + "tokens_per_second_per_gpu": 396.7 + }, + { + "epoch": 1.7815280608440622, + "grad_norm": 0.5601521730422974, + "learning_rate": 5.891366132698295e-06, + "loss": 0.1876, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 17920, + "tokens_per_second_per_gpu": 357.52 + }, + { + "epoch": 1.782522244867525, + "grad_norm": 0.6354232430458069, + "learning_rate": 5.838408120703554e-06, + "loss": 0.1495, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 17930, + "tokens_per_second_per_gpu": 367.19 + }, + { + "epoch": 1.7835164288909877, + "grad_norm": 0.6532102227210999, + "learning_rate": 5.785682048611097e-06, + "loss": 0.172, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 17940, + "tokens_per_second_per_gpu": 391.52 + }, + { + "epoch": 1.7845106129144503, + "grad_norm": 0.4626152515411377, + "learning_rate": 5.733188046296423e-06, + "loss": 0.1516, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 17950, + "tokens_per_second_per_gpu": 343.51 + }, + { + "epoch": 1.7855047969379132, + "grad_norm": 0.3500106930732727, + "learning_rate": 5.680926243063322e-06, + "loss": 0.1487, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 17960, + "tokens_per_second_per_gpu": 395.4 + }, + { + "epoch": 1.786498980961376, + "grad_norm": 0.5653957724571228, + "learning_rate": 5.628896767643677e-06, + "loss": 0.1401, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 17970, + "tokens_per_second_per_gpu": 372.67 + }, + { + "epoch": 1.7874931649848387, + "grad_norm": 0.4583251476287842, + "learning_rate": 5.577099748197079e-06, + "loss": 0.1497, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 17980, + "tokens_per_second_per_gpu": 319.79 + }, + { + "epoch": 1.7884873490083013, + "grad_norm": 0.7779932022094727, + "learning_rate": 5.525535312310559e-06, + "loss": 0.1878, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.23, + "memory/max_allocated (GiB)": 19.23, + "step": 17990, + "tokens_per_second_per_gpu": 330.6 + }, + { + "epoch": 1.7894815330317642, + "grad_norm": 0.5615857243537903, + "learning_rate": 5.4742035869981726e-06, + "loss": 0.0951, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 18000, + "tokens_per_second_per_gpu": 310.54 + }, + { + "epoch": 1.790475717055227, + "grad_norm": 0.6377988457679749, + "learning_rate": 5.423104698700853e-06, + "loss": 0.15, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.81, + "memory/max_allocated (GiB)": 19.81, + "step": 18010, + "tokens_per_second_per_gpu": 340.19 + }, + { + "epoch": 1.7914699010786896, + "grad_norm": 0.6940702199935913, + "learning_rate": 5.372238773285931e-06, + "loss": 0.1748, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 18020, + "tokens_per_second_per_gpu": 323.58 + }, + { + "epoch": 1.7924640851021523, + "grad_norm": 0.558322548866272, + "learning_rate": 5.321605936046947e-06, + "loss": 0.1468, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 18030, + "tokens_per_second_per_gpu": 305.22 + }, + { + "epoch": 1.7934582691256151, + "grad_norm": 0.3536572754383087, + "learning_rate": 5.271206311703281e-06, + "loss": 0.1207, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.19, + "memory/max_allocated (GiB)": 19.19, + "step": 18040, + "tokens_per_second_per_gpu": 336.32 + }, + { + "epoch": 1.794452453149078, + "grad_norm": 0.5200352668762207, + "learning_rate": 5.221040024399848e-06, + "loss": 0.1664, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 18050, + "tokens_per_second_per_gpu": 281.71 + }, + { + "epoch": 1.7954466371725406, + "grad_norm": 0.3343510329723358, + "learning_rate": 5.171107197706837e-06, + "loss": 0.1725, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 18060, + "tokens_per_second_per_gpu": 345.94 + }, + { + "epoch": 1.7964408211960032, + "grad_norm": 0.4875069856643677, + "learning_rate": 5.121407954619339e-06, + "loss": 0.1521, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.81, + "memory/max_allocated (GiB)": 19.81, + "step": 18070, + "tokens_per_second_per_gpu": 368.58 + }, + { + "epoch": 1.797435005219466, + "grad_norm": 0.524474024772644, + "learning_rate": 5.071942417557096e-06, + "loss": 0.1604, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 18080, + "tokens_per_second_per_gpu": 386.02 + }, + { + "epoch": 1.798429189242929, + "grad_norm": 0.3819403052330017, + "learning_rate": 5.0227107083641756e-06, + "loss": 0.1682, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 18090, + "tokens_per_second_per_gpu": 397.67 + }, + { + "epoch": 1.7994233732663916, + "grad_norm": 0.4870153069496155, + "learning_rate": 4.9737129483086845e-06, + "loss": 0.1726, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 18100, + "tokens_per_second_per_gpu": 409.71 + }, + { + "epoch": 1.8004175572898542, + "grad_norm": 0.48295846581459045, + "learning_rate": 4.924949258082468e-06, + "loss": 0.1807, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 18110, + "tokens_per_second_per_gpu": 386.2 + }, + { + "epoch": 1.801411741313317, + "grad_norm": 0.43393832445144653, + "learning_rate": 4.8764197578008095e-06, + "loss": 0.1368, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 18120, + "tokens_per_second_per_gpu": 348.61 + }, + { + "epoch": 1.80240592533678, + "grad_norm": 0.4767451584339142, + "learning_rate": 4.828124567002113e-06, + "loss": 0.141, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 18130, + "tokens_per_second_per_gpu": 333.81 + }, + { + "epoch": 1.8034001093602425, + "grad_norm": 0.4560125768184662, + "learning_rate": 4.780063804647639e-06, + "loss": 0.1356, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 18140, + "tokens_per_second_per_gpu": 331.92 + }, + { + "epoch": 1.8043942933837052, + "grad_norm": 0.23658177256584167, + "learning_rate": 4.732237589121202e-06, + "loss": 0.1273, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 18150, + "tokens_per_second_per_gpu": 289.7 + }, + { + "epoch": 1.805388477407168, + "grad_norm": 0.310188353061676, + "learning_rate": 4.684646038228891e-06, + "loss": 0.1614, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 18160, + "tokens_per_second_per_gpu": 406.62 + }, + { + "epoch": 1.8063826614306309, + "grad_norm": 0.49459826946258545, + "learning_rate": 4.6372892691987525e-06, + "loss": 0.1765, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.67, + "memory/max_allocated (GiB)": 19.67, + "step": 18170, + "tokens_per_second_per_gpu": 340.47 + }, + { + "epoch": 1.8073768454540935, + "grad_norm": 0.6684255599975586, + "learning_rate": 4.5901673986804896e-06, + "loss": 0.1767, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 18180, + "tokens_per_second_per_gpu": 361.86 + }, + { + "epoch": 1.8083710294775563, + "grad_norm": 0.46647799015045166, + "learning_rate": 4.5432805427452765e-06, + "loss": 0.131, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 18190, + "tokens_per_second_per_gpu": 337.2 + }, + { + "epoch": 1.8093652135010192, + "grad_norm": 0.5966955423355103, + "learning_rate": 4.496628816885318e-06, + "loss": 0.145, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.45, + "memory/max_allocated (GiB)": 20.45, + "step": 18200, + "tokens_per_second_per_gpu": 401.16 + }, + { + "epoch": 1.8103593975244818, + "grad_norm": 0.37769100069999695, + "learning_rate": 4.450212336013681e-06, + "loss": 0.1557, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 18210, + "tokens_per_second_per_gpu": 335.53 + }, + { + "epoch": 1.8113535815479445, + "grad_norm": 0.4889051616191864, + "learning_rate": 4.404031214463966e-06, + "loss": 0.1208, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 18220, + "tokens_per_second_per_gpu": 339.5 + }, + { + "epoch": 1.8123477655714073, + "grad_norm": 0.4089222848415375, + "learning_rate": 4.358085565990044e-06, + "loss": 0.1608, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.64, + "memory/max_allocated (GiB)": 20.64, + "step": 18230, + "tokens_per_second_per_gpu": 362.98 + }, + { + "epoch": 1.8133419495948702, + "grad_norm": 0.7150986194610596, + "learning_rate": 4.312375503765742e-06, + "loss": 0.1807, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 18240, + "tokens_per_second_per_gpu": 348.29 + }, + { + "epoch": 1.8143361336183328, + "grad_norm": 0.5326732397079468, + "learning_rate": 4.266901140384616e-06, + "loss": 0.145, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.4, + "memory/max_allocated (GiB)": 21.4, + "step": 18250, + "tokens_per_second_per_gpu": 346.23 + }, + { + "epoch": 1.8153303176417954, + "grad_norm": 0.5794299840927124, + "learning_rate": 4.221662587859631e-06, + "loss": 0.1534, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 18260, + "tokens_per_second_per_gpu": 334.07 + }, + { + "epoch": 1.8163245016652583, + "grad_norm": 0.4839019477367401, + "learning_rate": 4.1766599576229195e-06, + "loss": 0.189, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 18270, + "tokens_per_second_per_gpu": 345.54 + }, + { + "epoch": 1.8173186856887211, + "grad_norm": 0.5688499212265015, + "learning_rate": 4.131893360525452e-06, + "loss": 0.1432, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.84, + "memory/max_allocated (GiB)": 18.84, + "step": 18280, + "tokens_per_second_per_gpu": 337.5 + }, + { + "epoch": 1.8183128697121838, + "grad_norm": 0.5413910150527954, + "learning_rate": 4.087362906836812e-06, + "loss": 0.1351, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 18290, + "tokens_per_second_per_gpu": 383.78 + }, + { + "epoch": 1.8193070537356464, + "grad_norm": 0.4739817678928375, + "learning_rate": 4.043068706244957e-06, + "loss": 0.1253, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 18300, + "tokens_per_second_per_gpu": 356.75 + }, + { + "epoch": 1.8203012377591092, + "grad_norm": 0.34818801283836365, + "learning_rate": 3.999010867855812e-06, + "loss": 0.1211, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 18310, + "tokens_per_second_per_gpu": 319.55 + }, + { + "epoch": 1.821295421782572, + "grad_norm": 0.692557692527771, + "learning_rate": 3.955189500193191e-06, + "loss": 0.1494, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.39, + "memory/max_allocated (GiB)": 21.39, + "step": 18320, + "tokens_per_second_per_gpu": 378.18 + }, + { + "epoch": 1.8222896058060347, + "grad_norm": 0.713403582572937, + "learning_rate": 3.911604711198358e-06, + "loss": 0.1781, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 18330, + "tokens_per_second_per_gpu": 376.29 + }, + { + "epoch": 1.8232837898294973, + "grad_norm": 0.5273980498313904, + "learning_rate": 3.8682566082298695e-06, + "loss": 0.1802, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.84, + "memory/max_allocated (GiB)": 18.84, + "step": 18340, + "tokens_per_second_per_gpu": 353.26 + }, + { + "epoch": 1.8242779738529602, + "grad_norm": 0.4756382703781128, + "learning_rate": 3.825145298063249e-06, + "loss": 0.1553, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.64, + "memory/max_allocated (GiB)": 20.64, + "step": 18350, + "tokens_per_second_per_gpu": 343.84 + }, + { + "epoch": 1.825272157876423, + "grad_norm": 0.27914801239967346, + "learning_rate": 3.782270886890793e-06, + "loss": 0.1831, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.37, + "memory/max_allocated (GiB)": 21.37, + "step": 18360, + "tokens_per_second_per_gpu": 341.0 + }, + { + "epoch": 1.8262663418998857, + "grad_norm": 0.6465624570846558, + "learning_rate": 3.739633480321214e-06, + "loss": 0.2023, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 18370, + "tokens_per_second_per_gpu": 395.46 + }, + { + "epoch": 1.8272605259233483, + "grad_norm": 0.5615578293800354, + "learning_rate": 3.697233183379467e-06, + "loss": 0.189, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 18380, + "tokens_per_second_per_gpu": 345.47 + }, + { + "epoch": 1.8282547099468112, + "grad_norm": 0.5241686701774597, + "learning_rate": 3.6550701005064413e-06, + "loss": 0.1616, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 18390, + "tokens_per_second_per_gpu": 381.72 + }, + { + "epoch": 1.829248893970274, + "grad_norm": 0.37639015913009644, + "learning_rate": 3.613144335558738e-06, + "loss": 0.1789, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.85, + "memory/max_allocated (GiB)": 18.85, + "step": 18400, + "tokens_per_second_per_gpu": 353.78 + }, + { + "epoch": 1.8302430779937366, + "grad_norm": 0.3994421064853668, + "learning_rate": 3.571455991808348e-06, + "loss": 0.1619, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.77, + "memory/max_allocated (GiB)": 20.77, + "step": 18410, + "tokens_per_second_per_gpu": 387.6 + }, + { + "epoch": 1.8312372620171993, + "grad_norm": 0.6425352692604065, + "learning_rate": 3.5300051719424854e-06, + "loss": 0.137, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.23, + "memory/max_allocated (GiB)": 19.23, + "step": 18420, + "tokens_per_second_per_gpu": 346.83 + }, + { + "epoch": 1.8322314460406621, + "grad_norm": 0.4775727391242981, + "learning_rate": 3.4887919780632995e-06, + "loss": 0.1569, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 18430, + "tokens_per_second_per_gpu": 349.11 + }, + { + "epoch": 1.833225630064125, + "grad_norm": 0.37590330839157104, + "learning_rate": 3.4478165116875626e-06, + "loss": 0.1745, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 18440, + "tokens_per_second_per_gpu": 426.6 + }, + { + "epoch": 1.8342198140875876, + "grad_norm": 0.47229719161987305, + "learning_rate": 3.4070788737465497e-06, + "loss": 0.1438, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 18450, + "tokens_per_second_per_gpu": 354.96 + }, + { + "epoch": 1.8352139981110502, + "grad_norm": 0.5609657764434814, + "learning_rate": 3.3665791645856258e-06, + "loss": 0.151, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 18460, + "tokens_per_second_per_gpu": 349.0 + }, + { + "epoch": 1.836208182134513, + "grad_norm": 0.5257097482681274, + "learning_rate": 3.326317483964181e-06, + "loss": 0.1469, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.39, + "memory/max_allocated (GiB)": 21.39, + "step": 18470, + "tokens_per_second_per_gpu": 425.66 + }, + { + "epoch": 1.837202366157976, + "grad_norm": 0.9030627608299255, + "learning_rate": 3.2862939310552065e-06, + "loss": 0.1525, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 18480, + "tokens_per_second_per_gpu": 370.97 + }, + { + "epoch": 1.8381965501814386, + "grad_norm": 0.3405112326145172, + "learning_rate": 3.2465086044451976e-06, + "loss": 0.1649, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 18490, + "tokens_per_second_per_gpu": 368.6 + }, + { + "epoch": 1.8391907342049012, + "grad_norm": 0.3417605459690094, + "learning_rate": 3.206961602133807e-06, + "loss": 0.1433, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 18500, + "tokens_per_second_per_gpu": 373.55 + }, + { + "epoch": 1.840184918228364, + "grad_norm": 0.6065083742141724, + "learning_rate": 3.1676530215336675e-06, + "loss": 0.1569, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.4, + "memory/max_allocated (GiB)": 21.4, + "step": 18510, + "tokens_per_second_per_gpu": 327.3 + }, + { + "epoch": 1.841179102251827, + "grad_norm": 0.43219438195228577, + "learning_rate": 3.1285829594701165e-06, + "loss": 0.1324, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.86, + "memory/max_allocated (GiB)": 18.86, + "step": 18520, + "tokens_per_second_per_gpu": 301.88 + }, + { + "epoch": 1.8421732862752895, + "grad_norm": 0.58506840467453, + "learning_rate": 3.089751512180972e-06, + "loss": 0.1842, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 18530, + "tokens_per_second_per_gpu": 414.46 + }, + { + "epoch": 1.8431674702987522, + "grad_norm": 0.4577626883983612, + "learning_rate": 3.0511587753163094e-06, + "loss": 0.1406, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 18540, + "tokens_per_second_per_gpu": 423.64 + }, + { + "epoch": 1.844161654322215, + "grad_norm": 0.6649712324142456, + "learning_rate": 3.0128048439381886e-06, + "loss": 0.1355, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.23, + "memory/max_allocated (GiB)": 19.23, + "step": 18550, + "tokens_per_second_per_gpu": 401.52 + }, + { + "epoch": 1.8451558383456779, + "grad_norm": 0.3550589084625244, + "learning_rate": 2.974689812520448e-06, + "loss": 0.1429, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 18560, + "tokens_per_second_per_gpu": 331.54 + }, + { + "epoch": 1.8461500223691405, + "grad_norm": 0.5486629605293274, + "learning_rate": 2.9368137749484547e-06, + "loss": 0.185, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 18570, + "tokens_per_second_per_gpu": 371.95 + }, + { + "epoch": 1.8471442063926031, + "grad_norm": 0.45112037658691406, + "learning_rate": 2.8991768245189232e-06, + "loss": 0.1472, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 18580, + "tokens_per_second_per_gpu": 388.42 + }, + { + "epoch": 1.848138390416066, + "grad_norm": 0.40423816442489624, + "learning_rate": 2.861779053939595e-06, + "loss": 0.1218, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.85, + "memory/max_allocated (GiB)": 18.85, + "step": 18590, + "tokens_per_second_per_gpu": 357.26 + }, + { + "epoch": 1.8491325744395288, + "grad_norm": 0.49648118019104004, + "learning_rate": 2.824620555329094e-06, + "loss": 0.181, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 18600, + "tokens_per_second_per_gpu": 349.07 + }, + { + "epoch": 1.8501267584629915, + "grad_norm": 0.4115513265132904, + "learning_rate": 2.7877014202166372e-06, + "loss": 0.1188, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 18610, + "tokens_per_second_per_gpu": 344.58 + }, + { + "epoch": 1.851120942486454, + "grad_norm": 0.5676191449165344, + "learning_rate": 2.7510217395418815e-06, + "loss": 0.127, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.4, + "memory/max_allocated (GiB)": 21.4, + "step": 18620, + "tokens_per_second_per_gpu": 435.5 + }, + { + "epoch": 1.852115126509917, + "grad_norm": 0.5204625725746155, + "learning_rate": 2.714581603654609e-06, + "loss": 0.14, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 18630, + "tokens_per_second_per_gpu": 374.45 + }, + { + "epoch": 1.8531093105333798, + "grad_norm": 0.5703979134559631, + "learning_rate": 2.6783811023145977e-06, + "loss": 0.1432, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 18640, + "tokens_per_second_per_gpu": 393.22 + }, + { + "epoch": 1.8541034945568424, + "grad_norm": 1.1127846240997314, + "learning_rate": 2.6424203246913194e-06, + "loss": 0.1862, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.53, + "memory/max_allocated (GiB)": 21.53, + "step": 18650, + "tokens_per_second_per_gpu": 385.6 + }, + { + "epoch": 1.855097678580305, + "grad_norm": 0.37595993280410767, + "learning_rate": 2.6066993593637844e-06, + "loss": 0.1694, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 18660, + "tokens_per_second_per_gpu": 387.27 + }, + { + "epoch": 1.856091862603768, + "grad_norm": 0.35933300852775574, + "learning_rate": 2.571218294320266e-06, + "loss": 0.1011, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 18670, + "tokens_per_second_per_gpu": 354.13 + }, + { + "epoch": 1.8570860466272308, + "grad_norm": 0.5620598196983337, + "learning_rate": 2.5359772169581297e-06, + "loss": 0.1991, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.7, + "memory/max_allocated (GiB)": 19.7, + "step": 18680, + "tokens_per_second_per_gpu": 389.06 + }, + { + "epoch": 1.8580802306506934, + "grad_norm": 0.29674777388572693, + "learning_rate": 2.5009762140835947e-06, + "loss": 0.1497, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 18690, + "tokens_per_second_per_gpu": 373.51 + }, + { + "epoch": 1.8590744146741562, + "grad_norm": 0.4018515348434448, + "learning_rate": 2.4662153719115398e-06, + "loss": 0.1743, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 18700, + "tokens_per_second_per_gpu": 373.54 + }, + { + "epoch": 1.860068598697619, + "grad_norm": 0.5389109253883362, + "learning_rate": 2.431694776065263e-06, + "loss": 0.1169, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.53, + "memory/max_allocated (GiB)": 20.53, + "step": 18710, + "tokens_per_second_per_gpu": 340.4 + }, + { + "epoch": 1.8610627827210817, + "grad_norm": 0.526465654373169, + "learning_rate": 2.397414511576268e-06, + "loss": 0.1841, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 18720, + "tokens_per_second_per_gpu": 348.59 + }, + { + "epoch": 1.8620569667445444, + "grad_norm": 0.3093705177307129, + "learning_rate": 2.3633746628841325e-06, + "loss": 0.1741, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 18730, + "tokens_per_second_per_gpu": 313.63 + }, + { + "epoch": 1.8630511507680072, + "grad_norm": 0.9190280437469482, + "learning_rate": 2.329575313836152e-06, + "loss": 0.2199, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 18740, + "tokens_per_second_per_gpu": 400.19 + }, + { + "epoch": 1.86404533479147, + "grad_norm": 0.5890634655952454, + "learning_rate": 2.2960165476873076e-06, + "loss": 0.133, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 18750, + "tokens_per_second_per_gpu": 344.18 + }, + { + "epoch": 1.8650395188149327, + "grad_norm": 0.7027098536491394, + "learning_rate": 2.262698447099898e-06, + "loss": 0.1459, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 18760, + "tokens_per_second_per_gpu": 288.79 + }, + { + "epoch": 1.8660337028383953, + "grad_norm": 0.7816129922866821, + "learning_rate": 2.2296210941434745e-06, + "loss": 0.1699, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 18770, + "tokens_per_second_per_gpu": 369.86 + }, + { + "epoch": 1.8670278868618582, + "grad_norm": 0.4923355281352997, + "learning_rate": 2.19678457029453e-06, + "loss": 0.1324, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 18780, + "tokens_per_second_per_gpu": 316.49 + }, + { + "epoch": 1.868022070885321, + "grad_norm": 0.27154669165611267, + "learning_rate": 2.164188956436386e-06, + "loss": 0.126, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 18790, + "tokens_per_second_per_gpu": 363.75 + }, + { + "epoch": 1.8690162549087836, + "grad_norm": 0.5623666644096375, + "learning_rate": 2.1318343328588953e-06, + "loss": 0.161, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 18800, + "tokens_per_second_per_gpu": 332.31 + }, + { + "epoch": 1.8700104389322463, + "grad_norm": 0.5379459261894226, + "learning_rate": 2.099720779258352e-06, + "loss": 0.1738, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 18810, + "tokens_per_second_per_gpu": 410.92 + }, + { + "epoch": 1.8710046229557091, + "grad_norm": 0.37512362003326416, + "learning_rate": 2.0678483747372247e-06, + "loss": 0.1164, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 18820, + "tokens_per_second_per_gpu": 351.41 + }, + { + "epoch": 1.871998806979172, + "grad_norm": 0.6244227886199951, + "learning_rate": 2.03621719780398e-06, + "loss": 0.1444, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 18830, + "tokens_per_second_per_gpu": 364.33 + }, + { + "epoch": 1.8729929910026346, + "grad_norm": 0.3308027386665344, + "learning_rate": 2.0048273263729046e-06, + "loss": 0.1737, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 18840, + "tokens_per_second_per_gpu": 346.54 + }, + { + "epoch": 1.8739871750260972, + "grad_norm": 0.2430552840232849, + "learning_rate": 1.9736788377638705e-06, + "loss": 0.1432, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 18850, + "tokens_per_second_per_gpu": 310.33 + }, + { + "epoch": 1.87498135904956, + "grad_norm": 0.4946417212486267, + "learning_rate": 1.942771808702204e-06, + "loss": 0.1859, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 18860, + "tokens_per_second_per_gpu": 314.75 + }, + { + "epoch": 1.875975543073023, + "grad_norm": 0.5932151079177856, + "learning_rate": 1.9121063153184293e-06, + "loss": 0.1639, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 18870, + "tokens_per_second_per_gpu": 378.07 + }, + { + "epoch": 1.8769697270964856, + "grad_norm": 0.6450536847114563, + "learning_rate": 1.8816824331481575e-06, + "loss": 0.1325, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 18880, + "tokens_per_second_per_gpu": 353.93 + }, + { + "epoch": 1.8779639111199482, + "grad_norm": 0.498829185962677, + "learning_rate": 1.8515002371318312e-06, + "loss": 0.1227, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 18890, + "tokens_per_second_per_gpu": 357.94 + }, + { + "epoch": 1.878958095143411, + "grad_norm": 0.5611813068389893, + "learning_rate": 1.8215598016145807e-06, + "loss": 0.202, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.11, + "memory/max_allocated (GiB)": 19.11, + "step": 18900, + "tokens_per_second_per_gpu": 402.84 + }, + { + "epoch": 1.879952279166874, + "grad_norm": 0.42488303780555725, + "learning_rate": 1.7918612003460234e-06, + "loss": 0.1108, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 18910, + "tokens_per_second_per_gpu": 336.67 + }, + { + "epoch": 1.8809464631903365, + "grad_norm": 0.7830858826637268, + "learning_rate": 1.7624045064800975e-06, + "loss": 0.1591, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.45, + "memory/max_allocated (GiB)": 20.45, + "step": 18920, + "tokens_per_second_per_gpu": 323.72 + }, + { + "epoch": 1.8819406472137992, + "grad_norm": 0.4499165415763855, + "learning_rate": 1.7331897925748518e-06, + "loss": 0.1812, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 18930, + "tokens_per_second_per_gpu": 380.35 + }, + { + "epoch": 1.882934831237262, + "grad_norm": 0.6066138744354248, + "learning_rate": 1.7042171305923115e-06, + "loss": 0.0976, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 18940, + "tokens_per_second_per_gpu": 327.67 + }, + { + "epoch": 1.8839290152607249, + "grad_norm": 0.30252575874328613, + "learning_rate": 1.6754865918982677e-06, + "loss": 0.1591, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 18950, + "tokens_per_second_per_gpu": 301.08 + }, + { + "epoch": 1.8849231992841875, + "grad_norm": 0.575855016708374, + "learning_rate": 1.6469982472621103e-06, + "loss": 0.146, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 18960, + "tokens_per_second_per_gpu": 341.39 + }, + { + "epoch": 1.8859173833076501, + "grad_norm": 0.7902649641036987, + "learning_rate": 1.6187521668566518e-06, + "loss": 0.1789, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 18970, + "tokens_per_second_per_gpu": 370.72 + }, + { + "epoch": 1.886911567331113, + "grad_norm": 0.6112450957298279, + "learning_rate": 1.5907484202579482e-06, + "loss": 0.1358, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.58, + "memory/max_allocated (GiB)": 20.58, + "step": 18980, + "tokens_per_second_per_gpu": 340.31 + }, + { + "epoch": 1.8879057513545758, + "grad_norm": 0.3243188261985779, + "learning_rate": 1.562987076445177e-06, + "loss": 0.1604, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.54, + "memory/max_allocated (GiB)": 21.54, + "step": 18990, + "tokens_per_second_per_gpu": 383.91 + }, + { + "epoch": 1.8888999353780385, + "grad_norm": 0.3223695158958435, + "learning_rate": 1.53546820380035e-06, + "loss": 0.1408, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 19000, + "tokens_per_second_per_gpu": 346.51 + }, + { + "epoch": 1.889894119401501, + "grad_norm": 0.49110448360443115, + "learning_rate": 1.508191870108311e-06, + "loss": 0.1379, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 19010, + "tokens_per_second_per_gpu": 399.45 + }, + { + "epoch": 1.890888303424964, + "grad_norm": 0.3531555235385895, + "learning_rate": 1.4811581425563936e-06, + "loss": 0.1755, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 19020, + "tokens_per_second_per_gpu": 360.95 + }, + { + "epoch": 1.8918824874484268, + "grad_norm": 0.3664921522140503, + "learning_rate": 1.4543670877344207e-06, + "loss": 0.1661, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 19030, + "tokens_per_second_per_gpu": 397.9 + }, + { + "epoch": 1.8928766714718894, + "grad_norm": 0.6378931403160095, + "learning_rate": 1.4278187716344039e-06, + "loss": 0.174, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 19040, + "tokens_per_second_per_gpu": 417.45 + }, + { + "epoch": 1.893870855495352, + "grad_norm": 0.1918243020772934, + "learning_rate": 1.4015132596504554e-06, + "loss": 0.1896, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 19050, + "tokens_per_second_per_gpu": 407.04 + }, + { + "epoch": 1.894865039518815, + "grad_norm": 0.5390235781669617, + "learning_rate": 1.3754506165786108e-06, + "loss": 0.1789, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.77, + "memory/max_allocated (GiB)": 19.77, + "step": 19060, + "tokens_per_second_per_gpu": 441.99 + }, + { + "epoch": 1.8958592235422778, + "grad_norm": 0.5331469178199768, + "learning_rate": 1.3496309066166724e-06, + "loss": 0.145, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 19070, + "tokens_per_second_per_gpu": 392.93 + }, + { + "epoch": 1.8968534075657404, + "grad_norm": 0.48252302408218384, + "learning_rate": 1.3240541933640439e-06, + "loss": 0.159, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 19080, + "tokens_per_second_per_gpu": 330.47 + }, + { + "epoch": 1.897847591589203, + "grad_norm": 0.5958520174026489, + "learning_rate": 1.298720539821563e-06, + "loss": 0.1252, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 19090, + "tokens_per_second_per_gpu": 325.74 + }, + { + "epoch": 1.8988417756126659, + "grad_norm": 0.675596296787262, + "learning_rate": 1.273630008391402e-06, + "loss": 0.1731, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.77, + "memory/max_allocated (GiB)": 20.77, + "step": 19100, + "tokens_per_second_per_gpu": 356.72 + }, + { + "epoch": 1.8998359596361287, + "grad_norm": 0.5923382043838501, + "learning_rate": 1.2487826608768127e-06, + "loss": 0.149, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 19110, + "tokens_per_second_per_gpu": 323.65 + }, + { + "epoch": 1.9008301436595914, + "grad_norm": 0.3214609920978546, + "learning_rate": 1.2241785584820808e-06, + "loss": 0.1449, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 19120, + "tokens_per_second_per_gpu": 397.11 + }, + { + "epoch": 1.901824327683054, + "grad_norm": 0.47794198989868164, + "learning_rate": 1.199817761812294e-06, + "loss": 0.0914, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 19130, + "tokens_per_second_per_gpu": 309.82 + }, + { + "epoch": 1.9028185117065168, + "grad_norm": 0.5440109372138977, + "learning_rate": 1.175700330873275e-06, + "loss": 0.1356, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 19140, + "tokens_per_second_per_gpu": 385.7 + }, + { + "epoch": 1.9038126957299797, + "grad_norm": 0.5214424729347229, + "learning_rate": 1.1518263250713147e-06, + "loss": 0.1127, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 19150, + "tokens_per_second_per_gpu": 314.47 + }, + { + "epoch": 1.9048068797534423, + "grad_norm": 0.42965662479400635, + "learning_rate": 1.1281958032131611e-06, + "loss": 0.1387, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 19160, + "tokens_per_second_per_gpu": 381.94 + }, + { + "epoch": 1.905801063776905, + "grad_norm": 0.2793689966201782, + "learning_rate": 1.1048088235057762e-06, + "loss": 0.1781, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 19170, + "tokens_per_second_per_gpu": 312.99 + }, + { + "epoch": 1.9067952478003678, + "grad_norm": 0.8887554407119751, + "learning_rate": 1.0816654435562458e-06, + "loss": 0.1757, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 19180, + "tokens_per_second_per_gpu": 381.31 + }, + { + "epoch": 1.9077894318238307, + "grad_norm": 0.4591097831726074, + "learning_rate": 1.0587657203715795e-06, + "loss": 0.1417, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.17, + "memory/max_allocated (GiB)": 18.17, + "step": 19190, + "tokens_per_second_per_gpu": 326.93 + }, + { + "epoch": 1.9087836158472933, + "grad_norm": 0.4055291414260864, + "learning_rate": 1.036109710358657e-06, + "loss": 0.1935, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 19200, + "tokens_per_second_per_gpu": 363.7 + }, + { + "epoch": 1.9097777998707561, + "grad_norm": 0.6825937628746033, + "learning_rate": 1.0136974693240153e-06, + "loss": 0.1551, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 17.41, + "memory/max_allocated (GiB)": 17.41, + "step": 19210, + "tokens_per_second_per_gpu": 291.2 + }, + { + "epoch": 1.910771983894219, + "grad_norm": 0.43678414821624756, + "learning_rate": 9.915290524737274e-07, + "loss": 0.1015, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.69, + "memory/max_allocated (GiB)": 19.69, + "step": 19220, + "tokens_per_second_per_gpu": 363.79 + }, + { + "epoch": 1.9117661679176816, + "grad_norm": 0.24454043805599213, + "learning_rate": 9.696045144133136e-07, + "loss": 0.1326, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.19, + "memory/max_allocated (GiB)": 18.19, + "step": 19230, + "tokens_per_second_per_gpu": 314.11 + }, + { + "epoch": 1.9127603519411442, + "grad_norm": 0.40022847056388855, + "learning_rate": 9.4792390914753e-07, + "loss": 0.1666, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.65, + "memory/max_allocated (GiB)": 20.65, + "step": 19240, + "tokens_per_second_per_gpu": 377.05 + }, + { + "epoch": 1.913754535964607, + "grad_norm": 0.4418630003929138, + "learning_rate": 9.264872900802912e-07, + "loss": 0.2076, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.39, + "memory/max_allocated (GiB)": 21.39, + "step": 19250, + "tokens_per_second_per_gpu": 374.03 + }, + { + "epoch": 1.91474871998807, + "grad_norm": 0.1951112598180771, + "learning_rate": 9.052947100145149e-07, + "loss": 0.1309, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 19260, + "tokens_per_second_per_gpu": 372.81 + }, + { + "epoch": 1.9157429040115326, + "grad_norm": 0.4262109100818634, + "learning_rate": 8.843462211520215e-07, + "loss": 0.159, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 19270, + "tokens_per_second_per_gpu": 352.44 + }, + { + "epoch": 1.9167370880349952, + "grad_norm": 0.35612645745277405, + "learning_rate": 8.636418750933461e-07, + "loss": 0.1805, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.78, + "memory/max_allocated (GiB)": 20.78, + "step": 19280, + "tokens_per_second_per_gpu": 423.92 + }, + { + "epoch": 1.917731272058458, + "grad_norm": 0.4936760365962982, + "learning_rate": 8.431817228376937e-07, + "loss": 0.151, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 19290, + "tokens_per_second_per_gpu": 382.85 + }, + { + "epoch": 1.918725456081921, + "grad_norm": 0.6932761669158936, + "learning_rate": 8.229658147827169e-07, + "loss": 0.1807, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 19300, + "tokens_per_second_per_gpu": 369.43 + }, + { + "epoch": 1.9197196401053835, + "grad_norm": 0.40733301639556885, + "learning_rate": 8.02994200724494e-07, + "loss": 0.0993, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.4, + "memory/max_allocated (GiB)": 21.4, + "step": 19310, + "tokens_per_second_per_gpu": 290.15 + }, + { + "epoch": 1.9207138241288462, + "grad_norm": 0.49551132321357727, + "learning_rate": 7.83266929857307e-07, + "loss": 0.1942, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 19320, + "tokens_per_second_per_gpu": 361.74 + }, + { + "epoch": 1.921708008152309, + "grad_norm": 0.5282984972000122, + "learning_rate": 7.637840507736194e-07, + "loss": 0.2019, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 19330, + "tokens_per_second_per_gpu": 354.99 + }, + { + "epoch": 1.9227021921757719, + "grad_norm": 0.6770442724227905, + "learning_rate": 7.445456114638539e-07, + "loss": 0.1387, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 19340, + "tokens_per_second_per_gpu": 297.34 + }, + { + "epoch": 1.9236963761992345, + "grad_norm": 0.35707518458366394, + "learning_rate": 7.255516593163703e-07, + "loss": 0.1626, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 19350, + "tokens_per_second_per_gpu": 361.77 + }, + { + "epoch": 1.9246905602226971, + "grad_norm": 0.5072949528694153, + "learning_rate": 7.06802241117288e-07, + "loss": 0.19, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.45, + "memory/max_allocated (GiB)": 20.45, + "step": 19360, + "tokens_per_second_per_gpu": 382.78 + }, + { + "epoch": 1.92568474424616, + "grad_norm": 0.5360376834869385, + "learning_rate": 6.882974030503863e-07, + "loss": 0.1228, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 19370, + "tokens_per_second_per_gpu": 318.51 + }, + { + "epoch": 1.9266789282696228, + "grad_norm": 0.5876972079277039, + "learning_rate": 6.700371906969815e-07, + "loss": 0.1511, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 19380, + "tokens_per_second_per_gpu": 409.23 + }, + { + "epoch": 1.9276731122930855, + "grad_norm": 0.5715491771697998, + "learning_rate": 6.520216490358388e-07, + "loss": 0.1781, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.16, + "memory/max_allocated (GiB)": 18.16, + "step": 19390, + "tokens_per_second_per_gpu": 394.12 + }, + { + "epoch": 1.928667296316548, + "grad_norm": 0.47656625509262085, + "learning_rate": 6.342508224430499e-07, + "loss": 0.1633, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 19400, + "tokens_per_second_per_gpu": 406.05 + }, + { + "epoch": 1.929661480340011, + "grad_norm": 0.531021237373352, + "learning_rate": 6.167247546919219e-07, + "loss": 0.1678, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 19410, + "tokens_per_second_per_gpu": 428.32 + }, + { + "epoch": 1.9306556643634738, + "grad_norm": 0.5396085381507874, + "learning_rate": 5.994434889528556e-07, + "loss": 0.1532, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.4, + "memory/max_allocated (GiB)": 21.4, + "step": 19420, + "tokens_per_second_per_gpu": 393.0 + }, + { + "epoch": 1.9316498483869364, + "grad_norm": 0.33545199036598206, + "learning_rate": 5.824070677932558e-07, + "loss": 0.1298, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 19430, + "tokens_per_second_per_gpu": 361.14 + }, + { + "epoch": 1.932644032410399, + "grad_norm": 0.3300628066062927, + "learning_rate": 5.656155331774437e-07, + "loss": 0.155, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 19440, + "tokens_per_second_per_gpu": 350.34 + }, + { + "epoch": 1.933638216433862, + "grad_norm": 0.46381351351737976, + "learning_rate": 5.490689264665117e-07, + "loss": 0.1298, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 19450, + "tokens_per_second_per_gpu": 357.6 + }, + { + "epoch": 1.9346324004573248, + "grad_norm": 0.5388414263725281, + "learning_rate": 5.327672884182455e-07, + "loss": 0.1173, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 19460, + "tokens_per_second_per_gpu": 354.37 + }, + { + "epoch": 1.9356265844807874, + "grad_norm": 0.3915001153945923, + "learning_rate": 5.167106591870252e-07, + "loss": 0.1224, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 19470, + "tokens_per_second_per_gpu": 298.76 + }, + { + "epoch": 1.93662076850425, + "grad_norm": 0.3780313730239868, + "learning_rate": 5.008990783237244e-07, + "loss": 0.1175, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 19480, + "tokens_per_second_per_gpu": 300.87 + }, + { + "epoch": 1.9376149525277129, + "grad_norm": 0.5083069801330566, + "learning_rate": 4.853325847755997e-07, + "loss": 0.144, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 19490, + "tokens_per_second_per_gpu": 391.62 + }, + { + "epoch": 1.9386091365511757, + "grad_norm": 0.8569310307502747, + "learning_rate": 4.700112168862347e-07, + "loss": 0.1368, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 19500, + "tokens_per_second_per_gpu": 364.34 + }, + { + "epoch": 1.9396033205746384, + "grad_norm": 0.6087594032287598, + "learning_rate": 4.549350123953855e-07, + "loss": 0.1282, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.77, + "memory/max_allocated (GiB)": 20.77, + "step": 19510, + "tokens_per_second_per_gpu": 351.22 + }, + { + "epoch": 1.940597504598101, + "grad_norm": 0.5610264539718628, + "learning_rate": 4.4010400843892407e-07, + "loss": 0.1441, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 19520, + "tokens_per_second_per_gpu": 344.44 + }, + { + "epoch": 1.9415916886215638, + "grad_norm": 0.3209472894668579, + "learning_rate": 4.255182415487613e-07, + "loss": 0.1601, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 19530, + "tokens_per_second_per_gpu": 371.35 + }, + { + "epoch": 1.9425858726450267, + "grad_norm": 0.34549176692962646, + "learning_rate": 4.1117774765270235e-07, + "loss": 0.1846, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 19540, + "tokens_per_second_per_gpu": 392.28 + }, + { + "epoch": 1.9435800566684893, + "grad_norm": 0.610714852809906, + "learning_rate": 3.970825620744467e-07, + "loss": 0.1392, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 19550, + "tokens_per_second_per_gpu": 300.36 + }, + { + "epoch": 1.944574240691952, + "grad_norm": 0.42442601919174194, + "learning_rate": 3.8323271953338844e-07, + "loss": 0.2258, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 19560, + "tokens_per_second_per_gpu": 434.15 + }, + { + "epoch": 1.9455684247154148, + "grad_norm": 0.30905285477638245, + "learning_rate": 3.696282541446272e-07, + "loss": 0.1478, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 19570, + "tokens_per_second_per_gpu": 368.05 + }, + { + "epoch": 1.9465626087388777, + "grad_norm": 0.29837194085121155, + "learning_rate": 3.56269199418835e-07, + "loss": 0.1548, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 19580, + "tokens_per_second_per_gpu": 314.0 + }, + { + "epoch": 1.9475567927623403, + "grad_norm": 0.2777169346809387, + "learning_rate": 3.431555882621895e-07, + "loss": 0.1615, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 19590, + "tokens_per_second_per_gpu": 391.63 + }, + { + "epoch": 1.948550976785803, + "grad_norm": 0.7126250267028809, + "learning_rate": 3.302874529762745e-07, + "loss": 0.1598, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 19600, + "tokens_per_second_per_gpu": 376.12 + }, + { + "epoch": 1.9495451608092658, + "grad_norm": 0.5951281785964966, + "learning_rate": 3.176648252580461e-07, + "loss": 0.1509, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.53, + "memory/max_allocated (GiB)": 21.53, + "step": 19610, + "tokens_per_second_per_gpu": 361.14 + }, + { + "epoch": 1.9505393448327286, + "grad_norm": 0.5065922737121582, + "learning_rate": 3.0528773619969977e-07, + "loss": 0.1458, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 19620, + "tokens_per_second_per_gpu": 343.47 + }, + { + "epoch": 1.9515335288561912, + "grad_norm": 0.463041216135025, + "learning_rate": 2.9315621628860366e-07, + "loss": 0.1204, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 19630, + "tokens_per_second_per_gpu": 303.97 + }, + { + "epoch": 1.9525277128796539, + "grad_norm": 0.7519829869270325, + "learning_rate": 2.812702954072877e-07, + "loss": 0.1774, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 19640, + "tokens_per_second_per_gpu": 321.53 + }, + { + "epoch": 1.9535218969031167, + "grad_norm": 0.52646404504776, + "learning_rate": 2.6963000283325434e-07, + "loss": 0.1509, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 19650, + "tokens_per_second_per_gpu": 311.88 + }, + { + "epoch": 1.9545160809265796, + "grad_norm": 0.4693813920021057, + "learning_rate": 2.5823536723902364e-07, + "loss": 0.1782, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 19660, + "tokens_per_second_per_gpu": 393.51 + }, + { + "epoch": 1.9555102649500422, + "grad_norm": 0.5138804912567139, + "learning_rate": 2.470864166919884e-07, + "loss": 0.16, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 19670, + "tokens_per_second_per_gpu": 343.73 + }, + { + "epoch": 1.9565044489735048, + "grad_norm": 0.5430607795715332, + "learning_rate": 2.361831786543589e-07, + "loss": 0.0975, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 19680, + "tokens_per_second_per_gpu": 321.72 + }, + { + "epoch": 1.9574986329969677, + "grad_norm": 0.39533936977386475, + "learning_rate": 2.2552567998312957e-07, + "loss": 0.1112, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 19690, + "tokens_per_second_per_gpu": 342.61 + }, + { + "epoch": 1.9584928170204305, + "grad_norm": 0.4892284870147705, + "learning_rate": 2.151139469299679e-07, + "loss": 0.1444, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 19700, + "tokens_per_second_per_gpu": 337.57 + }, + { + "epoch": 1.9594870010438932, + "grad_norm": 0.6313555836677551, + "learning_rate": 2.0494800514117007e-07, + "loss": 0.1839, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.43, + "memory/max_allocated (GiB)": 20.43, + "step": 19710, + "tokens_per_second_per_gpu": 379.84 + }, + { + "epoch": 1.9604811850673558, + "grad_norm": 0.3936193585395813, + "learning_rate": 1.950278796576055e-07, + "loss": 0.1649, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.42, + "memory/max_allocated (GiB)": 21.42, + "step": 19720, + "tokens_per_second_per_gpu": 352.38 + }, + { + "epoch": 1.9614753690908189, + "grad_norm": 0.4717642068862915, + "learning_rate": 1.8535359491462789e-07, + "loss": 0.1267, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 19730, + "tokens_per_second_per_gpu": 297.29 + }, + { + "epoch": 1.9624695531142815, + "grad_norm": 0.574255108833313, + "learning_rate": 1.7592517474205317e-07, + "loss": 0.1551, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 19740, + "tokens_per_second_per_gpu": 367.19 + }, + { + "epoch": 1.9634637371377441, + "grad_norm": 0.6956222057342529, + "learning_rate": 1.6674264236408165e-07, + "loss": 0.1482, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 19750, + "tokens_per_second_per_gpu": 278.73 + }, + { + "epoch": 1.964457921161207, + "grad_norm": 0.5839533805847168, + "learning_rate": 1.5780602039920932e-07, + "loss": 0.1813, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.54, + "memory/max_allocated (GiB)": 21.54, + "step": 19760, + "tokens_per_second_per_gpu": 344.8 + }, + { + "epoch": 1.9654521051846698, + "grad_norm": 0.4014669954776764, + "learning_rate": 1.4911533086024997e-07, + "loss": 0.1851, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 19770, + "tokens_per_second_per_gpu": 318.26 + }, + { + "epoch": 1.9664462892081325, + "grad_norm": 0.33930712938308716, + "learning_rate": 1.406705951541909e-07, + "loss": 0.1547, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.41, + "memory/max_allocated (GiB)": 21.41, + "step": 19780, + "tokens_per_second_per_gpu": 422.7 + }, + { + "epoch": 1.967440473231595, + "grad_norm": 0.7784512639045715, + "learning_rate": 1.32471834082204e-07, + "loss": 0.1737, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 19790, + "tokens_per_second_per_gpu": 379.19 + }, + { + "epoch": 1.968434657255058, + "grad_norm": 0.5413442850112915, + "learning_rate": 1.2451906783957912e-07, + "loss": 0.1701, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 19800, + "tokens_per_second_per_gpu": 413.24 + }, + { + "epoch": 1.9694288412785208, + "grad_norm": 0.49978771805763245, + "learning_rate": 1.1681231601564647e-07, + "loss": 0.1503, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 19810, + "tokens_per_second_per_gpu": 325.42 + }, + { + "epoch": 1.9704230253019834, + "grad_norm": 0.5488161444664001, + "learning_rate": 1.0935159759378755e-07, + "loss": 0.1421, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 19820, + "tokens_per_second_per_gpu": 347.6 + }, + { + "epoch": 1.971417209325446, + "grad_norm": 0.36043041944503784, + "learning_rate": 1.0213693095130206e-07, + "loss": 0.1309, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 19830, + "tokens_per_second_per_gpu": 286.15 + }, + { + "epoch": 1.972411393348909, + "grad_norm": 0.30269375443458557, + "learning_rate": 9.516833385945224e-08, + "loss": 0.1344, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 19840, + "tokens_per_second_per_gpu": 411.14 + }, + { + "epoch": 1.9734055773723718, + "grad_norm": 0.6402490735054016, + "learning_rate": 8.844582348336294e-08, + "loss": 0.1754, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.58, + "memory/max_allocated (GiB)": 20.58, + "step": 19850, + "tokens_per_second_per_gpu": 333.73 + }, + { + "epoch": 1.9743997613958344, + "grad_norm": 0.3337123394012451, + "learning_rate": 8.196941638199951e-08, + "loss": 0.1245, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 19860, + "tokens_per_second_per_gpu": 365.58 + }, + { + "epoch": 1.975393945419297, + "grad_norm": 0.4003879427909851, + "learning_rate": 7.573912850812326e-08, + "loss": 0.1501, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.39, + "memory/max_allocated (GiB)": 21.39, + "step": 19870, + "tokens_per_second_per_gpu": 344.89 + }, + { + "epoch": 1.9763881294427599, + "grad_norm": 0.21886324882507324, + "learning_rate": 6.975497520824715e-08, + "loss": 0.1815, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 19880, + "tokens_per_second_per_gpu": 419.3 + }, + { + "epoch": 1.9773823134662227, + "grad_norm": 0.7370038628578186, + "learning_rate": 6.401697122260241e-08, + "loss": 0.1328, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.18, + "memory/max_allocated (GiB)": 18.18, + "step": 19890, + "tokens_per_second_per_gpu": 332.95 + }, + { + "epoch": 1.9783764974896854, + "grad_norm": 0.46660006046295166, + "learning_rate": 5.852513068511645e-08, + "loss": 0.1096, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 19900, + "tokens_per_second_per_gpu": 298.67 + }, + { + "epoch": 1.979370681513148, + "grad_norm": 0.5484210848808289, + "learning_rate": 5.3279467123346086e-08, + "loss": 0.1374, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 19910, + "tokens_per_second_per_gpu": 340.44 + }, + { + "epoch": 1.9803648655366108, + "grad_norm": 0.4510185122489929, + "learning_rate": 4.827999345846657e-08, + "loss": 0.1858, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 19920, + "tokens_per_second_per_gpu": 407.5 + }, + { + "epoch": 1.9813590495600737, + "grad_norm": 0.4410419464111328, + "learning_rate": 4.352672200523822e-08, + "loss": 0.1675, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 19930, + "tokens_per_second_per_gpu": 308.63 + }, + { + "epoch": 1.9823532335835363, + "grad_norm": 0.5038071274757385, + "learning_rate": 3.901966447197314e-08, + "loss": 0.1349, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 19940, + "tokens_per_second_per_gpu": 351.5 + }, + { + "epoch": 1.983347417606999, + "grad_norm": 1.0024924278259277, + "learning_rate": 3.4758831960524095e-08, + "loss": 0.1633, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 19950, + "tokens_per_second_per_gpu": 326.94 + }, + { + "epoch": 1.9843416016304618, + "grad_norm": 0.5408278107643127, + "learning_rate": 3.0744234966195715e-08, + "loss": 0.1853, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 19960, + "tokens_per_second_per_gpu": 380.23 + }, + { + "epoch": 1.9853357856539247, + "grad_norm": 0.6154715418815613, + "learning_rate": 2.6975883377799993e-08, + "loss": 0.159, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 19970, + "tokens_per_second_per_gpu": 345.31 + }, + { + "epoch": 1.9863299696773873, + "grad_norm": 0.52059006690979, + "learning_rate": 2.3453786477589668e-08, + "loss": 0.1489, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.64, + "memory/max_allocated (GiB)": 20.64, + "step": 19980, + "tokens_per_second_per_gpu": 365.37 + }, + { + "epoch": 1.98732415370085, + "grad_norm": 0.5241625905036926, + "learning_rate": 2.0177952941224932e-08, + "loss": 0.1679, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 19990, + "tokens_per_second_per_gpu": 361.81 + }, + { + "epoch": 1.9883183377243128, + "grad_norm": 0.5427475571632385, + "learning_rate": 1.7148390837784523e-08, + "loss": 0.1612, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 20000, + "tokens_per_second_per_gpu": 405.51 + }, + { + "epoch": 1.9893125217477756, + "grad_norm": 0.4156535267829895, + "learning_rate": 1.4365107629710218e-08, + "loss": 0.1309, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 20010, + "tokens_per_second_per_gpu": 371.64 + }, + { + "epoch": 1.9903067057712382, + "grad_norm": 0.5698441863059998, + "learning_rate": 1.182811017281793e-08, + "loss": 0.1373, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 20020, + "tokens_per_second_per_gpu": 339.51 + }, + { + "epoch": 1.9913008897947009, + "grad_norm": 0.7525358200073242, + "learning_rate": 9.537404716286613e-09, + "loss": 0.1489, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.67, + "memory/max_allocated (GiB)": 19.67, + "step": 20030, + "tokens_per_second_per_gpu": 322.13 + }, + { + "epoch": 1.9922950738181637, + "grad_norm": 0.6561192870140076, + "learning_rate": 7.49299690258054e-09, + "loss": 0.1278, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 20040, + "tokens_per_second_per_gpu": 405.65 + }, + { + "epoch": 1.9932892578416266, + "grad_norm": 0.28374719619750977, + "learning_rate": 5.694891767527022e-09, + "loss": 0.1298, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 20050, + "tokens_per_second_per_gpu": 347.78 + }, + { + "epoch": 1.9942834418650892, + "grad_norm": 0.46046513319015503, + "learning_rate": 4.1430937402275885e-09, + "loss": 0.105, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.53, + "memory/max_allocated (GiB)": 21.53, + "step": 20060, + "tokens_per_second_per_gpu": 354.72 + }, + { + "epoch": 1.9952776258885518, + "grad_norm": 0.6896146535873413, + "learning_rate": 2.837606643102397e-09, + "loss": 0.1675, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 20070, + "tokens_per_second_per_gpu": 322.76 + }, + { + "epoch": 1.9962718099120147, + "grad_norm": 0.29641345143318176, + "learning_rate": 1.7784336918347244e-09, + "loss": 0.1681, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 20080, + "tokens_per_second_per_gpu": 359.15 + }, + { + "epoch": 1.9972659939354775, + "grad_norm": 0.6377694010734558, + "learning_rate": 9.655774953931662e-10, + "loss": 0.164, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 20090, + "tokens_per_second_per_gpu": 351.54 + }, + { + "epoch": 1.9982601779589402, + "grad_norm": 0.5432741641998291, + "learning_rate": 3.990400559983343e-10, + "loss": 0.1053, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 20100, + "tokens_per_second_per_gpu": 346.46 + }, + { + "epoch": 1.9992543619824028, + "grad_norm": 0.35591670870780945, + "learning_rate": 7.882276917836606e-11, + "loss": 0.2105, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 20110, + "tokens_per_second_per_gpu": 408.89 + } + ], + "logging_steps": 10, + "max_steps": 20117, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.2374951650690335e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}