diff --git "a/checkpoint-10059/trainer_state.json" "b/checkpoint-10059/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-10059/trainer_state.json" @@ -0,0 +1,11089 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 10059, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000994184023462743, + "grad_norm": 1.0385397672653198, + "learning_rate": 1.8e-05, + "loss": 0.604, + "memory/device_reserved (GiB)": 21.62, + "memory/max_active (GiB)": 20.62, + "memory/max_allocated (GiB)": 20.62, + "step": 10, + "tokens_per_second_per_gpu": 347.46 + }, + { + "epoch": 0.001988368046925486, + "grad_norm": 0.556696891784668, + "learning_rate": 3.8e-05, + "loss": 0.4244, + "memory/device_reserved (GiB)": 21.62, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 20, + "tokens_per_second_per_gpu": 318.15 + }, + { + "epoch": 0.002982552070388229, + "grad_norm": 0.24665255844593048, + "learning_rate": 5.8e-05, + "loss": 0.3883, + "memory/device_reserved (GiB)": 21.62, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 30, + "tokens_per_second_per_gpu": 330.79 + }, + { + "epoch": 0.003976736093850972, + "grad_norm": 0.3350813090801239, + "learning_rate": 7.800000000000001e-05, + "loss": 0.4163, + "memory/device_reserved (GiB)": 22.38, + "memory/max_active (GiB)": 21.37, + "memory/max_allocated (GiB)": 21.37, + "step": 40, + "tokens_per_second_per_gpu": 396.14 + }, + { + "epoch": 0.0049709201173137145, + "grad_norm": 0.42506587505340576, + "learning_rate": 9.8e-05, + "loss": 0.3811, + "memory/device_reserved (GiB)": 22.42, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 50, + "tokens_per_second_per_gpu": 402.54 + }, + { + "epoch": 0.005965104140776458, + "grad_norm": 0.5153183937072754, + "learning_rate": 0.000118, + "loss": 0.418, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 60, + "tokens_per_second_per_gpu": 330.44 + }, + { + "epoch": 0.0069592881642392005, + "grad_norm": 0.3010534644126892, + "learning_rate": 0.000138, + "loss": 0.3671, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 70, + "tokens_per_second_per_gpu": 344.47 + }, + { + "epoch": 0.007953472187701944, + "grad_norm": 0.46113327145576477, + "learning_rate": 0.00015800000000000002, + "loss": 0.3387, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 80, + "tokens_per_second_per_gpu": 372.27 + }, + { + "epoch": 0.008947656211164686, + "grad_norm": 0.4268002212047577, + "learning_rate": 0.00017800000000000002, + "loss": 0.2999, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 90, + "tokens_per_second_per_gpu": 344.49 + }, + { + "epoch": 0.009941840234627429, + "grad_norm": 0.5650917291641235, + "learning_rate": 0.00019800000000000002, + "loss": 0.3356, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 100, + "tokens_per_second_per_gpu": 335.57 + }, + { + "epoch": 0.010936024258090173, + "grad_norm": 0.2521424889564514, + "learning_rate": 0.00019999990023993625, + "loss": 0.3025, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 110, + "tokens_per_second_per_gpu": 307.78 + }, + { + "epoch": 0.011930208281552916, + "grad_norm": 0.34742406010627747, + "learning_rate": 0.00019999955539058868, + "loss": 0.351, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 120, + "tokens_per_second_per_gpu": 355.93 + }, + { + "epoch": 0.012924392305015658, + "grad_norm": 0.2816642224788666, + "learning_rate": 0.00019999896422120075, + "loss": 0.4031, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 130, + "tokens_per_second_per_gpu": 392.82 + }, + { + "epoch": 0.013918576328478401, + "grad_norm": 0.41705670952796936, + "learning_rate": 0.0001999981267332287, + "loss": 0.3481, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 140, + "tokens_per_second_per_gpu": 404.67 + }, + { + "epoch": 0.014912760351941143, + "grad_norm": 0.5290879011154175, + "learning_rate": 0.00019999704292873545, + "loss": 0.3784, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 150, + "tokens_per_second_per_gpu": 409.59 + }, + { + "epoch": 0.015906944375403888, + "grad_norm": 0.2704632878303528, + "learning_rate": 0.0001999957128103906, + "loss": 0.2029, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 160, + "tokens_per_second_per_gpu": 309.25 + }, + { + "epoch": 0.01690112839886663, + "grad_norm": 0.3863286077976227, + "learning_rate": 0.00019999413638147049, + "loss": 0.3084, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 170, + "tokens_per_second_per_gpu": 395.0 + }, + { + "epoch": 0.017895312422329373, + "grad_norm": 0.32178717851638794, + "learning_rate": 0.00019999231364585827, + "loss": 0.2713, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 180, + "tokens_per_second_per_gpu": 343.31 + }, + { + "epoch": 0.018889496445792117, + "grad_norm": 0.3010699450969696, + "learning_rate": 0.00019999024460804366, + "loss": 0.353, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 190, + "tokens_per_second_per_gpu": 353.97 + }, + { + "epoch": 0.019883680469254858, + "grad_norm": 0.2678498327732086, + "learning_rate": 0.00019998792927312315, + "loss": 0.1904, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 200, + "tokens_per_second_per_gpu": 352.72 + }, + { + "epoch": 0.020877864492717602, + "grad_norm": 0.25298821926116943, + "learning_rate": 0.00019998536764679993, + "loss": 0.2397, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 210, + "tokens_per_second_per_gpu": 336.24 + }, + { + "epoch": 0.021872048516180347, + "grad_norm": 0.4027327001094818, + "learning_rate": 0.0001999825597353838, + "loss": 0.3111, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 220, + "tokens_per_second_per_gpu": 388.05 + }, + { + "epoch": 0.022866232539643087, + "grad_norm": 0.4060591757297516, + "learning_rate": 0.00019997950554579124, + "loss": 0.2578, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 230, + "tokens_per_second_per_gpu": 354.57 + }, + { + "epoch": 0.02386041656310583, + "grad_norm": 0.29408156871795654, + "learning_rate": 0.00019997620508554537, + "loss": 0.2952, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.45, + "memory/max_allocated (GiB)": 20.45, + "step": 240, + "tokens_per_second_per_gpu": 334.85 + }, + { + "epoch": 0.024854600586568572, + "grad_norm": 0.381528377532959, + "learning_rate": 0.00019997265836277595, + "loss": 0.2397, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 250, + "tokens_per_second_per_gpu": 402.73 + }, + { + "epoch": 0.025848784610031317, + "grad_norm": 0.30223792791366577, + "learning_rate": 0.00019996886538621925, + "loss": 0.4017, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 260, + "tokens_per_second_per_gpu": 430.9 + }, + { + "epoch": 0.02684296863349406, + "grad_norm": 0.3889918327331543, + "learning_rate": 0.0001999648261652182, + "loss": 0.2051, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 270, + "tokens_per_second_per_gpu": 279.64 + }, + { + "epoch": 0.027837152656956802, + "grad_norm": 0.4357030391693115, + "learning_rate": 0.00019996054070972225, + "loss": 0.3332, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 280, + "tokens_per_second_per_gpu": 362.33 + }, + { + "epoch": 0.028831336680419546, + "grad_norm": 0.3736005425453186, + "learning_rate": 0.00019995600903028742, + "loss": 0.3052, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 290, + "tokens_per_second_per_gpu": 362.89 + }, + { + "epoch": 0.029825520703882287, + "grad_norm": 0.39748865365982056, + "learning_rate": 0.00019995123113807615, + "loss": 0.361, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 300, + "tokens_per_second_per_gpu": 387.24 + }, + { + "epoch": 0.03081970472734503, + "grad_norm": 0.18977899849414825, + "learning_rate": 0.00019994620704485741, + "loss": 0.2449, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 310, + "tokens_per_second_per_gpu": 339.16 + }, + { + "epoch": 0.031813888750807776, + "grad_norm": 0.3898354172706604, + "learning_rate": 0.00019994093676300662, + "loss": 0.266, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 320, + "tokens_per_second_per_gpu": 360.62 + }, + { + "epoch": 0.032808072774270516, + "grad_norm": 0.3335312008857727, + "learning_rate": 0.00019993542030550553, + "loss": 0.2886, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 330, + "tokens_per_second_per_gpu": 386.95 + }, + { + "epoch": 0.03380225679773326, + "grad_norm": 0.3043772280216217, + "learning_rate": 0.00019992965768594244, + "loss": 0.2542, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 340, + "tokens_per_second_per_gpu": 400.18 + }, + { + "epoch": 0.034796440821196005, + "grad_norm": 0.35784608125686646, + "learning_rate": 0.00019992364891851185, + "loss": 0.2748, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.66, + "memory/max_allocated (GiB)": 19.66, + "step": 350, + "tokens_per_second_per_gpu": 333.64 + }, + { + "epoch": 0.035790624844658746, + "grad_norm": 0.5068204998970032, + "learning_rate": 0.00019991739401801464, + "loss": 0.2705, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 18.84, + "memory/max_allocated (GiB)": 18.84, + "step": 360, + "tokens_per_second_per_gpu": 367.07 + }, + { + "epoch": 0.03678480886812149, + "grad_norm": 0.44382113218307495, + "learning_rate": 0.00019991089299985793, + "loss": 0.2403, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 370, + "tokens_per_second_per_gpu": 365.0 + }, + { + "epoch": 0.037778992891584234, + "grad_norm": 0.27181145548820496, + "learning_rate": 0.0001999041458800551, + "loss": 0.3065, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 18.18, + "memory/max_allocated (GiB)": 18.18, + "step": 380, + "tokens_per_second_per_gpu": 372.54 + }, + { + "epoch": 0.038773176915046975, + "grad_norm": 0.28408923745155334, + "learning_rate": 0.00019989715267522575, + "loss": 0.2894, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 390, + "tokens_per_second_per_gpu": 340.31 + }, + { + "epoch": 0.039767360938509716, + "grad_norm": 0.4882698357105255, + "learning_rate": 0.00019988991340259563, + "loss": 0.4061, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 400, + "tokens_per_second_per_gpu": 358.94 + }, + { + "epoch": 0.040761544961972464, + "grad_norm": 0.2663392722606659, + "learning_rate": 0.0001998824280799966, + "loss": 0.3141, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 410, + "tokens_per_second_per_gpu": 368.76 + }, + { + "epoch": 0.041755728985435205, + "grad_norm": 0.25356051325798035, + "learning_rate": 0.00019987469672586654, + "loss": 0.3374, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 420, + "tokens_per_second_per_gpu": 402.99 + }, + { + "epoch": 0.042749913008897945, + "grad_norm": 0.4773045778274536, + "learning_rate": 0.00019986671935924946, + "loss": 0.2929, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 430, + "tokens_per_second_per_gpu": 360.32 + }, + { + "epoch": 0.04374409703236069, + "grad_norm": 0.37164929509162903, + "learning_rate": 0.0001998584959997953, + "loss": 0.3106, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 440, + "tokens_per_second_per_gpu": 415.21 + }, + { + "epoch": 0.044738281055823434, + "grad_norm": 0.3310747742652893, + "learning_rate": 0.00019985002666775986, + "loss": 0.2676, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 450, + "tokens_per_second_per_gpu": 363.95 + }, + { + "epoch": 0.045732465079286175, + "grad_norm": 0.32523512840270996, + "learning_rate": 0.000199841311384005, + "loss": 0.3139, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 460, + "tokens_per_second_per_gpu": 361.92 + }, + { + "epoch": 0.046726649102748916, + "grad_norm": 0.40525123476982117, + "learning_rate": 0.00019983235016999827, + "loss": 0.323, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 18.84, + "memory/max_allocated (GiB)": 18.84, + "step": 470, + "tokens_per_second_per_gpu": 413.28 + }, + { + "epoch": 0.04772083312621166, + "grad_norm": 0.4233141541481018, + "learning_rate": 0.000199823143047813, + "loss": 0.2941, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 480, + "tokens_per_second_per_gpu": 383.95 + }, + { + "epoch": 0.048715017149674404, + "grad_norm": 0.21106044948101044, + "learning_rate": 0.0001998136900401283, + "loss": 0.2835, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 490, + "tokens_per_second_per_gpu": 368.99 + }, + { + "epoch": 0.049709201173137145, + "grad_norm": 0.34198832511901855, + "learning_rate": 0.00019980399117022895, + "loss": 0.3895, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 500, + "tokens_per_second_per_gpu": 387.18 + }, + { + "epoch": 0.05070338519659989, + "grad_norm": 0.44045203924179077, + "learning_rate": 0.00019979404646200527, + "loss": 0.2854, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 510, + "tokens_per_second_per_gpu": 336.09 + }, + { + "epoch": 0.051697569220062634, + "grad_norm": 0.33906373381614685, + "learning_rate": 0.0001997838559399532, + "loss": 0.3218, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 520, + "tokens_per_second_per_gpu": 406.12 + }, + { + "epoch": 0.052691753243525374, + "grad_norm": 0.32613444328308105, + "learning_rate": 0.00019977341962917414, + "loss": 0.2803, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 530, + "tokens_per_second_per_gpu": 407.45 + }, + { + "epoch": 0.05368593726698812, + "grad_norm": 0.3789099454879761, + "learning_rate": 0.00019976273755537499, + "loss": 0.3143, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 540, + "tokens_per_second_per_gpu": 377.61 + }, + { + "epoch": 0.05468012129045086, + "grad_norm": 0.4602185785770416, + "learning_rate": 0.00019975180974486786, + "loss": 0.2434, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 550, + "tokens_per_second_per_gpu": 349.16 + }, + { + "epoch": 0.055674305313913604, + "grad_norm": 0.4232983887195587, + "learning_rate": 0.00019974063622457032, + "loss": 0.3238, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 18.84, + "memory/max_allocated (GiB)": 18.84, + "step": 560, + "tokens_per_second_per_gpu": 348.84 + }, + { + "epoch": 0.05666848933737635, + "grad_norm": 0.16223137080669403, + "learning_rate": 0.0001997292170220051, + "loss": 0.2722, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 570, + "tokens_per_second_per_gpu": 340.57 + }, + { + "epoch": 0.05766267336083909, + "grad_norm": 0.4484419822692871, + "learning_rate": 0.00019971755216530008, + "loss": 0.2801, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.53, + "memory/max_allocated (GiB)": 20.53, + "step": 580, + "tokens_per_second_per_gpu": 353.53 + }, + { + "epoch": 0.05865685738430183, + "grad_norm": 0.23834413290023804, + "learning_rate": 0.0001997056416831883, + "loss": 0.3015, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 590, + "tokens_per_second_per_gpu": 365.09 + }, + { + "epoch": 0.059651041407764574, + "grad_norm": 0.4154009521007538, + "learning_rate": 0.0001996934856050078, + "loss": 0.2959, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 600, + "tokens_per_second_per_gpu": 365.21 + }, + { + "epoch": 0.06064522543122732, + "grad_norm": 0.23120558261871338, + "learning_rate": 0.00019968108396070157, + "loss": 0.2563, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 610, + "tokens_per_second_per_gpu": 397.04 + }, + { + "epoch": 0.06163940945469006, + "grad_norm": 0.4453487694263458, + "learning_rate": 0.00019966843678081745, + "loss": 0.3025, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 620, + "tokens_per_second_per_gpu": 380.75 + }, + { + "epoch": 0.06263359347815281, + "grad_norm": 0.47098028659820557, + "learning_rate": 0.0001996555440965081, + "loss": 0.2248, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 630, + "tokens_per_second_per_gpu": 367.41 + }, + { + "epoch": 0.06362777750161555, + "grad_norm": 0.2540164887905121, + "learning_rate": 0.000199642405939531, + "loss": 0.2597, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 640, + "tokens_per_second_per_gpu": 361.69 + }, + { + "epoch": 0.06462196152507829, + "grad_norm": 0.30327877402305603, + "learning_rate": 0.00019962902234224816, + "loss": 0.2623, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 650, + "tokens_per_second_per_gpu": 316.43 + }, + { + "epoch": 0.06561614554854103, + "grad_norm": 0.3211521804332733, + "learning_rate": 0.00019961539333762622, + "loss": 0.2571, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 660, + "tokens_per_second_per_gpu": 352.04 + }, + { + "epoch": 0.06661032957200377, + "grad_norm": 0.19880682229995728, + "learning_rate": 0.00019960151895923628, + "loss": 0.2531, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 670, + "tokens_per_second_per_gpu": 419.33 + }, + { + "epoch": 0.06760451359546651, + "grad_norm": 0.3732224702835083, + "learning_rate": 0.0001995873992412539, + "loss": 0.3275, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 680, + "tokens_per_second_per_gpu": 428.82 + }, + { + "epoch": 0.06859869761892927, + "grad_norm": 0.2961219847202301, + "learning_rate": 0.00019957303421845889, + "loss": 0.2884, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 690, + "tokens_per_second_per_gpu": 394.73 + }, + { + "epoch": 0.06959288164239201, + "grad_norm": 0.4014001488685608, + "learning_rate": 0.00019955842392623539, + "loss": 0.25, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.77, + "memory/max_allocated (GiB)": 19.77, + "step": 700, + "tokens_per_second_per_gpu": 402.5 + }, + { + "epoch": 0.07058706566585475, + "grad_norm": 0.3465085029602051, + "learning_rate": 0.0001995435684005716, + "loss": 0.3208, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 710, + "tokens_per_second_per_gpu": 418.41 + }, + { + "epoch": 0.07158124968931749, + "grad_norm": 0.302223265171051, + "learning_rate": 0.0001995284676780598, + "loss": 0.2538, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 720, + "tokens_per_second_per_gpu": 303.27 + }, + { + "epoch": 0.07257543371278023, + "grad_norm": 0.27174416184425354, + "learning_rate": 0.00019951312179589632, + "loss": 0.2559, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 730, + "tokens_per_second_per_gpu": 373.05 + }, + { + "epoch": 0.07356961773624297, + "grad_norm": 0.23477095365524292, + "learning_rate": 0.00019949753079188124, + "loss": 0.2655, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 740, + "tokens_per_second_per_gpu": 334.0 + }, + { + "epoch": 0.07456380175970573, + "grad_norm": 0.35739773511886597, + "learning_rate": 0.00019948169470441855, + "loss": 0.2869, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.58, + "memory/max_allocated (GiB)": 20.58, + "step": 750, + "tokens_per_second_per_gpu": 398.33 + }, + { + "epoch": 0.07555798578316847, + "grad_norm": 0.42109552025794983, + "learning_rate": 0.0001994656135725159, + "loss": 0.254, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.69, + "memory/max_allocated (GiB)": 19.69, + "step": 760, + "tokens_per_second_per_gpu": 303.95 + }, + { + "epoch": 0.07655216980663121, + "grad_norm": 0.5730820298194885, + "learning_rate": 0.00019944928743578446, + "loss": 0.2718, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.09, + "memory/max_allocated (GiB)": 19.09, + "step": 770, + "tokens_per_second_per_gpu": 379.86 + }, + { + "epoch": 0.07754635383009395, + "grad_norm": 0.3591574430465698, + "learning_rate": 0.000199432716334439, + "loss": 0.3039, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 780, + "tokens_per_second_per_gpu": 387.31 + }, + { + "epoch": 0.07854053785355669, + "grad_norm": 0.2447095662355423, + "learning_rate": 0.0001994159003092976, + "loss": 0.3044, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 790, + "tokens_per_second_per_gpu": 332.59 + }, + { + "epoch": 0.07953472187701943, + "grad_norm": 1.134704351425171, + "learning_rate": 0.0001993988394017817, + "loss": 0.21, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 800, + "tokens_per_second_per_gpu": 376.02 + }, + { + "epoch": 0.08052890590048217, + "grad_norm": 0.4009522795677185, + "learning_rate": 0.00019938153365391595, + "loss": 0.3189, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 810, + "tokens_per_second_per_gpu": 409.87 + }, + { + "epoch": 0.08152308992394493, + "grad_norm": 0.32234618067741394, + "learning_rate": 0.00019936398310832802, + "loss": 0.3242, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 820, + "tokens_per_second_per_gpu": 429.4 + }, + { + "epoch": 0.08251727394740767, + "grad_norm": 0.42835402488708496, + "learning_rate": 0.00019934618780824865, + "loss": 0.2646, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.4, + "memory/max_allocated (GiB)": 21.4, + "step": 830, + "tokens_per_second_per_gpu": 334.03 + }, + { + "epoch": 0.08351145797087041, + "grad_norm": 0.39109423756599426, + "learning_rate": 0.00019932814779751143, + "loss": 0.2891, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 840, + "tokens_per_second_per_gpu": 386.65 + }, + { + "epoch": 0.08450564199433315, + "grad_norm": 0.30428260564804077, + "learning_rate": 0.00019930986312055268, + "loss": 0.2478, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 850, + "tokens_per_second_per_gpu": 344.59 + }, + { + "epoch": 0.08549982601779589, + "grad_norm": 0.41264912486076355, + "learning_rate": 0.00019929133382241146, + "loss": 0.2942, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 860, + "tokens_per_second_per_gpu": 315.94 + }, + { + "epoch": 0.08649401004125863, + "grad_norm": 0.3414939045906067, + "learning_rate": 0.00019927255994872932, + "loss": 0.2403, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.62, + "memory/max_allocated (GiB)": 20.62, + "step": 870, + "tokens_per_second_per_gpu": 373.66 + }, + { + "epoch": 0.08748819406472139, + "grad_norm": 0.3265244662761688, + "learning_rate": 0.00019925354154575028, + "loss": 0.2024, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 880, + "tokens_per_second_per_gpu": 304.2 + }, + { + "epoch": 0.08848237808818413, + "grad_norm": 0.45628246665000916, + "learning_rate": 0.00019923427866032074, + "loss": 0.2319, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 890, + "tokens_per_second_per_gpu": 361.11 + }, + { + "epoch": 0.08947656211164687, + "grad_norm": 0.15768177807331085, + "learning_rate": 0.00019921477133988917, + "loss": 0.2944, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 900, + "tokens_per_second_per_gpu": 342.86 + }, + { + "epoch": 0.09047074613510961, + "grad_norm": 0.31603825092315674, + "learning_rate": 0.0001991950196325063, + "loss": 0.2697, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 910, + "tokens_per_second_per_gpu": 366.81 + }, + { + "epoch": 0.09146493015857235, + "grad_norm": 0.30974528193473816, + "learning_rate": 0.00019917502358682474, + "loss": 0.2915, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.23, + "memory/max_allocated (GiB)": 19.23, + "step": 920, + "tokens_per_second_per_gpu": 333.81 + }, + { + "epoch": 0.09245911418203509, + "grad_norm": 0.5116154551506042, + "learning_rate": 0.00019915478325209892, + "loss": 0.2984, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 930, + "tokens_per_second_per_gpu": 420.26 + }, + { + "epoch": 0.09345329820549783, + "grad_norm": 0.4105079174041748, + "learning_rate": 0.00019913429867818517, + "loss": 0.2456, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 940, + "tokens_per_second_per_gpu": 297.2 + }, + { + "epoch": 0.09444748222896059, + "grad_norm": 0.32649049162864685, + "learning_rate": 0.00019911356991554122, + "loss": 0.2974, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 950, + "tokens_per_second_per_gpu": 395.2 + }, + { + "epoch": 0.09544166625242333, + "grad_norm": 0.3354904353618622, + "learning_rate": 0.00019909259701522645, + "loss": 0.2627, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 960, + "tokens_per_second_per_gpu": 324.97 + }, + { + "epoch": 0.09643585027588607, + "grad_norm": 0.33404502272605896, + "learning_rate": 0.00019907138002890154, + "loss": 0.2386, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 970, + "tokens_per_second_per_gpu": 317.18 + }, + { + "epoch": 0.09743003429934881, + "grad_norm": 0.3161865472793579, + "learning_rate": 0.0001990499190088284, + "loss": 0.2362, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 980, + "tokens_per_second_per_gpu": 310.78 + }, + { + "epoch": 0.09842421832281155, + "grad_norm": 0.33621177077293396, + "learning_rate": 0.00019902821400787004, + "loss": 0.2792, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 990, + "tokens_per_second_per_gpu": 333.17 + }, + { + "epoch": 0.09941840234627429, + "grad_norm": 0.09930042922496796, + "learning_rate": 0.00019900626507949053, + "loss": 0.2622, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 1000, + "tokens_per_second_per_gpu": 365.14 + }, + { + "epoch": 0.10041258636973704, + "grad_norm": 0.3367346525192261, + "learning_rate": 0.00019898407227775464, + "loss": 0.2214, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.62, + "memory/max_allocated (GiB)": 20.62, + "step": 1010, + "tokens_per_second_per_gpu": 342.03 + }, + { + "epoch": 0.10140677039319979, + "grad_norm": 0.35485416650772095, + "learning_rate": 0.00019896163565732798, + "loss": 0.3446, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 1020, + "tokens_per_second_per_gpu": 378.53 + }, + { + "epoch": 0.10240095441666253, + "grad_norm": 0.31839531660079956, + "learning_rate": 0.0001989389552734767, + "loss": 0.2519, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 1030, + "tokens_per_second_per_gpu": 390.74 + }, + { + "epoch": 0.10339513844012527, + "grad_norm": 0.36318239569664, + "learning_rate": 0.0001989160311820673, + "loss": 0.3, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 1040, + "tokens_per_second_per_gpu": 404.8 + }, + { + "epoch": 0.10438932246358801, + "grad_norm": 0.20552317798137665, + "learning_rate": 0.00019889286343956677, + "loss": 0.2531, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 1050, + "tokens_per_second_per_gpu": 349.7 + }, + { + "epoch": 0.10538350648705075, + "grad_norm": 0.4065081477165222, + "learning_rate": 0.00019886945210304208, + "loss": 0.3196, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 1060, + "tokens_per_second_per_gpu": 356.46 + }, + { + "epoch": 0.10637769051051349, + "grad_norm": 0.3974571228027344, + "learning_rate": 0.00019884579723016037, + "loss": 0.2585, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 1070, + "tokens_per_second_per_gpu": 361.16 + }, + { + "epoch": 0.10737187453397624, + "grad_norm": 0.39827367663383484, + "learning_rate": 0.0001988218988791885, + "loss": 0.2807, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 1080, + "tokens_per_second_per_gpu": 399.5 + }, + { + "epoch": 0.10836605855743899, + "grad_norm": 0.38661155104637146, + "learning_rate": 0.00019879775710899322, + "loss": 0.262, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.81, + "memory/max_allocated (GiB)": 19.81, + "step": 1090, + "tokens_per_second_per_gpu": 311.89 + }, + { + "epoch": 0.10936024258090173, + "grad_norm": 0.272942453622818, + "learning_rate": 0.0001987733719790408, + "loss": 0.1925, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 1100, + "tokens_per_second_per_gpu": 399.72 + }, + { + "epoch": 0.11035442660436447, + "grad_norm": 0.32272958755493164, + "learning_rate": 0.00019874874354939697, + "loss": 0.1643, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 1110, + "tokens_per_second_per_gpu": 315.4 + }, + { + "epoch": 0.11134861062782721, + "grad_norm": 0.3583936095237732, + "learning_rate": 0.00019872387188072673, + "loss": 0.2834, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 1120, + "tokens_per_second_per_gpu": 439.26 + }, + { + "epoch": 0.11234279465128995, + "grad_norm": 0.3295114040374756, + "learning_rate": 0.00019869875703429433, + "loss": 0.2157, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 1130, + "tokens_per_second_per_gpu": 290.71 + }, + { + "epoch": 0.1133369786747527, + "grad_norm": 0.21794943511486053, + "learning_rate": 0.00019867339907196283, + "loss": 0.2848, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 1140, + "tokens_per_second_per_gpu": 382.48 + }, + { + "epoch": 0.11433116269821544, + "grad_norm": 0.47928282618522644, + "learning_rate": 0.00019864779805619435, + "loss": 0.2497, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 1150, + "tokens_per_second_per_gpu": 351.65 + }, + { + "epoch": 0.11532534672167818, + "grad_norm": 0.4386768341064453, + "learning_rate": 0.0001986219540500496, + "loss": 0.2885, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 1160, + "tokens_per_second_per_gpu": 351.08 + }, + { + "epoch": 0.11631953074514093, + "grad_norm": 0.46047040820121765, + "learning_rate": 0.00019859586711718776, + "loss": 0.325, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 1170, + "tokens_per_second_per_gpu": 375.58 + }, + { + "epoch": 0.11731371476860367, + "grad_norm": 0.4252080023288727, + "learning_rate": 0.00019856953732186653, + "loss": 0.2923, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 1180, + "tokens_per_second_per_gpu": 349.36 + }, + { + "epoch": 0.11830789879206641, + "grad_norm": 0.3643350899219513, + "learning_rate": 0.00019854296472894168, + "loss": 0.2315, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 1190, + "tokens_per_second_per_gpu": 390.28 + }, + { + "epoch": 0.11930208281552915, + "grad_norm": 0.4747346341609955, + "learning_rate": 0.00019851614940386722, + "loss": 0.3214, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.19, + "memory/max_allocated (GiB)": 19.19, + "step": 1200, + "tokens_per_second_per_gpu": 414.56 + }, + { + "epoch": 0.1202962668389919, + "grad_norm": 0.32468438148498535, + "learning_rate": 0.0001984890914126949, + "loss": 0.2606, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 1210, + "tokens_per_second_per_gpu": 393.59 + }, + { + "epoch": 0.12129045086245464, + "grad_norm": 0.27018123865127563, + "learning_rate": 0.00019846179082207429, + "loss": 0.2457, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 1220, + "tokens_per_second_per_gpu": 357.71 + }, + { + "epoch": 0.12228463488591738, + "grad_norm": 0.39820268750190735, + "learning_rate": 0.00019843424769925248, + "loss": 0.2618, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.4, + "memory/max_allocated (GiB)": 21.4, + "step": 1230, + "tokens_per_second_per_gpu": 377.03 + }, + { + "epoch": 0.12327881890938013, + "grad_norm": 0.4186467230319977, + "learning_rate": 0.00019840646211207407, + "loss": 0.2864, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 1240, + "tokens_per_second_per_gpu": 415.58 + }, + { + "epoch": 0.12427300293284287, + "grad_norm": 0.3047218918800354, + "learning_rate": 0.00019837843412898081, + "loss": 0.1777, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 1250, + "tokens_per_second_per_gpu": 376.07 + }, + { + "epoch": 0.12526718695630562, + "grad_norm": 0.3663698136806488, + "learning_rate": 0.0001983501638190115, + "loss": 0.2906, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 1260, + "tokens_per_second_per_gpu": 341.64 + }, + { + "epoch": 0.12626137097976836, + "grad_norm": 0.5897945761680603, + "learning_rate": 0.00019832165125180194, + "loss": 0.2498, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 1270, + "tokens_per_second_per_gpu": 380.54 + }, + { + "epoch": 0.1272555550032311, + "grad_norm": 0.40836209058761597, + "learning_rate": 0.0001982928964975846, + "loss": 0.2722, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 1280, + "tokens_per_second_per_gpu": 387.55 + }, + { + "epoch": 0.12824973902669384, + "grad_norm": 0.33597612380981445, + "learning_rate": 0.00019826389962718848, + "loss": 0.3202, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 1290, + "tokens_per_second_per_gpu": 313.03 + }, + { + "epoch": 0.12924392305015658, + "grad_norm": 0.44784456491470337, + "learning_rate": 0.00019823466071203902, + "loss": 0.2949, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 1300, + "tokens_per_second_per_gpu": 390.03 + }, + { + "epoch": 0.13023810707361932, + "grad_norm": 0.3199595510959625, + "learning_rate": 0.0001982051798241579, + "loss": 0.2323, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 1310, + "tokens_per_second_per_gpu": 415.8 + }, + { + "epoch": 0.13123229109708207, + "grad_norm": 0.4944785535335541, + "learning_rate": 0.0001981754570361627, + "loss": 0.291, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 1320, + "tokens_per_second_per_gpu": 347.08 + }, + { + "epoch": 0.1322264751205448, + "grad_norm": 0.379162073135376, + "learning_rate": 0.00019814549242126698, + "loss": 0.2631, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 1330, + "tokens_per_second_per_gpu": 405.3 + }, + { + "epoch": 0.13322065914400755, + "grad_norm": 0.20690025389194489, + "learning_rate": 0.00019811528605327992, + "loss": 0.2099, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 1340, + "tokens_per_second_per_gpu": 377.71 + }, + { + "epoch": 0.1342148431674703, + "grad_norm": 0.39738351106643677, + "learning_rate": 0.00019808483800660612, + "loss": 0.2486, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 1350, + "tokens_per_second_per_gpu": 333.2 + }, + { + "epoch": 0.13520902719093303, + "grad_norm": 0.5237305164337158, + "learning_rate": 0.00019805414835624566, + "loss": 0.2407, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 1360, + "tokens_per_second_per_gpu": 353.94 + }, + { + "epoch": 0.1362032112143958, + "grad_norm": 0.2773837447166443, + "learning_rate": 0.00019802321717779354, + "loss": 0.3119, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 1370, + "tokens_per_second_per_gpu": 391.05 + }, + { + "epoch": 0.13719739523785854, + "grad_norm": 0.2825298011302948, + "learning_rate": 0.00019799204454743987, + "loss": 0.2812, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 1380, + "tokens_per_second_per_gpu": 348.62 + }, + { + "epoch": 0.13819157926132128, + "grad_norm": 0.3622908592224121, + "learning_rate": 0.00019796063054196937, + "loss": 0.2506, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 1390, + "tokens_per_second_per_gpu": 380.11 + }, + { + "epoch": 0.13918576328478402, + "grad_norm": 0.3992385268211365, + "learning_rate": 0.0001979289752387614, + "loss": 0.2132, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 1400, + "tokens_per_second_per_gpu": 301.04 + }, + { + "epoch": 0.14017994730824676, + "grad_norm": 0.4148050546646118, + "learning_rate": 0.00019789707871578966, + "loss": 0.1813, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 1410, + "tokens_per_second_per_gpu": 296.18 + }, + { + "epoch": 0.1411741313317095, + "grad_norm": 0.36811864376068115, + "learning_rate": 0.000197864941051622, + "loss": 0.2607, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 1420, + "tokens_per_second_per_gpu": 376.48 + }, + { + "epoch": 0.14216831535517224, + "grad_norm": 0.33353865146636963, + "learning_rate": 0.00019783256232542033, + "loss": 0.2694, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 1430, + "tokens_per_second_per_gpu": 352.36 + }, + { + "epoch": 0.14316249937863498, + "grad_norm": 0.4390527606010437, + "learning_rate": 0.00019779994261694025, + "loss": 0.2851, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 1440, + "tokens_per_second_per_gpu": 332.85 + }, + { + "epoch": 0.14415668340209772, + "grad_norm": 0.4553990066051483, + "learning_rate": 0.00019776708200653102, + "loss": 0.3301, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 1450, + "tokens_per_second_per_gpu": 400.73 + }, + { + "epoch": 0.14515086742556046, + "grad_norm": 0.3526112139225006, + "learning_rate": 0.00019773398057513526, + "loss": 0.2276, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.53, + "memory/max_allocated (GiB)": 21.53, + "step": 1460, + "tokens_per_second_per_gpu": 342.73 + }, + { + "epoch": 0.1461450514490232, + "grad_norm": 0.4758242070674896, + "learning_rate": 0.0001977006384042888, + "loss": 0.2185, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 1470, + "tokens_per_second_per_gpu": 343.6 + }, + { + "epoch": 0.14713923547248595, + "grad_norm": 0.4020686447620392, + "learning_rate": 0.00019766705557612045, + "loss": 0.2598, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.53, + "memory/max_allocated (GiB)": 21.53, + "step": 1480, + "tokens_per_second_per_gpu": 288.04 + }, + { + "epoch": 0.1481334194959487, + "grad_norm": 0.44152265787124634, + "learning_rate": 0.00019763323217335182, + "loss": 0.3394, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 1490, + "tokens_per_second_per_gpu": 408.81 + }, + { + "epoch": 0.14912760351941146, + "grad_norm": 0.31458431482315063, + "learning_rate": 0.00019759916827929706, + "loss": 0.2692, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 1500, + "tokens_per_second_per_gpu": 359.64 + }, + { + "epoch": 0.1501217875428742, + "grad_norm": 0.48072609305381775, + "learning_rate": 0.0001975648639778628, + "loss": 0.3545, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 1510, + "tokens_per_second_per_gpu": 374.78 + }, + { + "epoch": 0.15111597156633694, + "grad_norm": 0.30275699496269226, + "learning_rate": 0.00019753031935354777, + "loss": 0.2109, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 1520, + "tokens_per_second_per_gpu": 371.06 + }, + { + "epoch": 0.15211015558979968, + "grad_norm": 0.5390923619270325, + "learning_rate": 0.00019749553449144267, + "loss": 0.2435, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 18.84, + "memory/max_allocated (GiB)": 18.84, + "step": 1530, + "tokens_per_second_per_gpu": 368.76 + }, + { + "epoch": 0.15310433961326242, + "grad_norm": 0.28221625089645386, + "learning_rate": 0.00019746050947722993, + "loss": 0.2105, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 1540, + "tokens_per_second_per_gpu": 322.88 + }, + { + "epoch": 0.15409852363672516, + "grad_norm": 0.3471927046775818, + "learning_rate": 0.00019742524439718363, + "loss": 0.2761, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 17.4, + "memory/max_allocated (GiB)": 17.4, + "step": 1550, + "tokens_per_second_per_gpu": 331.45 + }, + { + "epoch": 0.1550927076601879, + "grad_norm": 0.34601831436157227, + "learning_rate": 0.0001973897393381691, + "loss": 0.2419, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 1560, + "tokens_per_second_per_gpu": 330.91 + }, + { + "epoch": 0.15608689168365064, + "grad_norm": 0.4680122435092926, + "learning_rate": 0.00019735399438764275, + "loss": 0.2948, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 1570, + "tokens_per_second_per_gpu": 369.5 + }, + { + "epoch": 0.15708107570711338, + "grad_norm": 0.35631850361824036, + "learning_rate": 0.000197318009633652, + "loss": 0.2865, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 1580, + "tokens_per_second_per_gpu": 331.56 + }, + { + "epoch": 0.15807525973057612, + "grad_norm": 0.43517372012138367, + "learning_rate": 0.0001972817851648349, + "loss": 0.2912, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 1590, + "tokens_per_second_per_gpu": 334.38 + }, + { + "epoch": 0.15906944375403886, + "grad_norm": 0.3614802360534668, + "learning_rate": 0.00019724532107041995, + "loss": 0.2182, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.45, + "memory/max_allocated (GiB)": 20.45, + "step": 1600, + "tokens_per_second_per_gpu": 330.71 + }, + { + "epoch": 0.1600636277775016, + "grad_norm": 0.3124898672103882, + "learning_rate": 0.00019720861744022594, + "loss": 0.1887, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 1610, + "tokens_per_second_per_gpu": 419.4 + }, + { + "epoch": 0.16105781180096435, + "grad_norm": 0.37882000207901, + "learning_rate": 0.00019717167436466166, + "loss": 0.3199, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 1620, + "tokens_per_second_per_gpu": 352.77 + }, + { + "epoch": 0.16205199582442711, + "grad_norm": 0.47813880443573, + "learning_rate": 0.00019713449193472572, + "loss": 0.2644, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 1630, + "tokens_per_second_per_gpu": 328.31 + }, + { + "epoch": 0.16304617984788985, + "grad_norm": 0.4390937387943268, + "learning_rate": 0.00019709707024200633, + "loss": 0.2157, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 1640, + "tokens_per_second_per_gpu": 337.97 + }, + { + "epoch": 0.1640403638713526, + "grad_norm": 0.28492602705955505, + "learning_rate": 0.00019705940937868096, + "loss": 0.2301, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 1650, + "tokens_per_second_per_gpu": 377.43 + }, + { + "epoch": 0.16503454789481534, + "grad_norm": 0.41057339310646057, + "learning_rate": 0.00019702150943751636, + "loss": 0.2755, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 1660, + "tokens_per_second_per_gpu": 364.78 + }, + { + "epoch": 0.16602873191827808, + "grad_norm": 0.36004287004470825, + "learning_rate": 0.00019698337051186803, + "loss": 0.2254, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 1670, + "tokens_per_second_per_gpu": 348.57 + }, + { + "epoch": 0.16702291594174082, + "grad_norm": 0.36488133668899536, + "learning_rate": 0.00019694499269568022, + "loss": 0.2556, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 1680, + "tokens_per_second_per_gpu": 338.09 + }, + { + "epoch": 0.16801709996520356, + "grad_norm": 0.1386333703994751, + "learning_rate": 0.00019690637608348562, + "loss": 0.2765, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 1690, + "tokens_per_second_per_gpu": 279.22 + }, + { + "epoch": 0.1690112839886663, + "grad_norm": 0.30839794874191284, + "learning_rate": 0.00019686752077040505, + "loss": 0.2745, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.44, + "memory/max_allocated (GiB)": 20.44, + "step": 1700, + "tokens_per_second_per_gpu": 363.03 + }, + { + "epoch": 0.17000546801212904, + "grad_norm": 0.48986828327178955, + "learning_rate": 0.00019682842685214745, + "loss": 0.2415, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 1710, + "tokens_per_second_per_gpu": 321.69 + }, + { + "epoch": 0.17099965203559178, + "grad_norm": 0.25523653626441956, + "learning_rate": 0.00019678909442500937, + "loss": 0.2618, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 18.18, + "memory/max_allocated (GiB)": 18.18, + "step": 1720, + "tokens_per_second_per_gpu": 312.41 + }, + { + "epoch": 0.17199383605905452, + "grad_norm": 0.1990797519683838, + "learning_rate": 0.00019674952358587488, + "loss": 0.2569, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 1730, + "tokens_per_second_per_gpu": 338.48 + }, + { + "epoch": 0.17298802008251726, + "grad_norm": 1.2966935634613037, + "learning_rate": 0.00019670971443221528, + "loss": 0.2789, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 1740, + "tokens_per_second_per_gpu": 377.96 + }, + { + "epoch": 0.17398220410598, + "grad_norm": 0.4156191647052765, + "learning_rate": 0.00019666966706208898, + "loss": 0.2537, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 1750, + "tokens_per_second_per_gpu": 423.21 + }, + { + "epoch": 0.17497638812944277, + "grad_norm": 0.43119025230407715, + "learning_rate": 0.00019662938157414113, + "loss": 0.3316, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 1760, + "tokens_per_second_per_gpu": 385.14 + }, + { + "epoch": 0.1759705721529055, + "grad_norm": 0.4492398798465729, + "learning_rate": 0.00019658885806760336, + "loss": 0.2969, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 1770, + "tokens_per_second_per_gpu": 364.99 + }, + { + "epoch": 0.17696475617636825, + "grad_norm": 0.4664623737335205, + "learning_rate": 0.00019654809664229364, + "loss": 0.291, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 1780, + "tokens_per_second_per_gpu": 295.57 + }, + { + "epoch": 0.177958940199831, + "grad_norm": 0.46608006954193115, + "learning_rate": 0.000196507097398616, + "loss": 0.295, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 1790, + "tokens_per_second_per_gpu": 284.68 + }, + { + "epoch": 0.17895312422329374, + "grad_norm": 0.378513365983963, + "learning_rate": 0.00019646586043756023, + "loss": 0.2396, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 1800, + "tokens_per_second_per_gpu": 325.71 + }, + { + "epoch": 0.17994730824675648, + "grad_norm": 0.36179885268211365, + "learning_rate": 0.00019642438586070168, + "loss": 0.2364, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 1810, + "tokens_per_second_per_gpu": 349.38 + }, + { + "epoch": 0.18094149227021922, + "grad_norm": 0.31644207239151, + "learning_rate": 0.000196382673770201, + "loss": 0.223, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 1820, + "tokens_per_second_per_gpu": 320.59 + }, + { + "epoch": 0.18193567629368196, + "grad_norm": 0.27562472224235535, + "learning_rate": 0.00019634072426880382, + "loss": 0.2641, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 1830, + "tokens_per_second_per_gpu": 386.93 + }, + { + "epoch": 0.1829298603171447, + "grad_norm": 0.4514225423336029, + "learning_rate": 0.00019629853745984076, + "loss": 0.2167, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 1840, + "tokens_per_second_per_gpu": 350.66 + }, + { + "epoch": 0.18392404434060744, + "grad_norm": 0.43807438015937805, + "learning_rate": 0.00019625611344722675, + "loss": 0.2429, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 1850, + "tokens_per_second_per_gpu": 390.61 + }, + { + "epoch": 0.18491822836407018, + "grad_norm": 0.3817460536956787, + "learning_rate": 0.00019621345233546115, + "loss": 0.2565, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 1860, + "tokens_per_second_per_gpu": 366.89 + }, + { + "epoch": 0.18591241238753292, + "grad_norm": 0.51050865650177, + "learning_rate": 0.0001961705542296272, + "loss": 0.2815, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 1870, + "tokens_per_second_per_gpu": 314.07 + }, + { + "epoch": 0.18690659641099566, + "grad_norm": 0.34085533022880554, + "learning_rate": 0.00019612741923539218, + "loss": 0.2353, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 1880, + "tokens_per_second_per_gpu": 366.93 + }, + { + "epoch": 0.18790078043445843, + "grad_norm": 0.38903746008872986, + "learning_rate": 0.00019608404745900652, + "loss": 0.2679, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 1890, + "tokens_per_second_per_gpu": 407.24 + }, + { + "epoch": 0.18889496445792117, + "grad_norm": 0.3198868930339813, + "learning_rate": 0.00019604043900730414, + "loss": 0.299, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 1900, + "tokens_per_second_per_gpu": 420.81 + }, + { + "epoch": 0.1898891484813839, + "grad_norm": 0.43067699670791626, + "learning_rate": 0.0001959965939877019, + "loss": 0.2957, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 1910, + "tokens_per_second_per_gpu": 467.81 + }, + { + "epoch": 0.19088333250484665, + "grad_norm": 0.3319333493709564, + "learning_rate": 0.00019595251250819932, + "loss": 0.2512, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 17.12, + "memory/max_allocated (GiB)": 17.12, + "step": 1920, + "tokens_per_second_per_gpu": 283.54 + }, + { + "epoch": 0.1918775165283094, + "grad_norm": 0.399617999792099, + "learning_rate": 0.00019590819467737837, + "loss": 0.2627, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.53, + "memory/max_allocated (GiB)": 20.53, + "step": 1930, + "tokens_per_second_per_gpu": 314.92 + }, + { + "epoch": 0.19287170055177214, + "grad_norm": 0.31971487402915955, + "learning_rate": 0.00019586364060440332, + "loss": 0.2705, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.77, + "memory/max_allocated (GiB)": 19.77, + "step": 1940, + "tokens_per_second_per_gpu": 371.9 + }, + { + "epoch": 0.19386588457523488, + "grad_norm": 0.22907792031764984, + "learning_rate": 0.0001958188503990202, + "loss": 0.2479, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 1950, + "tokens_per_second_per_gpu": 329.97 + }, + { + "epoch": 0.19486006859869762, + "grad_norm": 0.3624865710735321, + "learning_rate": 0.00019577382417155676, + "loss": 0.309, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.53, + "memory/max_allocated (GiB)": 20.53, + "step": 1960, + "tokens_per_second_per_gpu": 466.41 + }, + { + "epoch": 0.19585425262216036, + "grad_norm": 0.3285991847515106, + "learning_rate": 0.00019572856203292215, + "loss": 0.2188, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 1970, + "tokens_per_second_per_gpu": 336.34 + }, + { + "epoch": 0.1968484366456231, + "grad_norm": 0.22952090203762054, + "learning_rate": 0.00019568306409460654, + "loss": 0.2277, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 1980, + "tokens_per_second_per_gpu": 377.15 + }, + { + "epoch": 0.19784262066908584, + "grad_norm": 0.38868236541748047, + "learning_rate": 0.000195637330468681, + "loss": 0.299, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.62, + "memory/max_allocated (GiB)": 20.62, + "step": 1990, + "tokens_per_second_per_gpu": 371.96 + }, + { + "epoch": 0.19883680469254858, + "grad_norm": 0.30058735609054565, + "learning_rate": 0.0001955913612677971, + "loss": 0.2993, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 2000, + "tokens_per_second_per_gpu": 418.93 + }, + { + "epoch": 0.19983098871601132, + "grad_norm": 0.25231263041496277, + "learning_rate": 0.00019554515660518668, + "loss": 0.2894, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 2010, + "tokens_per_second_per_gpu": 372.68 + }, + { + "epoch": 0.2008251727394741, + "grad_norm": 0.5621690154075623, + "learning_rate": 0.00019549871659466165, + "loss": 0.2152, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 2020, + "tokens_per_second_per_gpu": 358.38 + }, + { + "epoch": 0.20181935676293683, + "grad_norm": 0.3443728983402252, + "learning_rate": 0.0001954520413506135, + "loss": 0.2398, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 2030, + "tokens_per_second_per_gpu": 306.09 + }, + { + "epoch": 0.20281354078639957, + "grad_norm": 0.40964439511299133, + "learning_rate": 0.0001954051309880133, + "loss": 0.2259, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 2040, + "tokens_per_second_per_gpu": 371.31 + }, + { + "epoch": 0.2038077248098623, + "grad_norm": 0.49535176157951355, + "learning_rate": 0.0001953579856224111, + "loss": 0.2681, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 2050, + "tokens_per_second_per_gpu": 368.85 + }, + { + "epoch": 0.20480190883332505, + "grad_norm": 0.4733733832836151, + "learning_rate": 0.00019531060536993598, + "loss": 0.2309, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.47, + "memory/max_allocated (GiB)": 21.47, + "step": 2060, + "tokens_per_second_per_gpu": 378.99 + }, + { + "epoch": 0.2057960928567878, + "grad_norm": 0.4018077552318573, + "learning_rate": 0.00019526299034729544, + "loss": 0.2717, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 2070, + "tokens_per_second_per_gpu": 373.89 + }, + { + "epoch": 0.20679027688025053, + "grad_norm": 0.18390242755413055, + "learning_rate": 0.0001952151406717754, + "loss": 0.2604, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 2080, + "tokens_per_second_per_gpu": 339.07 + }, + { + "epoch": 0.20778446090371328, + "grad_norm": 0.31284740567207336, + "learning_rate": 0.0001951670564612397, + "loss": 0.2528, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 18.17, + "memory/max_allocated (GiB)": 18.17, + "step": 2090, + "tokens_per_second_per_gpu": 358.01 + }, + { + "epoch": 0.20877864492717602, + "grad_norm": 0.1479184776544571, + "learning_rate": 0.0001951187378341299, + "loss": 0.2753, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 2100, + "tokens_per_second_per_gpu": 373.8 + }, + { + "epoch": 0.20977282895063876, + "grad_norm": 0.35812532901763916, + "learning_rate": 0.00019507018490946503, + "loss": 0.2799, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 2110, + "tokens_per_second_per_gpu": 394.97 + }, + { + "epoch": 0.2107670129741015, + "grad_norm": 0.39899322390556335, + "learning_rate": 0.00019502139780684118, + "loss": 0.2785, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 2120, + "tokens_per_second_per_gpu": 371.01 + }, + { + "epoch": 0.21176119699756424, + "grad_norm": 0.30708786845207214, + "learning_rate": 0.00019497237664643132, + "loss": 0.2985, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 2130, + "tokens_per_second_per_gpu": 393.88 + }, + { + "epoch": 0.21275538102102698, + "grad_norm": 0.280734658241272, + "learning_rate": 0.00019492312154898488, + "loss": 0.2661, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 2140, + "tokens_per_second_per_gpu": 401.57 + }, + { + "epoch": 0.21374956504448975, + "grad_norm": 0.19114673137664795, + "learning_rate": 0.00019487363263582765, + "loss": 0.2197, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 2150, + "tokens_per_second_per_gpu": 346.18 + }, + { + "epoch": 0.2147437490679525, + "grad_norm": 0.5506372451782227, + "learning_rate": 0.00019482391002886122, + "loss": 0.2724, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 2160, + "tokens_per_second_per_gpu": 281.19 + }, + { + "epoch": 0.21573793309141523, + "grad_norm": 0.3187256157398224, + "learning_rate": 0.0001947739538505629, + "loss": 0.2758, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 2170, + "tokens_per_second_per_gpu": 405.51 + }, + { + "epoch": 0.21673211711487797, + "grad_norm": 0.4344545602798462, + "learning_rate": 0.00019472376422398528, + "loss": 0.2792, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 2180, + "tokens_per_second_per_gpu": 313.61 + }, + { + "epoch": 0.2177263011383407, + "grad_norm": 0.4564478099346161, + "learning_rate": 0.00019467334127275606, + "loss": 0.2474, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.43, + "memory/max_allocated (GiB)": 20.43, + "step": 2190, + "tokens_per_second_per_gpu": 328.19 + }, + { + "epoch": 0.21872048516180345, + "grad_norm": 0.25979048013687134, + "learning_rate": 0.00019462268512107766, + "loss": 0.2877, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 2200, + "tokens_per_second_per_gpu": 360.33 + }, + { + "epoch": 0.2197146691852662, + "grad_norm": 0.3676232397556305, + "learning_rate": 0.00019457179589372684, + "loss": 0.3336, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 2210, + "tokens_per_second_per_gpu": 368.21 + }, + { + "epoch": 0.22070885320872893, + "grad_norm": 0.7130278944969177, + "learning_rate": 0.0001945206737160545, + "loss": 0.253, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 2220, + "tokens_per_second_per_gpu": 305.12 + }, + { + "epoch": 0.22170303723219167, + "grad_norm": 2.660079002380371, + "learning_rate": 0.0001944693187139854, + "loss": 0.3069, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 2230, + "tokens_per_second_per_gpu": 337.93 + }, + { + "epoch": 0.22269722125565442, + "grad_norm": 0.2822214663028717, + "learning_rate": 0.00019441773101401777, + "loss": 0.2744, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 2240, + "tokens_per_second_per_gpu": 349.76 + }, + { + "epoch": 0.22369140527911716, + "grad_norm": 0.45128756761550903, + "learning_rate": 0.00019436591074322302, + "loss": 0.245, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.44, + "memory/max_allocated (GiB)": 20.44, + "step": 2250, + "tokens_per_second_per_gpu": 390.72 + }, + { + "epoch": 0.2246855893025799, + "grad_norm": 0.2821448743343353, + "learning_rate": 0.00019431385802924539, + "loss": 0.2625, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 2260, + "tokens_per_second_per_gpu": 321.11 + }, + { + "epoch": 0.22567977332604264, + "grad_norm": 0.3787819445133209, + "learning_rate": 0.00019426157300030176, + "loss": 0.2116, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 2270, + "tokens_per_second_per_gpu": 345.13 + }, + { + "epoch": 0.2266739573495054, + "grad_norm": 0.36587730050086975, + "learning_rate": 0.0001942090557851812, + "loss": 0.2618, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 2280, + "tokens_per_second_per_gpu": 288.26 + }, + { + "epoch": 0.22766814137296815, + "grad_norm": 0.5559653639793396, + "learning_rate": 0.0001941563065132447, + "loss": 0.307, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 2290, + "tokens_per_second_per_gpu": 402.27 + }, + { + "epoch": 0.2286623253964309, + "grad_norm": 0.26463228464126587, + "learning_rate": 0.0001941033253144249, + "loss": 0.2583, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 2300, + "tokens_per_second_per_gpu": 388.95 + }, + { + "epoch": 0.22965650941989363, + "grad_norm": 0.330695778131485, + "learning_rate": 0.0001940501123192256, + "loss": 0.2671, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 2310, + "tokens_per_second_per_gpu": 353.27 + }, + { + "epoch": 0.23065069344335637, + "grad_norm": 0.3159751892089844, + "learning_rate": 0.00019399666765872176, + "loss": 0.2023, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 2320, + "tokens_per_second_per_gpu": 347.05 + }, + { + "epoch": 0.2316448774668191, + "grad_norm": 0.4762006103992462, + "learning_rate": 0.0001939429914645588, + "loss": 0.2633, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 2330, + "tokens_per_second_per_gpu": 292.93 + }, + { + "epoch": 0.23263906149028185, + "grad_norm": 0.47802531719207764, + "learning_rate": 0.00019388908386895254, + "loss": 0.2381, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 2340, + "tokens_per_second_per_gpu": 366.87 + }, + { + "epoch": 0.2336332455137446, + "grad_norm": 0.4501783847808838, + "learning_rate": 0.00019383494500468883, + "loss": 0.3052, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 2350, + "tokens_per_second_per_gpu": 412.56 + }, + { + "epoch": 0.23462742953720733, + "grad_norm": 0.3937671184539795, + "learning_rate": 0.0001937805750051231, + "loss": 0.2495, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 2360, + "tokens_per_second_per_gpu": 374.59 + }, + { + "epoch": 0.23562161356067007, + "grad_norm": 0.2139206975698471, + "learning_rate": 0.00019372597400418019, + "loss": 0.1679, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 2370, + "tokens_per_second_per_gpu": 329.45 + }, + { + "epoch": 0.23661579758413281, + "grad_norm": 0.4658704102039337, + "learning_rate": 0.00019367114213635382, + "loss": 0.2242, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 2380, + "tokens_per_second_per_gpu": 302.96 + }, + { + "epoch": 0.23760998160759556, + "grad_norm": 0.34023401141166687, + "learning_rate": 0.00019361607953670654, + "loss": 0.2632, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 2390, + "tokens_per_second_per_gpu": 387.14 + }, + { + "epoch": 0.2386041656310583, + "grad_norm": 0.4804230034351349, + "learning_rate": 0.00019356078634086914, + "loss": 0.251, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 2400, + "tokens_per_second_per_gpu": 410.6 + }, + { + "epoch": 0.23959834965452106, + "grad_norm": 0.32980307936668396, + "learning_rate": 0.00019350526268504048, + "loss": 0.176, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 2410, + "tokens_per_second_per_gpu": 372.52 + }, + { + "epoch": 0.2405925336779838, + "grad_norm": 0.25920480489730835, + "learning_rate": 0.00019344950870598703, + "loss": 0.2976, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.83, + "memory/max_allocated (GiB)": 19.83, + "step": 2420, + "tokens_per_second_per_gpu": 330.47 + }, + { + "epoch": 0.24158671770144655, + "grad_norm": 0.455616295337677, + "learning_rate": 0.00019339352454104264, + "loss": 0.2976, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 2430, + "tokens_per_second_per_gpu": 340.47 + }, + { + "epoch": 0.2425809017249093, + "grad_norm": 0.3018989562988281, + "learning_rate": 0.00019333731032810812, + "loss": 0.2732, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 2440, + "tokens_per_second_per_gpu": 462.35 + }, + { + "epoch": 0.24357508574837203, + "grad_norm": 0.4818798303604126, + "learning_rate": 0.00019328086620565095, + "loss": 0.2886, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 2450, + "tokens_per_second_per_gpu": 323.25 + }, + { + "epoch": 0.24456926977183477, + "grad_norm": 0.4339105188846588, + "learning_rate": 0.000193224192312705, + "loss": 0.3325, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 2460, + "tokens_per_second_per_gpu": 401.06 + }, + { + "epoch": 0.2455634537952975, + "grad_norm": 0.2657606303691864, + "learning_rate": 0.00019316728878887, + "loss": 0.2369, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 2470, + "tokens_per_second_per_gpu": 344.52 + }, + { + "epoch": 0.24655763781876025, + "grad_norm": 0.2487681657075882, + "learning_rate": 0.0001931101557743113, + "loss": 0.2481, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 2480, + "tokens_per_second_per_gpu": 338.75 + }, + { + "epoch": 0.247551821842223, + "grad_norm": 0.4219423532485962, + "learning_rate": 0.0001930527934097597, + "loss": 0.2467, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.23, + "memory/max_allocated (GiB)": 19.23, + "step": 2490, + "tokens_per_second_per_gpu": 357.65 + }, + { + "epoch": 0.24854600586568573, + "grad_norm": 0.3535912334918976, + "learning_rate": 0.00019299520183651075, + "loss": 0.2844, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 2500, + "tokens_per_second_per_gpu": 341.4 + }, + { + "epoch": 0.24954018988914847, + "grad_norm": 0.28509521484375, + "learning_rate": 0.0001929373811964247, + "loss": 0.2335, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 2510, + "tokens_per_second_per_gpu": 325.61 + }, + { + "epoch": 0.25053437391261124, + "grad_norm": 0.540902853012085, + "learning_rate": 0.00019287933163192602, + "loss": 0.314, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 2520, + "tokens_per_second_per_gpu": 390.5 + }, + { + "epoch": 0.251528557936074, + "grad_norm": 0.2346281111240387, + "learning_rate": 0.00019282105328600303, + "loss": 0.2592, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 17.12, + "memory/max_allocated (GiB)": 17.12, + "step": 2530, + "tokens_per_second_per_gpu": 335.14 + }, + { + "epoch": 0.2525227419595367, + "grad_norm": 0.4211244583129883, + "learning_rate": 0.0001927625463022076, + "loss": 0.2119, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 2540, + "tokens_per_second_per_gpu": 360.45 + }, + { + "epoch": 0.25351692598299946, + "grad_norm": 0.5125908851623535, + "learning_rate": 0.00019270381082465483, + "loss": 0.2628, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 2550, + "tokens_per_second_per_gpu": 316.73 + }, + { + "epoch": 0.2545111100064622, + "grad_norm": 0.20704445242881775, + "learning_rate": 0.00019264484699802262, + "loss": 0.2393, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 2560, + "tokens_per_second_per_gpu": 352.58 + }, + { + "epoch": 0.25550529402992495, + "grad_norm": 0.2666438817977905, + "learning_rate": 0.00019258565496755128, + "loss": 0.1621, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 2570, + "tokens_per_second_per_gpu": 346.34 + }, + { + "epoch": 0.2564994780533877, + "grad_norm": 0.4072653353214264, + "learning_rate": 0.00019252623487904335, + "loss": 0.2066, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 17.11, + "memory/max_allocated (GiB)": 17.11, + "step": 2580, + "tokens_per_second_per_gpu": 360.03 + }, + { + "epoch": 0.2574936620768504, + "grad_norm": 0.517437219619751, + "learning_rate": 0.00019246658687886302, + "loss": 0.256, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 2590, + "tokens_per_second_per_gpu": 335.1 + }, + { + "epoch": 0.25848784610031317, + "grad_norm": 1.1084229946136475, + "learning_rate": 0.00019240671111393597, + "loss": 0.2437, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 2600, + "tokens_per_second_per_gpu": 336.03 + }, + { + "epoch": 0.2594820301237759, + "grad_norm": 0.3532284200191498, + "learning_rate": 0.00019234660773174883, + "loss": 0.2102, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 2610, + "tokens_per_second_per_gpu": 304.29 + }, + { + "epoch": 0.26047621414723865, + "grad_norm": 0.4219340980052948, + "learning_rate": 0.00019228627688034898, + "loss": 0.3338, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.61, + "memory/max_allocated (GiB)": 20.61, + "step": 2620, + "tokens_per_second_per_gpu": 350.23 + }, + { + "epoch": 0.2614703981707014, + "grad_norm": 0.3215112090110779, + "learning_rate": 0.000192225718708344, + "loss": 0.2466, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 2630, + "tokens_per_second_per_gpu": 369.51 + }, + { + "epoch": 0.26246458219416413, + "grad_norm": 0.36818957328796387, + "learning_rate": 0.00019216493336490152, + "loss": 0.2839, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 2640, + "tokens_per_second_per_gpu": 369.73 + }, + { + "epoch": 0.26345876621762687, + "grad_norm": 0.41190099716186523, + "learning_rate": 0.0001921039209997486, + "loss": 0.2231, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 2650, + "tokens_per_second_per_gpu": 334.38 + }, + { + "epoch": 0.2644529502410896, + "grad_norm": 0.3659015893936157, + "learning_rate": 0.0001920426817631717, + "loss": 0.162, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 2660, + "tokens_per_second_per_gpu": 336.11 + }, + { + "epoch": 0.26544713426455235, + "grad_norm": 0.13166293501853943, + "learning_rate": 0.00019198121580601596, + "loss": 0.2313, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.44, + "memory/max_allocated (GiB)": 20.44, + "step": 2670, + "tokens_per_second_per_gpu": 341.25 + }, + { + "epoch": 0.2664413182880151, + "grad_norm": 0.3052745759487152, + "learning_rate": 0.00019191952327968497, + "loss": 0.2887, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 2680, + "tokens_per_second_per_gpu": 390.54 + }, + { + "epoch": 0.26743550231147784, + "grad_norm": 0.3094649910926819, + "learning_rate": 0.00019185760433614054, + "loss": 0.2272, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 2690, + "tokens_per_second_per_gpu": 371.44 + }, + { + "epoch": 0.2684296863349406, + "grad_norm": 4.753208160400391, + "learning_rate": 0.00019179545912790207, + "loss": 0.2826, + "memory/device_reserved (GiB)": 22.43, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 2700, + "tokens_per_second_per_gpu": 374.86 + }, + { + "epoch": 0.2694238703584033, + "grad_norm": 0.3265855014324188, + "learning_rate": 0.00019173308780804637, + "loss": 0.2372, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 2710, + "tokens_per_second_per_gpu": 320.39 + }, + { + "epoch": 0.27041805438186606, + "grad_norm": 0.3575897514820099, + "learning_rate": 0.00019167049053020712, + "loss": 0.247, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 2720, + "tokens_per_second_per_gpu": 312.17 + }, + { + "epoch": 0.27141223840532885, + "grad_norm": 0.486605703830719, + "learning_rate": 0.00019160766744857476, + "loss": 0.2732, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 2730, + "tokens_per_second_per_gpu": 388.44 + }, + { + "epoch": 0.2724064224287916, + "grad_norm": 0.3815803527832031, + "learning_rate": 0.00019154461871789572, + "loss": 0.2733, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.76, + "memory/max_allocated (GiB)": 19.76, + "step": 2740, + "tokens_per_second_per_gpu": 338.38 + }, + { + "epoch": 0.27340060645225434, + "grad_norm": 0.3805699050426483, + "learning_rate": 0.0001914813444934724, + "loss": 0.2912, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.67, + "memory/max_allocated (GiB)": 19.67, + "step": 2750, + "tokens_per_second_per_gpu": 385.54 + }, + { + "epoch": 0.2743947904757171, + "grad_norm": 0.5376086831092834, + "learning_rate": 0.00019141784493116254, + "loss": 0.3009, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 2760, + "tokens_per_second_per_gpu": 392.53 + }, + { + "epoch": 0.2753889744991798, + "grad_norm": 0.2709028720855713, + "learning_rate": 0.000191354120187379, + "loss": 0.3044, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 2770, + "tokens_per_second_per_gpu": 363.03 + }, + { + "epoch": 0.27638315852264256, + "grad_norm": 0.24123673141002655, + "learning_rate": 0.00019129017041908934, + "loss": 0.2744, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 2780, + "tokens_per_second_per_gpu": 351.75 + }, + { + "epoch": 0.2773773425461053, + "grad_norm": 0.4202309548854828, + "learning_rate": 0.00019122599578381532, + "loss": 0.2764, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 2790, + "tokens_per_second_per_gpu": 387.68 + }, + { + "epoch": 0.27837152656956804, + "grad_norm": 0.42621132731437683, + "learning_rate": 0.00019116159643963262, + "loss": 0.245, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 2800, + "tokens_per_second_per_gpu": 351.56 + }, + { + "epoch": 0.2793657105930308, + "grad_norm": 0.30440858006477356, + "learning_rate": 0.00019109697254517048, + "loss": 0.2809, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 2810, + "tokens_per_second_per_gpu": 382.5 + }, + { + "epoch": 0.2803598946164935, + "grad_norm": 0.49908211827278137, + "learning_rate": 0.00019103212425961111, + "loss": 0.319, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 2820, + "tokens_per_second_per_gpu": 402.18 + }, + { + "epoch": 0.28135407863995626, + "grad_norm": 0.38030481338500977, + "learning_rate": 0.00019096705174268967, + "loss": 0.2392, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 2830, + "tokens_per_second_per_gpu": 319.29 + }, + { + "epoch": 0.282348262663419, + "grad_norm": 0.4159802198410034, + "learning_rate": 0.00019090175515469344, + "loss": 0.2962, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 2840, + "tokens_per_second_per_gpu": 354.51 + }, + { + "epoch": 0.28334244668688174, + "grad_norm": 0.47257235646247864, + "learning_rate": 0.00019083623465646172, + "loss": 0.2523, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 2850, + "tokens_per_second_per_gpu": 336.31 + }, + { + "epoch": 0.2843366307103445, + "grad_norm": 0.2591971158981323, + "learning_rate": 0.0001907704904093854, + "loss": 0.2605, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 2860, + "tokens_per_second_per_gpu": 386.6 + }, + { + "epoch": 0.2853308147338072, + "grad_norm": 0.3341462314128876, + "learning_rate": 0.00019070452257540638, + "loss": 0.3024, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 2870, + "tokens_per_second_per_gpu": 389.19 + }, + { + "epoch": 0.28632499875726997, + "grad_norm": 0.3934953808784485, + "learning_rate": 0.00019063833131701744, + "loss": 0.2648, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.77, + "memory/max_allocated (GiB)": 19.77, + "step": 2880, + "tokens_per_second_per_gpu": 333.15 + }, + { + "epoch": 0.2873191827807327, + "grad_norm": 0.3615911304950714, + "learning_rate": 0.00019057191679726162, + "loss": 0.2232, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.77, + "memory/max_allocated (GiB)": 19.77, + "step": 2890, + "tokens_per_second_per_gpu": 326.24 + }, + { + "epoch": 0.28831336680419545, + "grad_norm": 0.45740652084350586, + "learning_rate": 0.00019050527917973192, + "loss": 0.2467, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 2900, + "tokens_per_second_per_gpu": 399.74 + }, + { + "epoch": 0.2893075508276582, + "grad_norm": 0.5232924818992615, + "learning_rate": 0.00019043841862857088, + "loss": 0.292, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.62, + "memory/max_allocated (GiB)": 20.62, + "step": 2910, + "tokens_per_second_per_gpu": 385.0 + }, + { + "epoch": 0.29030173485112093, + "grad_norm": 0.3617020547389984, + "learning_rate": 0.00019037133530847014, + "loss": 0.2618, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 2920, + "tokens_per_second_per_gpu": 380.42 + }, + { + "epoch": 0.29129591887458367, + "grad_norm": 0.4762667119503021, + "learning_rate": 0.00019030402938467013, + "loss": 0.281, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 2930, + "tokens_per_second_per_gpu": 398.66 + }, + { + "epoch": 0.2922901028980464, + "grad_norm": 0.5239278078079224, + "learning_rate": 0.00019023650102295957, + "loss": 0.2205, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 2940, + "tokens_per_second_per_gpu": 317.01 + }, + { + "epoch": 0.29328428692150915, + "grad_norm": 0.38952499628067017, + "learning_rate": 0.00019016875038967507, + "loss": 0.2618, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 2950, + "tokens_per_second_per_gpu": 356.78 + }, + { + "epoch": 0.2942784709449719, + "grad_norm": 0.4657343924045563, + "learning_rate": 0.00019010077765170072, + "loss": 0.2508, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 2960, + "tokens_per_second_per_gpu": 364.11 + }, + { + "epoch": 0.29527265496843463, + "grad_norm": 0.4659232497215271, + "learning_rate": 0.0001900325829764678, + "loss": 0.2148, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.69, + "memory/max_allocated (GiB)": 19.69, + "step": 2970, + "tokens_per_second_per_gpu": 387.16 + }, + { + "epoch": 0.2962668389918974, + "grad_norm": 0.5370404124259949, + "learning_rate": 0.0001899641665319542, + "loss": 0.2857, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 2980, + "tokens_per_second_per_gpu": 315.91 + }, + { + "epoch": 0.29726102301536017, + "grad_norm": 0.3220022916793823, + "learning_rate": 0.00018989552848668406, + "loss": 0.2989, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.58, + "memory/max_allocated (GiB)": 20.58, + "step": 2990, + "tokens_per_second_per_gpu": 394.41 + }, + { + "epoch": 0.2982552070388229, + "grad_norm": 0.38055410981178284, + "learning_rate": 0.0001898266690097274, + "loss": 0.2354, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 3000, + "tokens_per_second_per_gpu": 319.33 + }, + { + "epoch": 0.29924939106228565, + "grad_norm": 0.36663955450057983, + "learning_rate": 0.00018975758827069968, + "loss": 0.2437, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 3010, + "tokens_per_second_per_gpu": 326.18 + }, + { + "epoch": 0.3002435750857484, + "grad_norm": 0.3347165882587433, + "learning_rate": 0.00018968828643976135, + "loss": 0.2016, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 3020, + "tokens_per_second_per_gpu": 363.38 + }, + { + "epoch": 0.30123775910921113, + "grad_norm": 0.4647526144981384, + "learning_rate": 0.0001896187636876175, + "loss": 0.2421, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 3030, + "tokens_per_second_per_gpu": 366.11 + }, + { + "epoch": 0.3022319431326739, + "grad_norm": 0.35668620467185974, + "learning_rate": 0.00018954902018551728, + "loss": 0.28, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 3040, + "tokens_per_second_per_gpu": 333.01 + }, + { + "epoch": 0.3032261271561366, + "grad_norm": 0.4070911109447479, + "learning_rate": 0.00018947905610525374, + "loss": 0.3329, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.58, + "memory/max_allocated (GiB)": 20.58, + "step": 3050, + "tokens_per_second_per_gpu": 367.0 + }, + { + "epoch": 0.30422031117959936, + "grad_norm": 0.4529660642147064, + "learning_rate": 0.00018940887161916317, + "loss": 0.2532, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 3060, + "tokens_per_second_per_gpu": 438.51 + }, + { + "epoch": 0.3052144952030621, + "grad_norm": 0.323428213596344, + "learning_rate": 0.0001893384669001248, + "loss": 0.2717, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 3070, + "tokens_per_second_per_gpu": 304.06 + }, + { + "epoch": 0.30620867922652484, + "grad_norm": 0.33601540327072144, + "learning_rate": 0.00018926784212156038, + "loss": 0.2575, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 17.29, + "memory/max_allocated (GiB)": 17.29, + "step": 3080, + "tokens_per_second_per_gpu": 315.78 + }, + { + "epoch": 0.3072028632499876, + "grad_norm": 0.4147079586982727, + "learning_rate": 0.0001891969974574336, + "loss": 0.2606, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 3090, + "tokens_per_second_per_gpu": 357.02 + }, + { + "epoch": 0.3081970472734503, + "grad_norm": 0.5923524498939514, + "learning_rate": 0.00018912593308224987, + "loss": 0.2824, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 3100, + "tokens_per_second_per_gpu": 360.95 + }, + { + "epoch": 0.30919123129691306, + "grad_norm": 0.2961815297603607, + "learning_rate": 0.00018905464917105577, + "loss": 0.3001, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 3110, + "tokens_per_second_per_gpu": 369.31 + }, + { + "epoch": 0.3101854153203758, + "grad_norm": 0.2864610254764557, + "learning_rate": 0.00018898314589943862, + "loss": 0.2441, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 3120, + "tokens_per_second_per_gpu": 383.33 + }, + { + "epoch": 0.31117959934383854, + "grad_norm": 0.28341177105903625, + "learning_rate": 0.00018891142344352611, + "loss": 0.2402, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 3130, + "tokens_per_second_per_gpu": 347.6 + }, + { + "epoch": 0.3121737833673013, + "grad_norm": 0.3956068158149719, + "learning_rate": 0.0001888394819799858, + "loss": 0.2318, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 3140, + "tokens_per_second_per_gpu": 343.71 + }, + { + "epoch": 0.313167967390764, + "grad_norm": 0.3796366751194, + "learning_rate": 0.00018876732168602472, + "loss": 0.2047, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 3150, + "tokens_per_second_per_gpu": 353.35 + }, + { + "epoch": 0.31416215141422676, + "grad_norm": 0.34916970133781433, + "learning_rate": 0.00018869494273938893, + "loss": 0.3128, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 3160, + "tokens_per_second_per_gpu": 316.54 + }, + { + "epoch": 0.3151563354376895, + "grad_norm": 0.3254728317260742, + "learning_rate": 0.00018862234531836307, + "loss": 0.3033, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 3170, + "tokens_per_second_per_gpu": 377.71 + }, + { + "epoch": 0.31615051946115225, + "grad_norm": 0.43219825625419617, + "learning_rate": 0.0001885495296017699, + "loss": 0.2952, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 3180, + "tokens_per_second_per_gpu": 314.3 + }, + { + "epoch": 0.317144703484615, + "grad_norm": 0.2606908082962036, + "learning_rate": 0.00018847649576897, + "loss": 0.2087, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 3190, + "tokens_per_second_per_gpu": 345.1 + }, + { + "epoch": 0.31813888750807773, + "grad_norm": 0.46498534083366394, + "learning_rate": 0.00018840324399986105, + "loss": 0.2665, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 3200, + "tokens_per_second_per_gpu": 313.16 + }, + { + "epoch": 0.31913307153154047, + "grad_norm": 0.35087594389915466, + "learning_rate": 0.00018832977447487772, + "loss": 0.2328, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 3210, + "tokens_per_second_per_gpu": 321.23 + }, + { + "epoch": 0.3201272555550032, + "grad_norm": 0.46294090151786804, + "learning_rate": 0.00018825608737499088, + "loss": 0.2233, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 3220, + "tokens_per_second_per_gpu": 366.88 + }, + { + "epoch": 0.32112143957846595, + "grad_norm": 0.35385075211524963, + "learning_rate": 0.00018818218288170753, + "loss": 0.2653, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 3230, + "tokens_per_second_per_gpu": 350.06 + }, + { + "epoch": 0.3221156236019287, + "grad_norm": 0.4959578812122345, + "learning_rate": 0.00018810806117706998, + "loss": 0.2921, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 3240, + "tokens_per_second_per_gpu": 415.69 + }, + { + "epoch": 0.3231098076253915, + "grad_norm": 0.4978230893611908, + "learning_rate": 0.0001880337224436557, + "loss": 0.3187, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 3250, + "tokens_per_second_per_gpu": 377.08 + }, + { + "epoch": 0.32410399164885423, + "grad_norm": 0.45267730951309204, + "learning_rate": 0.00018795916686457667, + "loss": 0.3013, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 3260, + "tokens_per_second_per_gpu": 410.24 + }, + { + "epoch": 0.32509817567231697, + "grad_norm": 0.3263493478298187, + "learning_rate": 0.00018788439462347908, + "loss": 0.2876, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 3270, + "tokens_per_second_per_gpu": 446.75 + }, + { + "epoch": 0.3260923596957797, + "grad_norm": 0.4769454896450043, + "learning_rate": 0.00018780940590454277, + "loss": 0.3504, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 3280, + "tokens_per_second_per_gpu": 301.09 + }, + { + "epoch": 0.32708654371924245, + "grad_norm": 0.21503253281116486, + "learning_rate": 0.00018773420089248074, + "loss": 0.2291, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 3290, + "tokens_per_second_per_gpu": 375.18 + }, + { + "epoch": 0.3280807277427052, + "grad_norm": 0.49010154604911804, + "learning_rate": 0.00018765877977253888, + "loss": 0.2674, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 3300, + "tokens_per_second_per_gpu": 349.74 + }, + { + "epoch": 0.32907491176616793, + "grad_norm": 0.3158112168312073, + "learning_rate": 0.00018758314273049532, + "loss": 0.2975, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 3310, + "tokens_per_second_per_gpu": 336.25 + }, + { + "epoch": 0.3300690957896307, + "grad_norm": 0.428846150636673, + "learning_rate": 0.0001875072899526601, + "loss": 0.2618, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 3320, + "tokens_per_second_per_gpu": 284.44 + }, + { + "epoch": 0.3310632798130934, + "grad_norm": 0.5033756494522095, + "learning_rate": 0.00018743122162587464, + "loss": 0.2926, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 3330, + "tokens_per_second_per_gpu": 419.59 + }, + { + "epoch": 0.33205746383655615, + "grad_norm": 0.17825426161289215, + "learning_rate": 0.0001873549379375113, + "loss": 0.2464, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 3340, + "tokens_per_second_per_gpu": 342.16 + }, + { + "epoch": 0.3330516478600189, + "grad_norm": 0.34919679164886475, + "learning_rate": 0.00018727843907547293, + "loss": 0.2729, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 3350, + "tokens_per_second_per_gpu": 390.88 + }, + { + "epoch": 0.33404583188348164, + "grad_norm": 0.37125077843666077, + "learning_rate": 0.00018720172522819243, + "loss": 0.3117, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 3360, + "tokens_per_second_per_gpu": 394.73 + }, + { + "epoch": 0.3350400159069444, + "grad_norm": 0.31769683957099915, + "learning_rate": 0.00018712479658463215, + "loss": 0.2702, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 3370, + "tokens_per_second_per_gpu": 424.16 + }, + { + "epoch": 0.3360341999304071, + "grad_norm": 0.398548424243927, + "learning_rate": 0.00018704765333428367, + "loss": 0.1966, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 3380, + "tokens_per_second_per_gpu": 310.88 + }, + { + "epoch": 0.33702838395386986, + "grad_norm": 0.24172648787498474, + "learning_rate": 0.00018697029566716705, + "loss": 0.2189, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 3390, + "tokens_per_second_per_gpu": 311.63 + }, + { + "epoch": 0.3380225679773326, + "grad_norm": 0.46132785081863403, + "learning_rate": 0.00018689272377383064, + "loss": 0.3093, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 3400, + "tokens_per_second_per_gpu": 411.8 + }, + { + "epoch": 0.33901675200079534, + "grad_norm": 0.3627679944038391, + "learning_rate": 0.00018681493784535036, + "loss": 0.2558, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 3410, + "tokens_per_second_per_gpu": 319.21 + }, + { + "epoch": 0.3400109360242581, + "grad_norm": 1.1992244720458984, + "learning_rate": 0.00018673693807332945, + "loss": 0.228, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 3420, + "tokens_per_second_per_gpu": 354.78 + }, + { + "epoch": 0.3410051200477208, + "grad_norm": 0.26419004797935486, + "learning_rate": 0.00018665872464989773, + "loss": 0.1874, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 3430, + "tokens_per_second_per_gpu": 369.02 + }, + { + "epoch": 0.34199930407118356, + "grad_norm": 0.3501751720905304, + "learning_rate": 0.00018658029776771152, + "loss": 0.2231, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 3440, + "tokens_per_second_per_gpu": 358.64 + }, + { + "epoch": 0.3429934880946463, + "grad_norm": 0.4123583137989044, + "learning_rate": 0.0001865016576199527, + "loss": 0.2456, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 3450, + "tokens_per_second_per_gpu": 399.31 + }, + { + "epoch": 0.34398767211810904, + "grad_norm": 0.4507691264152527, + "learning_rate": 0.00018642280440032863, + "loss": 0.2716, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 3460, + "tokens_per_second_per_gpu": 384.1 + }, + { + "epoch": 0.3449818561415718, + "grad_norm": 0.43500733375549316, + "learning_rate": 0.00018634373830307146, + "loss": 0.2352, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 3470, + "tokens_per_second_per_gpu": 319.67 + }, + { + "epoch": 0.3459760401650345, + "grad_norm": 0.40590760111808777, + "learning_rate": 0.00018626445952293766, + "loss": 0.2623, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 3480, + "tokens_per_second_per_gpu": 335.37 + }, + { + "epoch": 0.34697022418849727, + "grad_norm": 0.2494644969701767, + "learning_rate": 0.00018618496825520767, + "loss": 0.2245, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 3490, + "tokens_per_second_per_gpu": 347.95 + }, + { + "epoch": 0.34796440821196, + "grad_norm": 0.47100207209587097, + "learning_rate": 0.00018610526469568526, + "loss": 0.2775, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 3500, + "tokens_per_second_per_gpu": 393.01 + }, + { + "epoch": 0.3489585922354228, + "grad_norm": 0.5600543022155762, + "learning_rate": 0.00018602534904069712, + "loss": 0.3007, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.44, + "memory/max_allocated (GiB)": 20.44, + "step": 3510, + "tokens_per_second_per_gpu": 399.51 + }, + { + "epoch": 0.34995277625888555, + "grad_norm": 0.25791943073272705, + "learning_rate": 0.00018594522148709244, + "loss": 0.2134, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.84, + "memory/max_allocated (GiB)": 18.84, + "step": 3520, + "tokens_per_second_per_gpu": 341.75 + }, + { + "epoch": 0.3509469602823483, + "grad_norm": 0.2849276661872864, + "learning_rate": 0.00018586488223224228, + "loss": 0.1919, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.44, + "memory/max_allocated (GiB)": 20.44, + "step": 3530, + "tokens_per_second_per_gpu": 369.12 + }, + { + "epoch": 0.351941144305811, + "grad_norm": 0.32681041955947876, + "learning_rate": 0.00018578433147403925, + "loss": 0.2192, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 3540, + "tokens_per_second_per_gpu": 309.14 + }, + { + "epoch": 0.35293532832927377, + "grad_norm": 0.5691526532173157, + "learning_rate": 0.00018570356941089686, + "loss": 0.2775, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 3550, + "tokens_per_second_per_gpu": 357.84 + }, + { + "epoch": 0.3539295123527365, + "grad_norm": 0.35383832454681396, + "learning_rate": 0.00018562259624174915, + "loss": 0.2285, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 3560, + "tokens_per_second_per_gpu": 341.0 + }, + { + "epoch": 0.35492369637619925, + "grad_norm": 0.4128180742263794, + "learning_rate": 0.00018554141216605016, + "loss": 0.2216, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 3570, + "tokens_per_second_per_gpu": 310.04 + }, + { + "epoch": 0.355917880399662, + "grad_norm": 0.4811583459377289, + "learning_rate": 0.00018546001738377338, + "loss": 0.3354, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 3580, + "tokens_per_second_per_gpu": 446.97 + }, + { + "epoch": 0.35691206442312473, + "grad_norm": 0.4148445725440979, + "learning_rate": 0.0001853784120954114, + "loss": 0.216, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 3590, + "tokens_per_second_per_gpu": 327.38 + }, + { + "epoch": 0.35790624844658747, + "grad_norm": 0.5083706378936768, + "learning_rate": 0.0001852965965019753, + "loss": 0.2762, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 3600, + "tokens_per_second_per_gpu": 371.89 + }, + { + "epoch": 0.3589004324700502, + "grad_norm": 0.4946528673171997, + "learning_rate": 0.00018521457080499418, + "loss": 0.2455, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.77, + "memory/max_allocated (GiB)": 19.77, + "step": 3610, + "tokens_per_second_per_gpu": 361.51 + }, + { + "epoch": 0.35989461649351295, + "grad_norm": 0.4880548417568207, + "learning_rate": 0.00018513233520651466, + "loss": 0.2299, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 3620, + "tokens_per_second_per_gpu": 310.24 + }, + { + "epoch": 0.3608888005169757, + "grad_norm": 0.18661662936210632, + "learning_rate": 0.00018504988990910036, + "loss": 0.2325, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 3630, + "tokens_per_second_per_gpu": 360.45 + }, + { + "epoch": 0.36188298454043843, + "grad_norm": 0.49652183055877686, + "learning_rate": 0.00018496723511583153, + "loss": 0.2312, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 3640, + "tokens_per_second_per_gpu": 347.3 + }, + { + "epoch": 0.3628771685639012, + "grad_norm": 0.35343873500823975, + "learning_rate": 0.0001848843710303044, + "loss": 0.154, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 3650, + "tokens_per_second_per_gpu": 334.16 + }, + { + "epoch": 0.3638713525873639, + "grad_norm": 0.5269297361373901, + "learning_rate": 0.0001848012978566307, + "loss": 0.2677, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 3660, + "tokens_per_second_per_gpu": 343.01 + }, + { + "epoch": 0.36486553661082666, + "grad_norm": 0.4809168875217438, + "learning_rate": 0.00018471801579943717, + "loss": 0.3083, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 3670, + "tokens_per_second_per_gpu": 359.61 + }, + { + "epoch": 0.3658597206342894, + "grad_norm": 0.4312402904033661, + "learning_rate": 0.0001846345250638652, + "loss": 0.2711, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 3680, + "tokens_per_second_per_gpu": 343.63 + }, + { + "epoch": 0.36685390465775214, + "grad_norm": 0.44654685258865356, + "learning_rate": 0.0001845508258555701, + "loss": 0.2629, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 3690, + "tokens_per_second_per_gpu": 380.56 + }, + { + "epoch": 0.3678480886812149, + "grad_norm": 0.19989164173603058, + "learning_rate": 0.00018446691838072067, + "loss": 0.2451, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 3700, + "tokens_per_second_per_gpu": 316.7 + }, + { + "epoch": 0.3688422727046776, + "grad_norm": 0.268655925989151, + "learning_rate": 0.00018438280284599877, + "loss": 0.2172, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 3710, + "tokens_per_second_per_gpu": 270.99 + }, + { + "epoch": 0.36983645672814036, + "grad_norm": 0.3657344579696655, + "learning_rate": 0.00018429847945859872, + "loss": 0.2505, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 3720, + "tokens_per_second_per_gpu": 319.93 + }, + { + "epoch": 0.3708306407516031, + "grad_norm": 0.40177711844444275, + "learning_rate": 0.00018421394842622695, + "loss": 0.2462, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 3730, + "tokens_per_second_per_gpu": 344.24 + }, + { + "epoch": 0.37182482477506584, + "grad_norm": 0.48767533898353577, + "learning_rate": 0.00018412920995710113, + "loss": 0.2827, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 3740, + "tokens_per_second_per_gpu": 380.45 + }, + { + "epoch": 0.3728190087985286, + "grad_norm": 0.45828619599342346, + "learning_rate": 0.00018404426425995007, + "loss": 0.2355, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 3750, + "tokens_per_second_per_gpu": 388.78 + }, + { + "epoch": 0.3738131928219913, + "grad_norm": 0.49931567907333374, + "learning_rate": 0.000183959111544013, + "loss": 0.2813, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 3760, + "tokens_per_second_per_gpu": 361.55 + }, + { + "epoch": 0.3748073768454541, + "grad_norm": 0.3232674300670624, + "learning_rate": 0.00018387375201903903, + "loss": 0.2488, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 3770, + "tokens_per_second_per_gpu": 326.01 + }, + { + "epoch": 0.37580156086891686, + "grad_norm": 0.41870149970054626, + "learning_rate": 0.0001837881858952867, + "loss": 0.3117, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 3780, + "tokens_per_second_per_gpu": 356.89 + }, + { + "epoch": 0.3767957448923796, + "grad_norm": 0.396383672952652, + "learning_rate": 0.00018370241338352348, + "loss": 0.3046, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 3790, + "tokens_per_second_per_gpu": 397.36 + }, + { + "epoch": 0.37778992891584234, + "grad_norm": 0.33363988995552063, + "learning_rate": 0.00018361643469502517, + "loss": 0.2074, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 3800, + "tokens_per_second_per_gpu": 320.36 + }, + { + "epoch": 0.3787841129393051, + "grad_norm": 0.34591570496559143, + "learning_rate": 0.00018353025004157552, + "loss": 0.2449, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 3810, + "tokens_per_second_per_gpu": 370.18 + }, + { + "epoch": 0.3797782969627678, + "grad_norm": 0.4369080066680908, + "learning_rate": 0.00018344385963546547, + "loss": 0.2017, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 3820, + "tokens_per_second_per_gpu": 346.43 + }, + { + "epoch": 0.38077248098623057, + "grad_norm": 0.4190782308578491, + "learning_rate": 0.00018335726368949286, + "loss": 0.2987, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 3830, + "tokens_per_second_per_gpu": 381.05 + }, + { + "epoch": 0.3817666650096933, + "grad_norm": 0.4989373981952667, + "learning_rate": 0.00018327046241696184, + "loss": 0.2992, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.77, + "memory/max_allocated (GiB)": 19.77, + "step": 3840, + "tokens_per_second_per_gpu": 363.52 + }, + { + "epoch": 0.38276084903315605, + "grad_norm": 0.3604322671890259, + "learning_rate": 0.00018318345603168226, + "loss": 0.2311, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 3850, + "tokens_per_second_per_gpu": 286.25 + }, + { + "epoch": 0.3837550330566188, + "grad_norm": 0.303365021944046, + "learning_rate": 0.00018309624474796926, + "loss": 0.1952, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 3860, + "tokens_per_second_per_gpu": 381.31 + }, + { + "epoch": 0.38474921708008153, + "grad_norm": 0.5674360990524292, + "learning_rate": 0.00018300882878064266, + "loss": 0.2694, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.82, + "memory/max_allocated (GiB)": 18.82, + "step": 3870, + "tokens_per_second_per_gpu": 346.22 + }, + { + "epoch": 0.38574340110354427, + "grad_norm": 0.4402889609336853, + "learning_rate": 0.00018292120834502643, + "loss": 0.2825, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 3880, + "tokens_per_second_per_gpu": 339.23 + }, + { + "epoch": 0.386737585127007, + "grad_norm": 0.3922783136367798, + "learning_rate": 0.00018283338365694825, + "loss": 0.2294, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 3890, + "tokens_per_second_per_gpu": 289.83 + }, + { + "epoch": 0.38773176915046975, + "grad_norm": 0.6003592014312744, + "learning_rate": 0.00018274535493273893, + "loss": 0.2244, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 3900, + "tokens_per_second_per_gpu": 350.12 + }, + { + "epoch": 0.3887259531739325, + "grad_norm": 0.436212956905365, + "learning_rate": 0.00018265712238923175, + "loss": 0.2341, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 3910, + "tokens_per_second_per_gpu": 343.17 + }, + { + "epoch": 0.38972013719739523, + "grad_norm": 0.2501852810382843, + "learning_rate": 0.00018256868624376215, + "loss": 0.2647, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 3920, + "tokens_per_second_per_gpu": 396.07 + }, + { + "epoch": 0.390714321220858, + "grad_norm": 0.36171767115592957, + "learning_rate": 0.00018248004671416704, + "loss": 0.2664, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.69, + "memory/max_allocated (GiB)": 19.69, + "step": 3930, + "tokens_per_second_per_gpu": 426.89 + }, + { + "epoch": 0.3917085052443207, + "grad_norm": 0.47077932953834534, + "learning_rate": 0.00018239120401878432, + "loss": 0.3584, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 3940, + "tokens_per_second_per_gpu": 452.63 + }, + { + "epoch": 0.39270268926778346, + "grad_norm": 0.413924902677536, + "learning_rate": 0.00018230215837645232, + "loss": 0.2715, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 3950, + "tokens_per_second_per_gpu": 424.3 + }, + { + "epoch": 0.3936968732912462, + "grad_norm": 0.40877413749694824, + "learning_rate": 0.00018221291000650928, + "loss": 0.2855, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 3960, + "tokens_per_second_per_gpu": 406.29 + }, + { + "epoch": 0.39469105731470894, + "grad_norm": 0.4080711007118225, + "learning_rate": 0.0001821234591287928, + "loss": 0.1893, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 3970, + "tokens_per_second_per_gpu": 347.04 + }, + { + "epoch": 0.3956852413381717, + "grad_norm": 0.2622958719730377, + "learning_rate": 0.00018203380596363932, + "loss": 0.2328, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 3980, + "tokens_per_second_per_gpu": 323.13 + }, + { + "epoch": 0.3966794253616344, + "grad_norm": 0.32758989930152893, + "learning_rate": 0.0001819439507318835, + "loss": 0.196, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.77, + "memory/max_allocated (GiB)": 19.77, + "step": 3990, + "tokens_per_second_per_gpu": 255.43 + }, + { + "epoch": 0.39767360938509716, + "grad_norm": 0.4135094881057739, + "learning_rate": 0.00018185389365485774, + "loss": 0.2874, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 4000, + "tokens_per_second_per_gpu": 352.42 + }, + { + "epoch": 0.3986677934085599, + "grad_norm": 0.4753275215625763, + "learning_rate": 0.00018176363495439173, + "loss": 0.2796, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 4010, + "tokens_per_second_per_gpu": 300.74 + }, + { + "epoch": 0.39966197743202264, + "grad_norm": 0.41000860929489136, + "learning_rate": 0.00018167317485281168, + "loss": 0.3278, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 4020, + "tokens_per_second_per_gpu": 381.42 + }, + { + "epoch": 0.40065616145548544, + "grad_norm": 0.3132636845111847, + "learning_rate": 0.00018158251357293996, + "loss": 0.2514, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 4030, + "tokens_per_second_per_gpu": 340.93 + }, + { + "epoch": 0.4016503454789482, + "grad_norm": 0.3330332338809967, + "learning_rate": 0.00018149165133809442, + "loss": 0.219, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 4040, + "tokens_per_second_per_gpu": 331.95 + }, + { + "epoch": 0.4026445295024109, + "grad_norm": 0.5818430781364441, + "learning_rate": 0.000181400588372088, + "loss": 0.3451, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 4050, + "tokens_per_second_per_gpu": 433.43 + }, + { + "epoch": 0.40363871352587366, + "grad_norm": 0.4116646945476532, + "learning_rate": 0.00018130932489922804, + "loss": 0.1907, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 4060, + "tokens_per_second_per_gpu": 326.32 + }, + { + "epoch": 0.4046328975493364, + "grad_norm": 0.31805434823036194, + "learning_rate": 0.0001812178611443157, + "loss": 0.18, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 4070, + "tokens_per_second_per_gpu": 284.15 + }, + { + "epoch": 0.40562708157279914, + "grad_norm": 0.5397758483886719, + "learning_rate": 0.0001811261973326456, + "loss": 0.2596, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 4080, + "tokens_per_second_per_gpu": 369.06 + }, + { + "epoch": 0.4066212655962619, + "grad_norm": 0.4468703866004944, + "learning_rate": 0.00018103433369000502, + "loss": 0.2464, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 4090, + "tokens_per_second_per_gpu": 343.56 + }, + { + "epoch": 0.4076154496197246, + "grad_norm": 0.3228696286678314, + "learning_rate": 0.0001809422704426736, + "loss": 0.2, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 4100, + "tokens_per_second_per_gpu": 301.0 + }, + { + "epoch": 0.40860963364318736, + "grad_norm": 0.5811059474945068, + "learning_rate": 0.00018085000781742252, + "loss": 0.2642, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 4110, + "tokens_per_second_per_gpu": 366.31 + }, + { + "epoch": 0.4096038176666501, + "grad_norm": 0.6858221292495728, + "learning_rate": 0.00018075754604151415, + "loss": 0.2658, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.68, + "memory/max_allocated (GiB)": 19.68, + "step": 4120, + "tokens_per_second_per_gpu": 340.12 + }, + { + "epoch": 0.41059800169011285, + "grad_norm": 0.3095184862613678, + "learning_rate": 0.00018066488534270142, + "loss": 0.2542, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 4130, + "tokens_per_second_per_gpu": 399.71 + }, + { + "epoch": 0.4115921857135756, + "grad_norm": 0.5910835266113281, + "learning_rate": 0.0001805720259492271, + "loss": 0.289, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 4140, + "tokens_per_second_per_gpu": 366.05 + }, + { + "epoch": 0.4125863697370383, + "grad_norm": 0.3704458773136139, + "learning_rate": 0.00018047896808982364, + "loss": 0.2581, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 4150, + "tokens_per_second_per_gpu": 348.57 + }, + { + "epoch": 0.41358055376050107, + "grad_norm": 0.4034087359905243, + "learning_rate": 0.00018038571199371215, + "loss": 0.2207, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 4160, + "tokens_per_second_per_gpu": 323.67 + }, + { + "epoch": 0.4145747377839638, + "grad_norm": 0.5003114342689514, + "learning_rate": 0.0001802922578906021, + "loss": 0.2927, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 4170, + "tokens_per_second_per_gpu": 325.75 + }, + { + "epoch": 0.41556892180742655, + "grad_norm": 0.29613539576530457, + "learning_rate": 0.0001801986060106907, + "loss": 0.2906, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 4180, + "tokens_per_second_per_gpu": 335.06 + }, + { + "epoch": 0.4165631058308893, + "grad_norm": 0.49740076065063477, + "learning_rate": 0.00018010475658466235, + "loss": 0.2853, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 4190, + "tokens_per_second_per_gpu": 420.46 + }, + { + "epoch": 0.41755728985435203, + "grad_norm": 0.32316842675209045, + "learning_rate": 0.000180010709843688, + "loss": 0.1658, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 4200, + "tokens_per_second_per_gpu": 288.98 + }, + { + "epoch": 0.4185514738778148, + "grad_norm": 0.29272300004959106, + "learning_rate": 0.00017991646601942467, + "loss": 0.2719, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.23, + "memory/max_allocated (GiB)": 19.23, + "step": 4210, + "tokens_per_second_per_gpu": 364.92 + }, + { + "epoch": 0.4195456579012775, + "grad_norm": 0.5684819221496582, + "learning_rate": 0.0001798220253440148, + "loss": 0.2755, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 4220, + "tokens_per_second_per_gpu": 317.91 + }, + { + "epoch": 0.42053984192474025, + "grad_norm": 0.4488314986228943, + "learning_rate": 0.00017972738805008574, + "loss": 0.2131, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 4230, + "tokens_per_second_per_gpu": 328.07 + }, + { + "epoch": 0.421534025948203, + "grad_norm": 0.3249848484992981, + "learning_rate": 0.0001796325543707491, + "loss": 0.2987, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 4240, + "tokens_per_second_per_gpu": 421.46 + }, + { + "epoch": 0.42252820997166574, + "grad_norm": 0.6481621265411377, + "learning_rate": 0.00017953752453960038, + "loss": 0.2498, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.43, + "memory/max_allocated (GiB)": 20.43, + "step": 4250, + "tokens_per_second_per_gpu": 363.83 + }, + { + "epoch": 0.4235223939951285, + "grad_norm": 0.3045104146003723, + "learning_rate": 0.00017944229879071806, + "loss": 0.2295, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 4260, + "tokens_per_second_per_gpu": 398.66 + }, + { + "epoch": 0.4245165780185912, + "grad_norm": 0.32762956619262695, + "learning_rate": 0.0001793468773586633, + "loss": 0.2406, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 4270, + "tokens_per_second_per_gpu": 399.72 + }, + { + "epoch": 0.42551076204205396, + "grad_norm": 0.6278170347213745, + "learning_rate": 0.00017925126047847924, + "loss": 0.2523, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 4280, + "tokens_per_second_per_gpu": 324.65 + }, + { + "epoch": 0.42650494606551675, + "grad_norm": 0.45905986428260803, + "learning_rate": 0.00017915544838569052, + "loss": 0.2615, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 4290, + "tokens_per_second_per_gpu": 353.66 + }, + { + "epoch": 0.4274991300889795, + "grad_norm": 0.48581770062446594, + "learning_rate": 0.00017905944131630253, + "loss": 0.2519, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 4300, + "tokens_per_second_per_gpu": 321.73 + }, + { + "epoch": 0.42849331411244224, + "grad_norm": 0.49877023696899414, + "learning_rate": 0.00017896323950680098, + "loss": 0.2382, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 4310, + "tokens_per_second_per_gpu": 353.68 + }, + { + "epoch": 0.429487498135905, + "grad_norm": 0.580008327960968, + "learning_rate": 0.00017886684319415127, + "loss": 0.2478, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 4320, + "tokens_per_second_per_gpu": 368.72 + }, + { + "epoch": 0.4304816821593677, + "grad_norm": 0.3998342454433441, + "learning_rate": 0.00017877025261579788, + "loss": 0.2202, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 4330, + "tokens_per_second_per_gpu": 395.43 + }, + { + "epoch": 0.43147586618283046, + "grad_norm": 0.38088393211364746, + "learning_rate": 0.00017867346800966383, + "loss": 0.2521, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 4340, + "tokens_per_second_per_gpu": 427.5 + }, + { + "epoch": 0.4324700502062932, + "grad_norm": 0.3859814703464508, + "learning_rate": 0.00017857648961415004, + "loss": 0.2353, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.39, + "memory/max_allocated (GiB)": 21.39, + "step": 4350, + "tokens_per_second_per_gpu": 376.77 + }, + { + "epoch": 0.43346423422975594, + "grad_norm": 0.34923896193504333, + "learning_rate": 0.00017847931766813482, + "loss": 0.2567, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 4360, + "tokens_per_second_per_gpu": 368.67 + }, + { + "epoch": 0.4344584182532187, + "grad_norm": 0.34750089049339294, + "learning_rate": 0.0001783819524109732, + "loss": 0.2291, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 4370, + "tokens_per_second_per_gpu": 379.66 + }, + { + "epoch": 0.4354526022766814, + "grad_norm": 0.36663779616355896, + "learning_rate": 0.0001782843940824964, + "loss": 0.2866, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 4380, + "tokens_per_second_per_gpu": 411.45 + }, + { + "epoch": 0.43644678630014416, + "grad_norm": 0.3929060697555542, + "learning_rate": 0.00017818664292301118, + "loss": 0.2563, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 4390, + "tokens_per_second_per_gpu": 381.12 + }, + { + "epoch": 0.4374409703236069, + "grad_norm": 0.4182446599006653, + "learning_rate": 0.0001780886991732993, + "loss": 0.2268, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 4400, + "tokens_per_second_per_gpu": 297.67 + }, + { + "epoch": 0.43843515434706964, + "grad_norm": 0.5998858213424683, + "learning_rate": 0.00017799056307461696, + "loss": 0.2629, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 4410, + "tokens_per_second_per_gpu": 344.26 + }, + { + "epoch": 0.4394293383705324, + "grad_norm": 0.4282694160938263, + "learning_rate": 0.0001778922348686941, + "loss": 0.2523, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.44, + "memory/max_allocated (GiB)": 20.44, + "step": 4420, + "tokens_per_second_per_gpu": 328.11 + }, + { + "epoch": 0.4404235223939951, + "grad_norm": 0.5925072431564331, + "learning_rate": 0.00017779371479773382, + "loss": 0.27, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 4430, + "tokens_per_second_per_gpu": 336.68 + }, + { + "epoch": 0.44141770641745787, + "grad_norm": 0.5149052739143372, + "learning_rate": 0.00017769500310441192, + "loss": 0.3033, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.39, + "memory/max_allocated (GiB)": 21.39, + "step": 4440, + "tokens_per_second_per_gpu": 370.47 + }, + { + "epoch": 0.4424118904409206, + "grad_norm": 0.418197363615036, + "learning_rate": 0.00017759610003187617, + "loss": 0.2193, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 4450, + "tokens_per_second_per_gpu": 352.55 + }, + { + "epoch": 0.44340607446438335, + "grad_norm": 0.4415562152862549, + "learning_rate": 0.00017749700582374574, + "loss": 0.1978, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 4460, + "tokens_per_second_per_gpu": 343.08 + }, + { + "epoch": 0.4444002584878461, + "grad_norm": 0.32262691855430603, + "learning_rate": 0.0001773977207241106, + "loss": 0.2448, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 4470, + "tokens_per_second_per_gpu": 319.4 + }, + { + "epoch": 0.44539444251130883, + "grad_norm": 0.49002590775489807, + "learning_rate": 0.00017729824497753093, + "loss": 0.2772, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 4480, + "tokens_per_second_per_gpu": 331.65 + }, + { + "epoch": 0.44638862653477157, + "grad_norm": 0.4270131587982178, + "learning_rate": 0.0001771985788290365, + "loss": 0.2557, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 4490, + "tokens_per_second_per_gpu": 300.31 + }, + { + "epoch": 0.4473828105582343, + "grad_norm": 0.5524002909660339, + "learning_rate": 0.00017709872252412616, + "loss": 0.2696, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 4500, + "tokens_per_second_per_gpu": 348.17 + }, + { + "epoch": 0.44837699458169705, + "grad_norm": 0.32532012462615967, + "learning_rate": 0.00017699867630876703, + "loss": 0.1997, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 4510, + "tokens_per_second_per_gpu": 315.46 + }, + { + "epoch": 0.4493711786051598, + "grad_norm": 0.49136292934417725, + "learning_rate": 0.0001768984404293941, + "loss": 0.2824, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 4520, + "tokens_per_second_per_gpu": 391.14 + }, + { + "epoch": 0.45036536262862253, + "grad_norm": 0.43822959065437317, + "learning_rate": 0.00017679801513290956, + "loss": 0.2931, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 4530, + "tokens_per_second_per_gpu": 382.92 + }, + { + "epoch": 0.4513595466520853, + "grad_norm": 0.448585569858551, + "learning_rate": 0.00017669740066668214, + "loss": 0.2444, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 4540, + "tokens_per_second_per_gpu": 364.48 + }, + { + "epoch": 0.45235373067554807, + "grad_norm": 0.3125511407852173, + "learning_rate": 0.0001765965972785465, + "loss": 0.2227, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 4550, + "tokens_per_second_per_gpu": 360.51 + }, + { + "epoch": 0.4533479146990108, + "grad_norm": 0.6492682695388794, + "learning_rate": 0.00017649560521680266, + "loss": 0.3157, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 4560, + "tokens_per_second_per_gpu": 361.61 + }, + { + "epoch": 0.45434209872247355, + "grad_norm": 0.23529411852359772, + "learning_rate": 0.0001763944247302155, + "loss": 0.2644, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 4570, + "tokens_per_second_per_gpu": 333.05 + }, + { + "epoch": 0.4553362827459363, + "grad_norm": 0.37334245443344116, + "learning_rate": 0.00017629305606801387, + "loss": 0.1995, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 4580, + "tokens_per_second_per_gpu": 380.12 + }, + { + "epoch": 0.45633046676939903, + "grad_norm": 0.26320332288742065, + "learning_rate": 0.00017619149947989028, + "loss": 0.201, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 4590, + "tokens_per_second_per_gpu": 333.8 + }, + { + "epoch": 0.4573246507928618, + "grad_norm": 0.4711815416812897, + "learning_rate": 0.000176089755216, + "loss": 0.3037, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 4600, + "tokens_per_second_per_gpu": 398.88 + }, + { + "epoch": 0.4583188348163245, + "grad_norm": 0.3597434461116791, + "learning_rate": 0.0001759878235269607, + "loss": 0.2393, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 4610, + "tokens_per_second_per_gpu": 340.42 + }, + { + "epoch": 0.45931301883978726, + "grad_norm": 0.5157446265220642, + "learning_rate": 0.00017588570466385166, + "loss": 0.314, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 4620, + "tokens_per_second_per_gpu": 374.07 + }, + { + "epoch": 0.46030720286325, + "grad_norm": 0.4747403562068939, + "learning_rate": 0.0001757833988782132, + "loss": 0.2606, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.47, + "memory/max_allocated (GiB)": 21.47, + "step": 4630, + "tokens_per_second_per_gpu": 333.75 + }, + { + "epoch": 0.46130138688671274, + "grad_norm": 0.45278453826904297, + "learning_rate": 0.00017568090642204612, + "loss": 0.2106, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 4640, + "tokens_per_second_per_gpu": 352.11 + }, + { + "epoch": 0.4622955709101755, + "grad_norm": 0.7563058137893677, + "learning_rate": 0.00017557822754781102, + "loss": 0.2457, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 4650, + "tokens_per_second_per_gpu": 370.21 + }, + { + "epoch": 0.4632897549336382, + "grad_norm": 0.27436432242393494, + "learning_rate": 0.00017547536250842765, + "loss": 0.2659, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 4660, + "tokens_per_second_per_gpu": 347.64 + }, + { + "epoch": 0.46428393895710096, + "grad_norm": 0.3469400107860565, + "learning_rate": 0.00017537231155727428, + "loss": 0.2744, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 4670, + "tokens_per_second_per_gpu": 354.35 + }, + { + "epoch": 0.4652781229805637, + "grad_norm": 0.29021984338760376, + "learning_rate": 0.0001752690749481873, + "loss": 0.236, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 4680, + "tokens_per_second_per_gpu": 357.93 + }, + { + "epoch": 0.46627230700402644, + "grad_norm": 0.28982868790626526, + "learning_rate": 0.00017516565293546025, + "loss": 0.2694, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 4690, + "tokens_per_second_per_gpu": 433.34 + }, + { + "epoch": 0.4672664910274892, + "grad_norm": 0.39744624495506287, + "learning_rate": 0.00017506204577384337, + "loss": 0.2209, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 4700, + "tokens_per_second_per_gpu": 380.58 + }, + { + "epoch": 0.4682606750509519, + "grad_norm": 0.14510154724121094, + "learning_rate": 0.00017495825371854302, + "loss": 0.2147, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 4710, + "tokens_per_second_per_gpu": 342.34 + }, + { + "epoch": 0.46925485907441467, + "grad_norm": 0.26011091470718384, + "learning_rate": 0.000174854277025221, + "loss": 0.2411, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 4720, + "tokens_per_second_per_gpu": 395.32 + }, + { + "epoch": 0.4702490430978774, + "grad_norm": 0.5020186901092529, + "learning_rate": 0.00017475011594999385, + "loss": 0.2466, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 4730, + "tokens_per_second_per_gpu": 385.96 + }, + { + "epoch": 0.47124322712134015, + "grad_norm": 0.5160934925079346, + "learning_rate": 0.0001746457707494323, + "loss": 0.3094, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.17, + "memory/max_allocated (GiB)": 18.17, + "step": 4740, + "tokens_per_second_per_gpu": 343.47 + }, + { + "epoch": 0.4722374111448029, + "grad_norm": 0.43567317724227905, + "learning_rate": 0.00017454124168056066, + "loss": 0.2324, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 4750, + "tokens_per_second_per_gpu": 346.39 + }, + { + "epoch": 0.47323159516826563, + "grad_norm": 0.39488813281059265, + "learning_rate": 0.0001744365290008561, + "loss": 0.1983, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 4760, + "tokens_per_second_per_gpu": 363.53 + }, + { + "epoch": 0.47422577919172837, + "grad_norm": 0.2595832943916321, + "learning_rate": 0.00017433163296824808, + "loss": 0.2783, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 4770, + "tokens_per_second_per_gpu": 431.69 + }, + { + "epoch": 0.4752199632151911, + "grad_norm": 0.3657257556915283, + "learning_rate": 0.00017422655384111772, + "loss": 0.2223, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 4780, + "tokens_per_second_per_gpu": 333.82 + }, + { + "epoch": 0.47621414723865385, + "grad_norm": 0.3269219994544983, + "learning_rate": 0.00017412129187829712, + "loss": 0.2042, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.53, + "memory/max_allocated (GiB)": 20.53, + "step": 4790, + "tokens_per_second_per_gpu": 356.0 + }, + { + "epoch": 0.4772083312621166, + "grad_norm": 0.35581299662590027, + "learning_rate": 0.00017401584733906872, + "loss": 0.216, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 4800, + "tokens_per_second_per_gpu": 306.32 + }, + { + "epoch": 0.4782025152855794, + "grad_norm": 0.5693733096122742, + "learning_rate": 0.00017391022048316476, + "loss": 0.3306, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.19, + "memory/max_allocated (GiB)": 19.19, + "step": 4810, + "tokens_per_second_per_gpu": 359.5 + }, + { + "epoch": 0.47919669930904213, + "grad_norm": 0.33154231309890747, + "learning_rate": 0.00017380441157076643, + "loss": 0.2469, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 4820, + "tokens_per_second_per_gpu": 363.91 + }, + { + "epoch": 0.48019088333250487, + "grad_norm": 0.417501837015152, + "learning_rate": 0.00017369842086250347, + "loss": 0.2286, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 4830, + "tokens_per_second_per_gpu": 261.89 + }, + { + "epoch": 0.4811850673559676, + "grad_norm": 0.2794663608074188, + "learning_rate": 0.00017359224861945345, + "loss": 0.2415, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 4840, + "tokens_per_second_per_gpu": 350.79 + }, + { + "epoch": 0.48217925137943035, + "grad_norm": 0.31123244762420654, + "learning_rate": 0.00017348589510314096, + "loss": 0.2396, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 4850, + "tokens_per_second_per_gpu": 380.51 + }, + { + "epoch": 0.4831734354028931, + "grad_norm": 0.21615763008594513, + "learning_rate": 0.00017337936057553726, + "loss": 0.2286, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 4860, + "tokens_per_second_per_gpu": 430.28 + }, + { + "epoch": 0.48416761942635583, + "grad_norm": 0.38201475143432617, + "learning_rate": 0.0001732726452990594, + "loss": 0.2213, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 4870, + "tokens_per_second_per_gpu": 337.65 + }, + { + "epoch": 0.4851618034498186, + "grad_norm": 0.4545513987541199, + "learning_rate": 0.00017316574953656958, + "loss": 0.2696, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 4880, + "tokens_per_second_per_gpu": 388.83 + }, + { + "epoch": 0.4861559874732813, + "grad_norm": 0.2672022879123688, + "learning_rate": 0.00017305867355137475, + "loss": 0.1962, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 4890, + "tokens_per_second_per_gpu": 318.92 + }, + { + "epoch": 0.48715017149674406, + "grad_norm": 0.4752904772758484, + "learning_rate": 0.00017295141760722567, + "loss": 0.2107, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 4900, + "tokens_per_second_per_gpu": 361.56 + }, + { + "epoch": 0.4881443555202068, + "grad_norm": 0.4459490180015564, + "learning_rate": 0.0001728439819683164, + "loss": 0.2648, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.39, + "memory/max_allocated (GiB)": 21.39, + "step": 4910, + "tokens_per_second_per_gpu": 302.7 + }, + { + "epoch": 0.48913853954366954, + "grad_norm": 0.6155937314033508, + "learning_rate": 0.00017273636689928357, + "loss": 0.2714, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 4920, + "tokens_per_second_per_gpu": 379.02 + }, + { + "epoch": 0.4901327235671323, + "grad_norm": 0.4557485282421112, + "learning_rate": 0.00017262857266520595, + "loss": 0.1966, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 4930, + "tokens_per_second_per_gpu": 369.72 + }, + { + "epoch": 0.491126907590595, + "grad_norm": 0.4636807143688202, + "learning_rate": 0.0001725205995316034, + "loss": 0.2803, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 4940, + "tokens_per_second_per_gpu": 373.68 + }, + { + "epoch": 0.49212109161405776, + "grad_norm": 0.45894595980644226, + "learning_rate": 0.00017241244776443666, + "loss": 0.3439, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.62, + "memory/max_allocated (GiB)": 20.62, + "step": 4950, + "tokens_per_second_per_gpu": 426.86 + }, + { + "epoch": 0.4931152756375205, + "grad_norm": 0.39921844005584717, + "learning_rate": 0.0001723041176301063, + "loss": 0.3035, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 4960, + "tokens_per_second_per_gpu": 375.09 + }, + { + "epoch": 0.49410945966098324, + "grad_norm": 0.28210845589637756, + "learning_rate": 0.00017219560939545246, + "loss": 0.2043, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 4970, + "tokens_per_second_per_gpu": 320.21 + }, + { + "epoch": 0.495103643684446, + "grad_norm": 0.5301778316497803, + "learning_rate": 0.00017208692332775375, + "loss": 0.2293, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 4980, + "tokens_per_second_per_gpu": 327.97 + }, + { + "epoch": 0.4960978277079087, + "grad_norm": 0.40421542525291443, + "learning_rate": 0.000171978059694727, + "loss": 0.3101, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 4990, + "tokens_per_second_per_gpu": 395.91 + }, + { + "epoch": 0.49709201173137146, + "grad_norm": 0.383989155292511, + "learning_rate": 0.0001718690187645263, + "loss": 0.2447, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 5000, + "tokens_per_second_per_gpu": 344.59 + }, + { + "epoch": 0.4980861957548342, + "grad_norm": 0.559518039226532, + "learning_rate": 0.00017175980080574247, + "loss": 0.3176, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.53, + "memory/max_allocated (GiB)": 20.53, + "step": 5010, + "tokens_per_second_per_gpu": 378.55 + }, + { + "epoch": 0.49908037977829695, + "grad_norm": 0.46650487184524536, + "learning_rate": 0.00017165040608740255, + "loss": 0.2006, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 5020, + "tokens_per_second_per_gpu": 334.42 + }, + { + "epoch": 0.5000745638017597, + "grad_norm": 0.3541397750377655, + "learning_rate": 0.00017154083487896872, + "loss": 0.2542, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 5030, + "tokens_per_second_per_gpu": 360.31 + }, + { + "epoch": 0.5010687478252225, + "grad_norm": 0.48084208369255066, + "learning_rate": 0.00017143108745033811, + "loss": 0.2133, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 5040, + "tokens_per_second_per_gpu": 281.78 + }, + { + "epoch": 0.5020629318486852, + "grad_norm": 0.28656521439552307, + "learning_rate": 0.0001713211640718418, + "loss": 0.246, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 5050, + "tokens_per_second_per_gpu": 334.53 + }, + { + "epoch": 0.503057115872148, + "grad_norm": 0.48624783754348755, + "learning_rate": 0.0001712110650142443, + "loss": 0.2497, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.65, + "memory/max_allocated (GiB)": 20.65, + "step": 5060, + "tokens_per_second_per_gpu": 397.02 + }, + { + "epoch": 0.5040512998956107, + "grad_norm": 0.3334798216819763, + "learning_rate": 0.00017110079054874288, + "loss": 0.2366, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 5070, + "tokens_per_second_per_gpu": 371.25 + }, + { + "epoch": 0.5050454839190734, + "grad_norm": 0.4432489275932312, + "learning_rate": 0.00017099034094696685, + "loss": 0.2104, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 5080, + "tokens_per_second_per_gpu": 384.5 + }, + { + "epoch": 0.5060396679425362, + "grad_norm": 0.5348425507545471, + "learning_rate": 0.00017087971648097693, + "loss": 0.2292, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 5090, + "tokens_per_second_per_gpu": 311.48 + }, + { + "epoch": 0.5070338519659989, + "grad_norm": 0.4587235748767853, + "learning_rate": 0.00017076891742326452, + "loss": 0.2297, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 5100, + "tokens_per_second_per_gpu": 371.52 + }, + { + "epoch": 0.5080280359894617, + "grad_norm": 0.3222121298313141, + "learning_rate": 0.00017065794404675112, + "loss": 0.2447, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 5110, + "tokens_per_second_per_gpu": 346.06 + }, + { + "epoch": 0.5090222200129244, + "grad_norm": 0.38898178935050964, + "learning_rate": 0.0001705467966247877, + "loss": 0.192, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 5120, + "tokens_per_second_per_gpu": 388.93 + }, + { + "epoch": 0.5100164040363871, + "grad_norm": 0.49159225821495056, + "learning_rate": 0.00017043547543115373, + "loss": 0.2604, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 5130, + "tokens_per_second_per_gpu": 348.77 + }, + { + "epoch": 0.5110105880598499, + "grad_norm": 0.34262073040008545, + "learning_rate": 0.0001703239807400569, + "loss": 0.1843, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 5140, + "tokens_per_second_per_gpu": 361.68 + }, + { + "epoch": 0.5120047720833126, + "grad_norm": 0.5016794800758362, + "learning_rate": 0.00017021231282613223, + "loss": 0.2527, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 5150, + "tokens_per_second_per_gpu": 385.36 + }, + { + "epoch": 0.5129989561067754, + "grad_norm": 0.44959282875061035, + "learning_rate": 0.00017010047196444137, + "loss": 0.2349, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 5160, + "tokens_per_second_per_gpu": 319.58 + }, + { + "epoch": 0.5139931401302381, + "grad_norm": 0.27737611532211304, + "learning_rate": 0.00016998845843047193, + "loss": 0.2564, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 5170, + "tokens_per_second_per_gpu": 325.49 + }, + { + "epoch": 0.5149873241537009, + "grad_norm": 0.5175918340682983, + "learning_rate": 0.00016987627250013702, + "loss": 0.2375, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 5180, + "tokens_per_second_per_gpu": 334.38 + }, + { + "epoch": 0.5159815081771636, + "grad_norm": 0.5541722178459167, + "learning_rate": 0.00016976391444977425, + "loss": 0.2478, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.81, + "memory/max_allocated (GiB)": 19.81, + "step": 5190, + "tokens_per_second_per_gpu": 316.46 + }, + { + "epoch": 0.5169756922006263, + "grad_norm": 1.0252865552902222, + "learning_rate": 0.00016965138455614525, + "loss": 0.2371, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 5200, + "tokens_per_second_per_gpu": 339.79 + }, + { + "epoch": 0.5179698762240891, + "grad_norm": 0.582073450088501, + "learning_rate": 0.00016953868309643491, + "loss": 0.2311, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 5210, + "tokens_per_second_per_gpu": 359.2 + }, + { + "epoch": 0.5189640602475518, + "grad_norm": 0.5147042870521545, + "learning_rate": 0.0001694258103482508, + "loss": 0.2693, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 5220, + "tokens_per_second_per_gpu": 379.55 + }, + { + "epoch": 0.5199582442710146, + "grad_norm": 0.6816319823265076, + "learning_rate": 0.0001693127665896223, + "loss": 0.2228, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 5230, + "tokens_per_second_per_gpu": 296.31 + }, + { + "epoch": 0.5209524282944773, + "grad_norm": 0.765450656414032, + "learning_rate": 0.00016919955209900012, + "loss": 0.2764, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 5240, + "tokens_per_second_per_gpu": 389.55 + }, + { + "epoch": 0.52194661231794, + "grad_norm": 0.3426934778690338, + "learning_rate": 0.00016908616715525544, + "loss": 0.3197, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 5250, + "tokens_per_second_per_gpu": 404.47 + }, + { + "epoch": 0.5229407963414028, + "grad_norm": 0.45074665546417236, + "learning_rate": 0.0001689726120376794, + "loss": 0.2624, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 5260, + "tokens_per_second_per_gpu": 362.3 + }, + { + "epoch": 0.5239349803648655, + "grad_norm": 0.4062357246875763, + "learning_rate": 0.00016885888702598218, + "loss": 0.2068, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.52, + "memory/max_allocated (GiB)": 20.52, + "step": 5270, + "tokens_per_second_per_gpu": 383.64 + }, + { + "epoch": 0.5249291643883283, + "grad_norm": 0.4385395050048828, + "learning_rate": 0.00016874499240029253, + "loss": 0.2886, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 5280, + "tokens_per_second_per_gpu": 332.1 + }, + { + "epoch": 0.525923348411791, + "grad_norm": 0.4379644989967346, + "learning_rate": 0.0001686309284411571, + "loss": 0.2305, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.08, + "memory/max_allocated (GiB)": 19.08, + "step": 5290, + "tokens_per_second_per_gpu": 328.75 + }, + { + "epoch": 0.5269175324352537, + "grad_norm": 0.5357609987258911, + "learning_rate": 0.00016851669542953935, + "loss": 0.2526, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 5300, + "tokens_per_second_per_gpu": 396.9 + }, + { + "epoch": 0.5279117164587165, + "grad_norm": 0.5790386199951172, + "learning_rate": 0.00016840229364681948, + "loss": 0.193, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 5310, + "tokens_per_second_per_gpu": 313.5 + }, + { + "epoch": 0.5289059004821792, + "grad_norm": 0.4063149690628052, + "learning_rate": 0.00016828772337479318, + "loss": 0.2071, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 5320, + "tokens_per_second_per_gpu": 290.65 + }, + { + "epoch": 0.529900084505642, + "grad_norm": 0.3840952515602112, + "learning_rate": 0.00016817298489567127, + "loss": 0.2086, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 5330, + "tokens_per_second_per_gpu": 372.36 + }, + { + "epoch": 0.5308942685291047, + "grad_norm": 0.36974212527275085, + "learning_rate": 0.0001680580784920789, + "loss": 0.2556, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 5340, + "tokens_per_second_per_gpu": 369.63 + }, + { + "epoch": 0.5318884525525674, + "grad_norm": 0.5495437979698181, + "learning_rate": 0.00016794300444705477, + "loss": 0.2667, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.77, + "memory/max_allocated (GiB)": 19.77, + "step": 5350, + "tokens_per_second_per_gpu": 327.36 + }, + { + "epoch": 0.5328826365760302, + "grad_norm": 0.5131349563598633, + "learning_rate": 0.0001678277630440506, + "loss": 0.2311, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 5360, + "tokens_per_second_per_gpu": 357.39 + }, + { + "epoch": 0.5338768205994929, + "grad_norm": 0.28221410512924194, + "learning_rate": 0.00016771235456693035, + "loss": 0.2532, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 5370, + "tokens_per_second_per_gpu": 385.44 + }, + { + "epoch": 0.5348710046229557, + "grad_norm": 0.5543202757835388, + "learning_rate": 0.0001675967792999695, + "loss": 0.2999, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.19, + "memory/max_allocated (GiB)": 19.19, + "step": 5380, + "tokens_per_second_per_gpu": 393.99 + }, + { + "epoch": 0.5358651886464184, + "grad_norm": 0.3077482283115387, + "learning_rate": 0.00016748103752785426, + "loss": 0.2071, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 5390, + "tokens_per_second_per_gpu": 372.33 + }, + { + "epoch": 0.5368593726698812, + "grad_norm": 0.6371411681175232, + "learning_rate": 0.00016736512953568117, + "loss": 0.1986, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.43, + "memory/max_allocated (GiB)": 20.43, + "step": 5400, + "tokens_per_second_per_gpu": 325.81 + }, + { + "epoch": 0.5378535566933439, + "grad_norm": 0.4577232003211975, + "learning_rate": 0.0001672490556089561, + "loss": 0.253, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 5410, + "tokens_per_second_per_gpu": 388.89 + }, + { + "epoch": 0.5388477407168066, + "grad_norm": 0.2831481099128723, + "learning_rate": 0.00016713281603359366, + "loss": 0.1994, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 5420, + "tokens_per_second_per_gpu": 373.61 + }, + { + "epoch": 0.5398419247402694, + "grad_norm": 0.45899850130081177, + "learning_rate": 0.00016701641109591648, + "loss": 0.1997, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 5430, + "tokens_per_second_per_gpu": 386.1 + }, + { + "epoch": 0.5408361087637321, + "grad_norm": 0.2898833453655243, + "learning_rate": 0.0001668998410826545, + "loss": 0.276, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 5440, + "tokens_per_second_per_gpu": 374.49 + }, + { + "epoch": 0.5418302927871949, + "grad_norm": 0.37996944785118103, + "learning_rate": 0.00016678310628094438, + "loss": 0.2529, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 5450, + "tokens_per_second_per_gpu": 359.7 + }, + { + "epoch": 0.5428244768106577, + "grad_norm": 0.255087673664093, + "learning_rate": 0.0001666662069783285, + "loss": 0.2682, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 5460, + "tokens_per_second_per_gpu": 346.66 + }, + { + "epoch": 0.5438186608341204, + "grad_norm": 0.4574085474014282, + "learning_rate": 0.00016654914346275466, + "loss": 0.2516, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 5470, + "tokens_per_second_per_gpu": 350.42 + }, + { + "epoch": 0.5448128448575832, + "grad_norm": 0.5208483934402466, + "learning_rate": 0.00016643191602257496, + "loss": 0.2549, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 5480, + "tokens_per_second_per_gpu": 320.05 + }, + { + "epoch": 0.5458070288810459, + "grad_norm": 0.31626808643341064, + "learning_rate": 0.00016631452494654541, + "loss": 0.2151, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 5490, + "tokens_per_second_per_gpu": 335.31 + }, + { + "epoch": 0.5468012129045087, + "grad_norm": 0.28286507725715637, + "learning_rate": 0.000166196970523825, + "loss": 0.2847, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 5500, + "tokens_per_second_per_gpu": 406.77 + }, + { + "epoch": 0.5477953969279714, + "grad_norm": 0.3647201955318451, + "learning_rate": 0.00016607925304397517, + "loss": 0.1912, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 5510, + "tokens_per_second_per_gpu": 334.31 + }, + { + "epoch": 0.5487895809514342, + "grad_norm": 0.48184719681739807, + "learning_rate": 0.0001659613727969589, + "loss": 0.2336, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 5520, + "tokens_per_second_per_gpu": 355.21 + }, + { + "epoch": 0.5497837649748969, + "grad_norm": 0.5385854244232178, + "learning_rate": 0.00016584333007314017, + "loss": 0.2764, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 5530, + "tokens_per_second_per_gpu": 384.64 + }, + { + "epoch": 0.5507779489983596, + "grad_norm": 0.2823385000228882, + "learning_rate": 0.00016572512516328317, + "loss": 0.3002, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.82, + "memory/max_allocated (GiB)": 18.82, + "step": 5540, + "tokens_per_second_per_gpu": 417.12 + }, + { + "epoch": 0.5517721330218224, + "grad_norm": 0.4133371114730835, + "learning_rate": 0.0001656067583585516, + "loss": 0.2066, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 5550, + "tokens_per_second_per_gpu": 314.99 + }, + { + "epoch": 0.5527663170452851, + "grad_norm": 0.2247128188610077, + "learning_rate": 0.00016548822995050787, + "loss": 0.2582, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 5560, + "tokens_per_second_per_gpu": 341.74 + }, + { + "epoch": 0.5537605010687479, + "grad_norm": 0.5088145136833191, + "learning_rate": 0.0001653695402311125, + "loss": 0.3169, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 5570, + "tokens_per_second_per_gpu": 385.48 + }, + { + "epoch": 0.5547546850922106, + "grad_norm": 0.600646436214447, + "learning_rate": 0.0001652506894927234, + "loss": 0.3391, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 5580, + "tokens_per_second_per_gpu": 338.0 + }, + { + "epoch": 0.5557488691156733, + "grad_norm": 0.34408292174339294, + "learning_rate": 0.00016513167802809502, + "loss": 0.2003, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 5590, + "tokens_per_second_per_gpu": 341.35 + }, + { + "epoch": 0.5567430531391361, + "grad_norm": 0.44201189279556274, + "learning_rate": 0.0001650125061303778, + "loss": 0.2494, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 5600, + "tokens_per_second_per_gpu": 386.01 + }, + { + "epoch": 0.5577372371625988, + "grad_norm": 0.5484967827796936, + "learning_rate": 0.00016489317409311717, + "loss": 0.2881, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 5610, + "tokens_per_second_per_gpu": 327.21 + }, + { + "epoch": 0.5587314211860616, + "grad_norm": 0.45674219727516174, + "learning_rate": 0.00016477368221025333, + "loss": 0.2377, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 5620, + "tokens_per_second_per_gpu": 365.25 + }, + { + "epoch": 0.5597256052095243, + "grad_norm": 0.3383237421512604, + "learning_rate": 0.00016465403077612001, + "loss": 0.2251, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 5630, + "tokens_per_second_per_gpu": 357.99 + }, + { + "epoch": 0.560719789232987, + "grad_norm": 0.3748398423194885, + "learning_rate": 0.00016453422008544388, + "loss": 0.2279, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 5640, + "tokens_per_second_per_gpu": 335.12 + }, + { + "epoch": 0.5617139732564498, + "grad_norm": 0.4504337012767792, + "learning_rate": 0.00016441425043334413, + "loss": 0.261, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.44, + "memory/max_allocated (GiB)": 20.44, + "step": 5650, + "tokens_per_second_per_gpu": 336.53 + }, + { + "epoch": 0.5627081572799125, + "grad_norm": 0.3367341458797455, + "learning_rate": 0.00016429412211533127, + "loss": 0.1855, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 5660, + "tokens_per_second_per_gpu": 347.1 + }, + { + "epoch": 0.5637023413033753, + "grad_norm": 0.3390282988548279, + "learning_rate": 0.00016417383542730675, + "loss": 0.2428, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 5670, + "tokens_per_second_per_gpu": 385.32 + }, + { + "epoch": 0.564696525326838, + "grad_norm": 0.6450189352035522, + "learning_rate": 0.00016405339066556212, + "loss": 0.3651, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 5680, + "tokens_per_second_per_gpu": 427.22 + }, + { + "epoch": 0.5656907093503007, + "grad_norm": 0.5655054450035095, + "learning_rate": 0.0001639327881267783, + "loss": 0.2204, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 5690, + "tokens_per_second_per_gpu": 310.84 + }, + { + "epoch": 0.5666848933737635, + "grad_norm": 0.4713475704193115, + "learning_rate": 0.00016381202810802483, + "loss": 0.2294, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.47, + "memory/max_allocated (GiB)": 21.47, + "step": 5700, + "tokens_per_second_per_gpu": 363.21 + }, + { + "epoch": 0.5676790773972262, + "grad_norm": 0.19377703964710236, + "learning_rate": 0.00016369111090675916, + "loss": 0.2522, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 5710, + "tokens_per_second_per_gpu": 412.63 + }, + { + "epoch": 0.568673261420689, + "grad_norm": 0.5475621819496155, + "learning_rate": 0.0001635700368208259, + "loss": 0.3132, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 5720, + "tokens_per_second_per_gpu": 367.47 + }, + { + "epoch": 0.5696674454441517, + "grad_norm": 0.3860287368297577, + "learning_rate": 0.00016344880614845608, + "loss": 0.2623, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 5730, + "tokens_per_second_per_gpu": 381.97 + }, + { + "epoch": 0.5706616294676145, + "grad_norm": 0.3809770345687866, + "learning_rate": 0.00016332741918826654, + "loss": 0.2365, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 5740, + "tokens_per_second_per_gpu": 326.73 + }, + { + "epoch": 0.5716558134910772, + "grad_norm": 0.5006048083305359, + "learning_rate": 0.00016320587623925895, + "loss": 0.2661, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 5750, + "tokens_per_second_per_gpu": 334.44 + }, + { + "epoch": 0.5726499975145399, + "grad_norm": 0.2004530131816864, + "learning_rate": 0.00016308417760081936, + "loss": 0.1923, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 5760, + "tokens_per_second_per_gpu": 377.56 + }, + { + "epoch": 0.5736441815380027, + "grad_norm": 0.30563804507255554, + "learning_rate": 0.00016296232357271718, + "loss": 0.2089, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 5770, + "tokens_per_second_per_gpu": 351.29 + }, + { + "epoch": 0.5746383655614654, + "grad_norm": 0.3142051100730896, + "learning_rate": 0.00016284031445510465, + "loss": 0.1931, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 5780, + "tokens_per_second_per_gpu": 327.41 + }, + { + "epoch": 0.5756325495849282, + "grad_norm": 0.4678092300891876, + "learning_rate": 0.000162718150548516, + "loss": 0.3011, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 5790, + "tokens_per_second_per_gpu": 367.12 + }, + { + "epoch": 0.5766267336083909, + "grad_norm": 0.44859957695007324, + "learning_rate": 0.00016259583215386675, + "loss": 0.2855, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 5800, + "tokens_per_second_per_gpu": 377.51 + }, + { + "epoch": 0.5776209176318536, + "grad_norm": 0.3087036609649658, + "learning_rate": 0.00016247335957245303, + "loss": 0.2181, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.21, + "memory/max_allocated (GiB)": 18.21, + "step": 5810, + "tokens_per_second_per_gpu": 292.23 + }, + { + "epoch": 0.5786151016553164, + "grad_norm": 0.29946303367614746, + "learning_rate": 0.00016235073310595058, + "loss": 0.203, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 5820, + "tokens_per_second_per_gpu": 294.26 + }, + { + "epoch": 0.5796092856787791, + "grad_norm": 0.41978803277015686, + "learning_rate": 0.0001622279530564144, + "loss": 0.2186, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 5830, + "tokens_per_second_per_gpu": 332.7 + }, + { + "epoch": 0.5806034697022419, + "grad_norm": 0.5330750942230225, + "learning_rate": 0.00016210501972627764, + "loss": 0.2726, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 5840, + "tokens_per_second_per_gpu": 372.14 + }, + { + "epoch": 0.5815976537257046, + "grad_norm": 0.4443431794643402, + "learning_rate": 0.0001619819334183511, + "loss": 0.2121, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 5850, + "tokens_per_second_per_gpu": 329.51 + }, + { + "epoch": 0.5825918377491673, + "grad_norm": 0.4602350890636444, + "learning_rate": 0.00016185869443582237, + "loss": 0.2845, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.85, + "memory/max_allocated (GiB)": 18.85, + "step": 5860, + "tokens_per_second_per_gpu": 334.35 + }, + { + "epoch": 0.5835860217726301, + "grad_norm": 0.2751925587654114, + "learning_rate": 0.00016173530308225513, + "loss": 0.1812, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 5870, + "tokens_per_second_per_gpu": 397.13 + }, + { + "epoch": 0.5845802057960928, + "grad_norm": 0.5334510803222656, + "learning_rate": 0.00016161175966158834, + "loss": 0.2088, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.53, + "memory/max_allocated (GiB)": 20.53, + "step": 5880, + "tokens_per_second_per_gpu": 297.42 + }, + { + "epoch": 0.5855743898195556, + "grad_norm": 0.36908817291259766, + "learning_rate": 0.00016148806447813553, + "loss": 0.2197, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 5890, + "tokens_per_second_per_gpu": 294.32 + }, + { + "epoch": 0.5865685738430183, + "grad_norm": 0.24384719133377075, + "learning_rate": 0.00016136421783658416, + "loss": 0.2757, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.81, + "memory/max_allocated (GiB)": 18.81, + "step": 5900, + "tokens_per_second_per_gpu": 315.54 + }, + { + "epoch": 0.587562757866481, + "grad_norm": 0.41709259152412415, + "learning_rate": 0.0001612402200419946, + "loss": 0.2858, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 5910, + "tokens_per_second_per_gpu": 385.92 + }, + { + "epoch": 0.5885569418899438, + "grad_norm": 0.5187587738037109, + "learning_rate": 0.00016111607139979967, + "loss": 0.2347, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.64, + "memory/max_allocated (GiB)": 20.64, + "step": 5920, + "tokens_per_second_per_gpu": 377.8 + }, + { + "epoch": 0.5895511259134065, + "grad_norm": 0.29502683877944946, + "learning_rate": 0.00016099177221580373, + "loss": 0.2563, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.84, + "memory/max_allocated (GiB)": 18.84, + "step": 5930, + "tokens_per_second_per_gpu": 335.51 + }, + { + "epoch": 0.5905453099368693, + "grad_norm": 0.360385000705719, + "learning_rate": 0.00016086732279618188, + "loss": 0.1982, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 5940, + "tokens_per_second_per_gpu": 315.07 + }, + { + "epoch": 0.591539493960332, + "grad_norm": 0.35591524839401245, + "learning_rate": 0.0001607427234474794, + "loss": 0.2535, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 5950, + "tokens_per_second_per_gpu": 379.0 + }, + { + "epoch": 0.5925336779837947, + "grad_norm": 0.49772289395332336, + "learning_rate": 0.0001606179744766108, + "loss": 0.3105, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 5960, + "tokens_per_second_per_gpu": 403.62 + }, + { + "epoch": 0.5935278620072575, + "grad_norm": 0.3001823425292969, + "learning_rate": 0.00016049307619085915, + "loss": 0.2011, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.44, + "memory/max_allocated (GiB)": 20.44, + "step": 5970, + "tokens_per_second_per_gpu": 289.33 + }, + { + "epoch": 0.5945220460307203, + "grad_norm": 0.593662679195404, + "learning_rate": 0.00016036802889787536, + "loss": 0.2728, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 5980, + "tokens_per_second_per_gpu": 397.95 + }, + { + "epoch": 0.5955162300541831, + "grad_norm": 0.38041049242019653, + "learning_rate": 0.00016024283290567732, + "loss": 0.2016, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 5990, + "tokens_per_second_per_gpu": 310.64 + }, + { + "epoch": 0.5965104140776458, + "grad_norm": 0.28381025791168213, + "learning_rate": 0.0001601174885226492, + "loss": 0.2, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 6000, + "tokens_per_second_per_gpu": 368.07 + }, + { + "epoch": 0.5975045981011086, + "grad_norm": 0.4913434088230133, + "learning_rate": 0.0001599919960575407, + "loss": 0.2653, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 6010, + "tokens_per_second_per_gpu": 365.97 + }, + { + "epoch": 0.5984987821245713, + "grad_norm": 0.4141637086868286, + "learning_rate": 0.00015986635581946638, + "loss": 0.2687, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 6020, + "tokens_per_second_per_gpu": 363.69 + }, + { + "epoch": 0.599492966148034, + "grad_norm": 0.45988166332244873, + "learning_rate": 0.00015974056811790462, + "loss": 0.2625, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 6030, + "tokens_per_second_per_gpu": 402.4 + }, + { + "epoch": 0.6004871501714968, + "grad_norm": 0.5327023267745972, + "learning_rate": 0.0001596146332626971, + "loss": 0.2462, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.53, + "memory/max_allocated (GiB)": 20.53, + "step": 6040, + "tokens_per_second_per_gpu": 382.41 + }, + { + "epoch": 0.6014813341949595, + "grad_norm": 0.2608936131000519, + "learning_rate": 0.00015948855156404802, + "loss": 0.2171, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 6050, + "tokens_per_second_per_gpu": 369.97 + }, + { + "epoch": 0.6024755182184223, + "grad_norm": 0.5496955513954163, + "learning_rate": 0.00015936232333252327, + "loss": 0.3213, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 6060, + "tokens_per_second_per_gpu": 365.39 + }, + { + "epoch": 0.603469702241885, + "grad_norm": 0.40054014325141907, + "learning_rate": 0.00015923594887904964, + "loss": 0.1949, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 6070, + "tokens_per_second_per_gpu": 293.13 + }, + { + "epoch": 0.6044638862653477, + "grad_norm": 0.7320500612258911, + "learning_rate": 0.0001591094285149141, + "loss": 0.2662, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 6080, + "tokens_per_second_per_gpu": 336.97 + }, + { + "epoch": 0.6054580702888105, + "grad_norm": 0.484331876039505, + "learning_rate": 0.00015898276255176303, + "loss": 0.2487, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 6090, + "tokens_per_second_per_gpu": 359.19 + }, + { + "epoch": 0.6064522543122732, + "grad_norm": 0.4304184019565582, + "learning_rate": 0.00015885595130160155, + "loss": 0.2546, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 6100, + "tokens_per_second_per_gpu": 329.56 + }, + { + "epoch": 0.607446438335736, + "grad_norm": 0.49904513359069824, + "learning_rate": 0.00015872899507679252, + "loss": 0.2622, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 6110, + "tokens_per_second_per_gpu": 381.18 + }, + { + "epoch": 0.6084406223591987, + "grad_norm": 0.525607168674469, + "learning_rate": 0.00015860189419005595, + "loss": 0.2424, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.17, + "memory/max_allocated (GiB)": 18.17, + "step": 6120, + "tokens_per_second_per_gpu": 331.0 + }, + { + "epoch": 0.6094348063826615, + "grad_norm": 0.3618375360965729, + "learning_rate": 0.0001584746489544682, + "loss": 0.3099, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 6130, + "tokens_per_second_per_gpu": 435.27 + }, + { + "epoch": 0.6104289904061242, + "grad_norm": 0.41309383511543274, + "learning_rate": 0.00015834725968346116, + "loss": 0.2337, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 6140, + "tokens_per_second_per_gpu": 381.0 + }, + { + "epoch": 0.6114231744295869, + "grad_norm": 0.37885645031929016, + "learning_rate": 0.00015821972669082156, + "loss": 0.3318, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 6150, + "tokens_per_second_per_gpu": 365.41 + }, + { + "epoch": 0.6124173584530497, + "grad_norm": 0.4314253032207489, + "learning_rate": 0.0001580920502906901, + "loss": 0.2213, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 6160, + "tokens_per_second_per_gpu": 343.04 + }, + { + "epoch": 0.6134115424765124, + "grad_norm": 0.5435411930084229, + "learning_rate": 0.00015796423079756074, + "loss": 0.243, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.77, + "memory/max_allocated (GiB)": 20.77, + "step": 6170, + "tokens_per_second_per_gpu": 365.75 + }, + { + "epoch": 0.6144057264999752, + "grad_norm": 0.31716790795326233, + "learning_rate": 0.00015783626852627992, + "loss": 0.2484, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 6180, + "tokens_per_second_per_gpu": 435.76 + }, + { + "epoch": 0.6153999105234379, + "grad_norm": 0.3328079879283905, + "learning_rate": 0.0001577081637920457, + "loss": 0.223, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 6190, + "tokens_per_second_per_gpu": 310.86 + }, + { + "epoch": 0.6163940945469006, + "grad_norm": 0.3827805519104004, + "learning_rate": 0.00015757991691040722, + "loss": 0.2311, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 6200, + "tokens_per_second_per_gpu": 333.74 + }, + { + "epoch": 0.6173882785703634, + "grad_norm": 0.3756648004055023, + "learning_rate": 0.00015745152819726356, + "loss": 0.229, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 6210, + "tokens_per_second_per_gpu": 352.29 + }, + { + "epoch": 0.6183824625938261, + "grad_norm": 0.4345835745334625, + "learning_rate": 0.0001573229979688633, + "loss": 0.2694, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 6220, + "tokens_per_second_per_gpu": 411.14 + }, + { + "epoch": 0.6193766466172889, + "grad_norm": 0.23183055222034454, + "learning_rate": 0.00015719432654180357, + "loss": 0.1925, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 6230, + "tokens_per_second_per_gpu": 363.23 + }, + { + "epoch": 0.6203708306407516, + "grad_norm": 0.2867846190929413, + "learning_rate": 0.00015706551423302925, + "loss": 0.1506, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 6240, + "tokens_per_second_per_gpu": 275.12 + }, + { + "epoch": 0.6213650146642143, + "grad_norm": 0.42951786518096924, + "learning_rate": 0.00015693656135983233, + "loss": 0.2867, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 6250, + "tokens_per_second_per_gpu": 383.43 + }, + { + "epoch": 0.6223591986876771, + "grad_norm": 0.5975165963172913, + "learning_rate": 0.00015680746823985094, + "loss": 0.263, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 6260, + "tokens_per_second_per_gpu": 304.9 + }, + { + "epoch": 0.6233533827111398, + "grad_norm": 0.48659709095954895, + "learning_rate": 0.00015667823519106873, + "loss": 0.2079, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 6270, + "tokens_per_second_per_gpu": 340.54 + }, + { + "epoch": 0.6243475667346026, + "grad_norm": 0.461224764585495, + "learning_rate": 0.00015654886253181402, + "loss": 0.2537, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 6280, + "tokens_per_second_per_gpu": 321.0 + }, + { + "epoch": 0.6253417507580653, + "grad_norm": 0.36294886469841003, + "learning_rate": 0.00015641935058075904, + "loss": 0.2009, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 6290, + "tokens_per_second_per_gpu": 308.81 + }, + { + "epoch": 0.626335934781528, + "grad_norm": 0.4274291396141052, + "learning_rate": 0.0001562896996569191, + "loss": 0.2343, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 6300, + "tokens_per_second_per_gpu": 319.11 + }, + { + "epoch": 0.6273301188049908, + "grad_norm": 0.47336748242378235, + "learning_rate": 0.00015615991007965176, + "loss": 0.211, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 6310, + "tokens_per_second_per_gpu": 355.64 + }, + { + "epoch": 0.6283243028284535, + "grad_norm": 0.4946894347667694, + "learning_rate": 0.00015602998216865624, + "loss": 0.2492, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 6320, + "tokens_per_second_per_gpu": 360.15 + }, + { + "epoch": 0.6293184868519163, + "grad_norm": 0.38327473402023315, + "learning_rate": 0.00015589991624397244, + "loss": 0.2308, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.61, + "memory/max_allocated (GiB)": 20.61, + "step": 6330, + "tokens_per_second_per_gpu": 301.77 + }, + { + "epoch": 0.630312670875379, + "grad_norm": 0.46532171964645386, + "learning_rate": 0.00015576971262598024, + "loss": 0.2812, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 6340, + "tokens_per_second_per_gpu": 344.05 + }, + { + "epoch": 0.6313068548988418, + "grad_norm": 0.3749048709869385, + "learning_rate": 0.00015563937163539862, + "loss": 0.2415, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 6350, + "tokens_per_second_per_gpu": 348.41 + }, + { + "epoch": 0.6323010389223045, + "grad_norm": 0.2943509519100189, + "learning_rate": 0.000155508893593285, + "loss": 0.2228, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.77, + "memory/max_allocated (GiB)": 20.77, + "step": 6360, + "tokens_per_second_per_gpu": 288.63 + }, + { + "epoch": 0.6332952229457672, + "grad_norm": 0.5494127869606018, + "learning_rate": 0.00015537827882103442, + "loss": 0.2499, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 6370, + "tokens_per_second_per_gpu": 356.07 + }, + { + "epoch": 0.63428940696923, + "grad_norm": 0.48988381028175354, + "learning_rate": 0.0001552475276403786, + "loss": 0.2142, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 6380, + "tokens_per_second_per_gpu": 343.8 + }, + { + "epoch": 0.6352835909926927, + "grad_norm": 0.3422715365886688, + "learning_rate": 0.00015511664037338538, + "loss": 0.2364, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 6390, + "tokens_per_second_per_gpu": 409.18 + }, + { + "epoch": 0.6362777750161555, + "grad_norm": 0.6021102070808411, + "learning_rate": 0.00015498561734245776, + "loss": 0.2392, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 6400, + "tokens_per_second_per_gpu": 306.59 + }, + { + "epoch": 0.6372719590396182, + "grad_norm": 0.6073122620582581, + "learning_rate": 0.00015485445887033317, + "loss": 0.2798, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 6410, + "tokens_per_second_per_gpu": 342.93 + }, + { + "epoch": 0.6382661430630809, + "grad_norm": 0.3407362401485443, + "learning_rate": 0.0001547231652800826, + "loss": 0.2477, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 6420, + "tokens_per_second_per_gpu": 367.05 + }, + { + "epoch": 0.6392603270865437, + "grad_norm": 0.506592869758606, + "learning_rate": 0.00015459173689510994, + "loss": 0.2399, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 6430, + "tokens_per_second_per_gpu": 412.24 + }, + { + "epoch": 0.6402545111100064, + "grad_norm": 0.5419439077377319, + "learning_rate": 0.0001544601740391511, + "loss": 0.1948, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.53, + "memory/max_allocated (GiB)": 21.53, + "step": 6440, + "tokens_per_second_per_gpu": 329.23 + }, + { + "epoch": 0.6412486951334692, + "grad_norm": 0.48251059651374817, + "learning_rate": 0.00015432847703627316, + "loss": 0.2146, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 6450, + "tokens_per_second_per_gpu": 317.65 + }, + { + "epoch": 0.6422428791569319, + "grad_norm": 0.22626249492168427, + "learning_rate": 0.0001541966462108737, + "loss": 0.2593, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 6460, + "tokens_per_second_per_gpu": 331.91 + }, + { + "epoch": 0.6432370631803946, + "grad_norm": 0.5113493204116821, + "learning_rate": 0.0001540646818876799, + "loss": 0.2162, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 6470, + "tokens_per_second_per_gpu": 311.21 + }, + { + "epoch": 0.6442312472038574, + "grad_norm": 0.3097884953022003, + "learning_rate": 0.0001539325843917478, + "loss": 0.1879, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 6480, + "tokens_per_second_per_gpu": 320.21 + }, + { + "epoch": 0.6452254312273201, + "grad_norm": 0.32837173342704773, + "learning_rate": 0.0001538003540484614, + "loss": 0.217, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.68, + "memory/max_allocated (GiB)": 19.68, + "step": 6490, + "tokens_per_second_per_gpu": 351.68 + }, + { + "epoch": 0.646219615250783, + "grad_norm": 0.519063413143158, + "learning_rate": 0.00015366799118353202, + "loss": 0.2531, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 6500, + "tokens_per_second_per_gpu": 379.3 + }, + { + "epoch": 0.6472137992742457, + "grad_norm": 0.3581913113594055, + "learning_rate": 0.0001535354961229974, + "loss": 0.291, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.53, + "memory/max_allocated (GiB)": 21.53, + "step": 6510, + "tokens_per_second_per_gpu": 389.02 + }, + { + "epoch": 0.6482079832977085, + "grad_norm": 0.3630671799182892, + "learning_rate": 0.0001534028691932208, + "loss": 0.2409, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 6520, + "tokens_per_second_per_gpu": 313.47 + }, + { + "epoch": 0.6492021673211712, + "grad_norm": 0.3670892119407654, + "learning_rate": 0.00015327011072089044, + "loss": 0.2133, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 6530, + "tokens_per_second_per_gpu": 290.75 + }, + { + "epoch": 0.6501963513446339, + "grad_norm": 0.40198561549186707, + "learning_rate": 0.00015313722103301852, + "loss": 0.27, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.85, + "memory/max_allocated (GiB)": 18.85, + "step": 6540, + "tokens_per_second_per_gpu": 429.99 + }, + { + "epoch": 0.6511905353680967, + "grad_norm": 0.3494684398174286, + "learning_rate": 0.00015300420045694034, + "loss": 0.1676, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 6550, + "tokens_per_second_per_gpu": 368.89 + }, + { + "epoch": 0.6521847193915594, + "grad_norm": 0.42560404539108276, + "learning_rate": 0.00015287104932031374, + "loss": 0.2585, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 6560, + "tokens_per_second_per_gpu": 317.83 + }, + { + "epoch": 0.6531789034150222, + "grad_norm": 0.511513352394104, + "learning_rate": 0.00015273776795111813, + "loss": 0.2129, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 6570, + "tokens_per_second_per_gpu": 327.26 + }, + { + "epoch": 0.6541730874384849, + "grad_norm": 0.3022279441356659, + "learning_rate": 0.00015260435667765364, + "loss": 0.2674, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 6580, + "tokens_per_second_per_gpu": 332.65 + }, + { + "epoch": 0.6551672714619476, + "grad_norm": 0.3808051347732544, + "learning_rate": 0.00015247081582854053, + "loss": 0.2512, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 6590, + "tokens_per_second_per_gpu": 379.8 + }, + { + "epoch": 0.6561614554854104, + "grad_norm": 0.4839475154876709, + "learning_rate": 0.00015233714573271802, + "loss": 0.2376, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 6600, + "tokens_per_second_per_gpu": 329.54 + }, + { + "epoch": 0.6571556395088731, + "grad_norm": 0.7145663499832153, + "learning_rate": 0.0001522033467194439, + "loss": 0.2289, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 6610, + "tokens_per_second_per_gpu": 327.02 + }, + { + "epoch": 0.6581498235323359, + "grad_norm": 0.4483419358730316, + "learning_rate": 0.00015206941911829336, + "loss": 0.2619, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 6620, + "tokens_per_second_per_gpu": 344.26 + }, + { + "epoch": 0.6591440075557986, + "grad_norm": 0.7042835354804993, + "learning_rate": 0.00015193536325915842, + "loss": 0.3162, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 6630, + "tokens_per_second_per_gpu": 325.39 + }, + { + "epoch": 0.6601381915792613, + "grad_norm": 0.44085246324539185, + "learning_rate": 0.00015180117947224698, + "loss": 0.1955, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 6640, + "tokens_per_second_per_gpu": 356.35 + }, + { + "epoch": 0.6611323756027241, + "grad_norm": 0.32135269045829773, + "learning_rate": 0.00015166686808808208, + "loss": 0.2302, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 6650, + "tokens_per_second_per_gpu": 401.75 + }, + { + "epoch": 0.6621265596261868, + "grad_norm": 0.5171180367469788, + "learning_rate": 0.00015153242943750103, + "loss": 0.251, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 6660, + "tokens_per_second_per_gpu": 328.4 + }, + { + "epoch": 0.6631207436496496, + "grad_norm": 0.5205950140953064, + "learning_rate": 0.00015139786385165462, + "loss": 0.2186, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 6670, + "tokens_per_second_per_gpu": 325.18 + }, + { + "epoch": 0.6641149276731123, + "grad_norm": 0.31780245900154114, + "learning_rate": 0.0001512631716620064, + "loss": 0.1604, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 6680, + "tokens_per_second_per_gpu": 319.84 + }, + { + "epoch": 0.665109111696575, + "grad_norm": 0.29278233647346497, + "learning_rate": 0.00015112835320033163, + "loss": 0.266, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 6690, + "tokens_per_second_per_gpu": 408.19 + }, + { + "epoch": 0.6661032957200378, + "grad_norm": 0.47382065653800964, + "learning_rate": 0.00015099340879871668, + "loss": 0.1933, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 6700, + "tokens_per_second_per_gpu": 302.45 + }, + { + "epoch": 0.6670974797435005, + "grad_norm": 0.3947311043739319, + "learning_rate": 0.00015085833878955823, + "loss": 0.2225, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.37, + "memory/max_allocated (GiB)": 21.37, + "step": 6710, + "tokens_per_second_per_gpu": 418.2 + }, + { + "epoch": 0.6680916637669633, + "grad_norm": 0.5490260720252991, + "learning_rate": 0.00015072314350556213, + "loss": 0.2056, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 6720, + "tokens_per_second_per_gpu": 319.35 + }, + { + "epoch": 0.669085847790426, + "grad_norm": 0.412194162607193, + "learning_rate": 0.000150587823279743, + "loss": 0.2377, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 6730, + "tokens_per_second_per_gpu": 304.73 + }, + { + "epoch": 0.6700800318138888, + "grad_norm": 0.40393805503845215, + "learning_rate": 0.00015045237844542317, + "loss": 0.2622, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 6740, + "tokens_per_second_per_gpu": 335.77 + }, + { + "epoch": 0.6710742158373515, + "grad_norm": 0.5896100401878357, + "learning_rate": 0.00015031680933623188, + "loss": 0.3129, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 6750, + "tokens_per_second_per_gpu": 304.37 + }, + { + "epoch": 0.6720683998608142, + "grad_norm": 0.5198945999145508, + "learning_rate": 0.00015018111628610446, + "loss": 0.2704, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 6760, + "tokens_per_second_per_gpu": 380.02 + }, + { + "epoch": 0.673062583884277, + "grad_norm": 0.32067760825157166, + "learning_rate": 0.00015004529962928164, + "loss": 0.2495, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 6770, + "tokens_per_second_per_gpu": 369.15 + }, + { + "epoch": 0.6740567679077397, + "grad_norm": 0.49704423546791077, + "learning_rate": 0.0001499093597003085, + "loss": 0.2095, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.17, + "memory/max_allocated (GiB)": 18.17, + "step": 6780, + "tokens_per_second_per_gpu": 324.06 + }, + { + "epoch": 0.6750509519312025, + "grad_norm": 0.42155733704566956, + "learning_rate": 0.00014977329683403385, + "loss": 0.1743, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 6790, + "tokens_per_second_per_gpu": 309.09 + }, + { + "epoch": 0.6760451359546652, + "grad_norm": 0.5538848638534546, + "learning_rate": 0.00014963711136560924, + "loss": 0.3424, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 6800, + "tokens_per_second_per_gpu": 347.86 + }, + { + "epoch": 0.6770393199781279, + "grad_norm": 0.3429434299468994, + "learning_rate": 0.00014950080363048833, + "loss": 0.2047, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 6810, + "tokens_per_second_per_gpu": 341.24 + }, + { + "epoch": 0.6780335040015907, + "grad_norm": 0.39259403944015503, + "learning_rate": 0.0001493643739644258, + "loss": 0.187, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 6820, + "tokens_per_second_per_gpu": 367.18 + }, + { + "epoch": 0.6790276880250534, + "grad_norm": 0.37642526626586914, + "learning_rate": 0.00014922782270347686, + "loss": 0.236, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 6830, + "tokens_per_second_per_gpu": 324.51 + }, + { + "epoch": 0.6800218720485162, + "grad_norm": 0.5826324820518494, + "learning_rate": 0.00014909115018399603, + "loss": 0.2494, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.68, + "memory/max_allocated (GiB)": 19.68, + "step": 6840, + "tokens_per_second_per_gpu": 343.26 + }, + { + "epoch": 0.6810160560719789, + "grad_norm": 0.39206662774086, + "learning_rate": 0.00014895435674263662, + "loss": 0.2522, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 6850, + "tokens_per_second_per_gpu": 330.16 + }, + { + "epoch": 0.6820102400954416, + "grad_norm": 0.21635837852954865, + "learning_rate": 0.00014881744271634986, + "loss": 0.2534, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 6860, + "tokens_per_second_per_gpu": 319.9 + }, + { + "epoch": 0.6830044241189044, + "grad_norm": 0.25813058018684387, + "learning_rate": 0.00014868040844238386, + "loss": 0.2255, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.39, + "memory/max_allocated (GiB)": 21.39, + "step": 6870, + "tokens_per_second_per_gpu": 352.96 + }, + { + "epoch": 0.6839986081423671, + "grad_norm": 0.46098119020462036, + "learning_rate": 0.00014854325425828305, + "loss": 0.2135, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 6880, + "tokens_per_second_per_gpu": 278.19 + }, + { + "epoch": 0.6849927921658299, + "grad_norm": 0.45799604058265686, + "learning_rate": 0.00014840598050188715, + "loss": 0.2283, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 6890, + "tokens_per_second_per_gpu": 358.38 + }, + { + "epoch": 0.6859869761892926, + "grad_norm": 0.6016408205032349, + "learning_rate": 0.00014826858751133042, + "loss": 0.2261, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 6900, + "tokens_per_second_per_gpu": 368.61 + }, + { + "epoch": 0.6869811602127553, + "grad_norm": 0.488506555557251, + "learning_rate": 0.00014813107562504084, + "loss": 0.2799, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 6910, + "tokens_per_second_per_gpu": 367.4 + }, + { + "epoch": 0.6879753442362181, + "grad_norm": 0.6327788829803467, + "learning_rate": 0.00014799344518173928, + "loss": 0.1868, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 6920, + "tokens_per_second_per_gpu": 300.97 + }, + { + "epoch": 0.6889695282596808, + "grad_norm": 0.4955579340457916, + "learning_rate": 0.00014785569652043856, + "loss": 0.2496, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 6930, + "tokens_per_second_per_gpu": 343.13 + }, + { + "epoch": 0.6899637122831436, + "grad_norm": 0.5724585652351379, + "learning_rate": 0.0001477178299804428, + "loss": 0.2611, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.82, + "memory/max_allocated (GiB)": 18.82, + "step": 6940, + "tokens_per_second_per_gpu": 326.77 + }, + { + "epoch": 0.6909578963066063, + "grad_norm": 0.2057613730430603, + "learning_rate": 0.00014757984590134642, + "loss": 0.1107, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 6950, + "tokens_per_second_per_gpu": 293.99 + }, + { + "epoch": 0.691952080330069, + "grad_norm": 0.3206622004508972, + "learning_rate": 0.00014744174462303334, + "loss": 0.2379, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 6960, + "tokens_per_second_per_gpu": 371.29 + }, + { + "epoch": 0.6929462643535318, + "grad_norm": 0.3926986753940582, + "learning_rate": 0.00014730352648567623, + "loss": 0.2558, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 6970, + "tokens_per_second_per_gpu": 377.71 + }, + { + "epoch": 0.6939404483769945, + "grad_norm": 0.34591636061668396, + "learning_rate": 0.00014716519182973552, + "loss": 0.2601, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 6980, + "tokens_per_second_per_gpu": 352.12 + }, + { + "epoch": 0.6949346324004573, + "grad_norm": 0.5908513069152832, + "learning_rate": 0.00014702674099595876, + "loss": 0.2027, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 6990, + "tokens_per_second_per_gpu": 345.62 + }, + { + "epoch": 0.69592881642392, + "grad_norm": 0.3830493986606598, + "learning_rate": 0.00014688817432537962, + "loss": 0.1987, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 7000, + "tokens_per_second_per_gpu": 359.97 + }, + { + "epoch": 0.6969230004473828, + "grad_norm": 0.444762647151947, + "learning_rate": 0.00014674949215931707, + "loss": 0.2059, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 7010, + "tokens_per_second_per_gpu": 289.86 + }, + { + "epoch": 0.6979171844708456, + "grad_norm": 0.31576088070869446, + "learning_rate": 0.00014661069483937458, + "loss": 0.2115, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 7020, + "tokens_per_second_per_gpu": 339.22 + }, + { + "epoch": 0.6989113684943083, + "grad_norm": 0.4755282700061798, + "learning_rate": 0.00014647178270743932, + "loss": 0.265, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 7030, + "tokens_per_second_per_gpu": 329.94 + }, + { + "epoch": 0.6999055525177711, + "grad_norm": 0.4698229134082794, + "learning_rate": 0.00014633275610568123, + "loss": 0.2492, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 7040, + "tokens_per_second_per_gpu": 315.54 + }, + { + "epoch": 0.7008997365412338, + "grad_norm": 0.3248315453529358, + "learning_rate": 0.00014619361537655215, + "loss": 0.2412, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.4, + "memory/max_allocated (GiB)": 21.4, + "step": 7050, + "tokens_per_second_per_gpu": 294.7 + }, + { + "epoch": 0.7018939205646966, + "grad_norm": 0.48639553785324097, + "learning_rate": 0.0001460543608627852, + "loss": 0.2356, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 7060, + "tokens_per_second_per_gpu": 280.17 + }, + { + "epoch": 0.7028881045881593, + "grad_norm": 0.5937051773071289, + "learning_rate": 0.00014591499290739362, + "loss": 0.1679, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 7070, + "tokens_per_second_per_gpu": 336.7 + }, + { + "epoch": 0.703882288611622, + "grad_norm": 0.3488394021987915, + "learning_rate": 0.00014577551185367013, + "loss": 0.2474, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 7080, + "tokens_per_second_per_gpu": 400.34 + }, + { + "epoch": 0.7048764726350848, + "grad_norm": 0.4485851526260376, + "learning_rate": 0.0001456359180451861, + "loss": 0.2709, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 7090, + "tokens_per_second_per_gpu": 392.29 + }, + { + "epoch": 0.7058706566585475, + "grad_norm": 0.4746951758861542, + "learning_rate": 0.00014549621182579055, + "loss": 0.2353, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 7100, + "tokens_per_second_per_gpu": 330.73 + }, + { + "epoch": 0.7068648406820103, + "grad_norm": 0.5027205944061279, + "learning_rate": 0.00014535639353960942, + "loss": 0.2576, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 7110, + "tokens_per_second_per_gpu": 379.65 + }, + { + "epoch": 0.707859024705473, + "grad_norm": 0.449788361787796, + "learning_rate": 0.00014521646353104472, + "loss": 0.2186, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 7120, + "tokens_per_second_per_gpu": 269.74 + }, + { + "epoch": 0.7088532087289358, + "grad_norm": 0.31661751866340637, + "learning_rate": 0.00014507642214477362, + "loss": 0.2481, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 7130, + "tokens_per_second_per_gpu": 302.22 + }, + { + "epoch": 0.7098473927523985, + "grad_norm": 0.3295125663280487, + "learning_rate": 0.00014493626972574765, + "loss": 0.2284, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 7140, + "tokens_per_second_per_gpu": 350.96 + }, + { + "epoch": 0.7108415767758612, + "grad_norm": 0.5383651256561279, + "learning_rate": 0.0001447960066191919, + "loss": 0.2427, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 7150, + "tokens_per_second_per_gpu": 345.67 + }, + { + "epoch": 0.711835760799324, + "grad_norm": 0.3970474898815155, + "learning_rate": 0.00014465563317060394, + "loss": 0.2434, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 7160, + "tokens_per_second_per_gpu": 374.3 + }, + { + "epoch": 0.7128299448227867, + "grad_norm": 0.16766348481178284, + "learning_rate": 0.00014451514972575332, + "loss": 0.1649, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 7170, + "tokens_per_second_per_gpu": 304.14 + }, + { + "epoch": 0.7138241288462495, + "grad_norm": 0.4426742196083069, + "learning_rate": 0.00014437455663068042, + "loss": 0.2633, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 7180, + "tokens_per_second_per_gpu": 374.18 + }, + { + "epoch": 0.7148183128697122, + "grad_norm": 0.4757481515407562, + "learning_rate": 0.00014423385423169575, + "loss": 0.2584, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 7190, + "tokens_per_second_per_gpu": 302.88 + }, + { + "epoch": 0.7158124968931749, + "grad_norm": 0.4964188039302826, + "learning_rate": 0.00014409304287537906, + "loss": 0.2386, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 7200, + "tokens_per_second_per_gpu": 325.69 + }, + { + "epoch": 0.7168066809166377, + "grad_norm": 0.5026222467422485, + "learning_rate": 0.0001439521229085785, + "loss": 0.2161, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 7210, + "tokens_per_second_per_gpu": 332.92 + }, + { + "epoch": 0.7178008649401004, + "grad_norm": 0.41850724816322327, + "learning_rate": 0.00014381109467840976, + "loss": 0.2157, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 7220, + "tokens_per_second_per_gpu": 328.35 + }, + { + "epoch": 0.7187950489635632, + "grad_norm": 0.3922070264816284, + "learning_rate": 0.00014366995853225514, + "loss": 0.2112, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 7230, + "tokens_per_second_per_gpu": 263.36 + }, + { + "epoch": 0.7197892329870259, + "grad_norm": 0.5679214000701904, + "learning_rate": 0.0001435287148177628, + "loss": 0.2715, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7240, + "tokens_per_second_per_gpu": 434.11 + }, + { + "epoch": 0.7207834170104886, + "grad_norm": 0.5302831530570984, + "learning_rate": 0.0001433873638828458, + "loss": 0.252, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 7250, + "tokens_per_second_per_gpu": 358.75 + }, + { + "epoch": 0.7217776010339514, + "grad_norm": 0.49475687742233276, + "learning_rate": 0.00014324590607568149, + "loss": 0.2613, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 7260, + "tokens_per_second_per_gpu": 377.3 + }, + { + "epoch": 0.7227717850574141, + "grad_norm": 0.4263441264629364, + "learning_rate": 0.00014310434174471024, + "loss": 0.288, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7270, + "tokens_per_second_per_gpu": 378.27 + }, + { + "epoch": 0.7237659690808769, + "grad_norm": 0.4663153886795044, + "learning_rate": 0.000142962671238635, + "loss": 0.2572, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 7280, + "tokens_per_second_per_gpu": 390.5 + }, + { + "epoch": 0.7247601531043396, + "grad_norm": 0.3563691973686218, + "learning_rate": 0.0001428208949064201, + "loss": 0.2024, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 7290, + "tokens_per_second_per_gpu": 330.99 + }, + { + "epoch": 0.7257543371278024, + "grad_norm": 0.2805791199207306, + "learning_rate": 0.00014267901309729066, + "loss": 0.2371, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 7300, + "tokens_per_second_per_gpu": 410.4 + }, + { + "epoch": 0.7267485211512651, + "grad_norm": 0.30967897176742554, + "learning_rate": 0.00014253702616073155, + "loss": 0.231, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 7310, + "tokens_per_second_per_gpu": 302.94 + }, + { + "epoch": 0.7277427051747278, + "grad_norm": 0.353834867477417, + "learning_rate": 0.00014239493444648658, + "loss": 0.1885, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 7320, + "tokens_per_second_per_gpu": 296.75 + }, + { + "epoch": 0.7287368891981906, + "grad_norm": 0.32480210065841675, + "learning_rate": 0.00014225273830455773, + "loss": 0.2713, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7330, + "tokens_per_second_per_gpu": 389.69 + }, + { + "epoch": 0.7297310732216533, + "grad_norm": 0.6818671226501465, + "learning_rate": 0.00014211043808520405, + "loss": 0.3248, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 7340, + "tokens_per_second_per_gpu": 355.32 + }, + { + "epoch": 0.730725257245116, + "grad_norm": 0.5786187648773193, + "learning_rate": 0.0001419680341389412, + "loss": 0.2262, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7350, + "tokens_per_second_per_gpu": 321.29 + }, + { + "epoch": 0.7317194412685788, + "grad_norm": 0.5133084058761597, + "learning_rate": 0.0001418255268165401, + "loss": 0.2653, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 7360, + "tokens_per_second_per_gpu": 348.43 + }, + { + "epoch": 0.7327136252920415, + "grad_norm": 0.4247760474681854, + "learning_rate": 0.0001416829164690264, + "loss": 0.2312, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 7370, + "tokens_per_second_per_gpu": 360.14 + }, + { + "epoch": 0.7337078093155043, + "grad_norm": 0.32232165336608887, + "learning_rate": 0.00014154020344767955, + "loss": 0.2825, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 7380, + "tokens_per_second_per_gpu": 379.26 + }, + { + "epoch": 0.734701993338967, + "grad_norm": 0.4452918767929077, + "learning_rate": 0.0001413973881040319, + "loss": 0.2205, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7390, + "tokens_per_second_per_gpu": 359.25 + }, + { + "epoch": 0.7356961773624298, + "grad_norm": 0.3855791985988617, + "learning_rate": 0.0001412544707898678, + "loss": 0.2868, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 7400, + "tokens_per_second_per_gpu": 410.81 + }, + { + "epoch": 0.7366903613858925, + "grad_norm": 0.42609113454818726, + "learning_rate": 0.00014111145185722283, + "loss": 0.2523, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 7410, + "tokens_per_second_per_gpu": 331.0 + }, + { + "epoch": 0.7376845454093552, + "grad_norm": 0.47836732864379883, + "learning_rate": 0.00014096833165838283, + "loss": 0.2962, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.37, + "memory/max_allocated (GiB)": 21.37, + "step": 7420, + "tokens_per_second_per_gpu": 344.4 + }, + { + "epoch": 0.738678729432818, + "grad_norm": 0.508818507194519, + "learning_rate": 0.0001408251105458831, + "loss": 0.3254, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7430, + "tokens_per_second_per_gpu": 389.2 + }, + { + "epoch": 0.7396729134562807, + "grad_norm": 0.3887844681739807, + "learning_rate": 0.00014068178887250752, + "loss": 0.2353, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 7440, + "tokens_per_second_per_gpu": 380.02 + }, + { + "epoch": 0.7406670974797435, + "grad_norm": 0.41547468304634094, + "learning_rate": 0.00014053836699128765, + "loss": 0.2424, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7450, + "tokens_per_second_per_gpu": 376.26 + }, + { + "epoch": 0.7416612815032062, + "grad_norm": 0.5015019178390503, + "learning_rate": 0.00014039484525550186, + "loss": 0.2329, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 7460, + "tokens_per_second_per_gpu": 351.38 + }, + { + "epoch": 0.742655465526669, + "grad_norm": 0.43546929955482483, + "learning_rate": 0.0001402512240186746, + "loss": 0.2186, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 7470, + "tokens_per_second_per_gpu": 303.33 + }, + { + "epoch": 0.7436496495501317, + "grad_norm": 0.5051418542861938, + "learning_rate": 0.0001401075036345753, + "loss": 0.2439, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.19, + "memory/max_allocated (GiB)": 19.19, + "step": 7480, + "tokens_per_second_per_gpu": 375.31 + }, + { + "epoch": 0.7446438335735944, + "grad_norm": 0.35766085982322693, + "learning_rate": 0.0001399636844572176, + "loss": 0.277, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 7490, + "tokens_per_second_per_gpu": 343.63 + }, + { + "epoch": 0.7456380175970572, + "grad_norm": 0.5930467247962952, + "learning_rate": 0.0001398197668408586, + "loss": 0.2474, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 7500, + "tokens_per_second_per_gpu": 396.18 + }, + { + "epoch": 0.7466322016205199, + "grad_norm": 0.4920576810836792, + "learning_rate": 0.00013967575113999777, + "loss": 0.2408, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 7510, + "tokens_per_second_per_gpu": 382.95 + }, + { + "epoch": 0.7476263856439826, + "grad_norm": 0.44312262535095215, + "learning_rate": 0.0001395316377093762, + "loss": 0.2249, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 7520, + "tokens_per_second_per_gpu": 387.85 + }, + { + "epoch": 0.7486205696674454, + "grad_norm": 0.4043440818786621, + "learning_rate": 0.00013938742690397575, + "loss": 0.2141, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 7530, + "tokens_per_second_per_gpu": 360.87 + }, + { + "epoch": 0.7496147536909082, + "grad_norm": 0.3910767138004303, + "learning_rate": 0.00013924311907901813, + "loss": 0.1528, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 7540, + "tokens_per_second_per_gpu": 308.96 + }, + { + "epoch": 0.750608937714371, + "grad_norm": 0.3407839238643646, + "learning_rate": 0.00013909871458996399, + "loss": 0.2192, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 7550, + "tokens_per_second_per_gpu": 316.74 + }, + { + "epoch": 0.7516031217378337, + "grad_norm": 0.316240519285202, + "learning_rate": 0.00013895421379251207, + "loss": 0.2317, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 7560, + "tokens_per_second_per_gpu": 332.89 + }, + { + "epoch": 0.7525973057612965, + "grad_norm": 0.49255135655403137, + "learning_rate": 0.00013880961704259846, + "loss": 0.2413, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7570, + "tokens_per_second_per_gpu": 352.94 + }, + { + "epoch": 0.7535914897847592, + "grad_norm": 0.4979618489742279, + "learning_rate": 0.0001386649246963955, + "loss": 0.2434, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 7580, + "tokens_per_second_per_gpu": 307.59 + }, + { + "epoch": 0.754585673808222, + "grad_norm": 0.2949107885360718, + "learning_rate": 0.00013852013711031095, + "loss": 0.2112, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 7590, + "tokens_per_second_per_gpu": 363.41 + }, + { + "epoch": 0.7555798578316847, + "grad_norm": 0.3708727955818176, + "learning_rate": 0.0001383752546409873, + "loss": 0.2232, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 7600, + "tokens_per_second_per_gpu": 386.01 + }, + { + "epoch": 0.7565740418551474, + "grad_norm": 0.6432907581329346, + "learning_rate": 0.00013823027764530067, + "loss": 0.2707, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 7610, + "tokens_per_second_per_gpu": 313.62 + }, + { + "epoch": 0.7575682258786102, + "grad_norm": 0.3710288405418396, + "learning_rate": 0.00013808520648036005, + "loss": 0.2353, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 7620, + "tokens_per_second_per_gpu": 432.88 + }, + { + "epoch": 0.7585624099020729, + "grad_norm": 0.3577297031879425, + "learning_rate": 0.00013794004150350636, + "loss": 0.212, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 7630, + "tokens_per_second_per_gpu": 323.27 + }, + { + "epoch": 0.7595565939255357, + "grad_norm": 0.4883553385734558, + "learning_rate": 0.00013779478307231164, + "loss": 0.2747, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 7640, + "tokens_per_second_per_gpu": 292.24 + }, + { + "epoch": 0.7605507779489984, + "grad_norm": 0.19372917711734772, + "learning_rate": 0.00013764943154457812, + "loss": 0.233, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 7650, + "tokens_per_second_per_gpu": 375.4 + }, + { + "epoch": 0.7615449619724611, + "grad_norm": 0.46450668573379517, + "learning_rate": 0.00013750398727833735, + "loss": 0.219, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 7660, + "tokens_per_second_per_gpu": 351.49 + }, + { + "epoch": 0.7625391459959239, + "grad_norm": 0.3964915871620178, + "learning_rate": 0.00013735845063184921, + "loss": 0.2376, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7670, + "tokens_per_second_per_gpu": 309.02 + }, + { + "epoch": 0.7635333300193866, + "grad_norm": 0.6207079887390137, + "learning_rate": 0.00013721282196360127, + "loss": 0.2547, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 7680, + "tokens_per_second_per_gpu": 338.05 + }, + { + "epoch": 0.7645275140428494, + "grad_norm": 0.2084685117006302, + "learning_rate": 0.00013706710163230773, + "loss": 0.2504, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 7690, + "tokens_per_second_per_gpu": 294.32 + }, + { + "epoch": 0.7655216980663121, + "grad_norm": 0.4136933386325836, + "learning_rate": 0.0001369212899969086, + "loss": 0.1809, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7700, + "tokens_per_second_per_gpu": 314.54 + }, + { + "epoch": 0.7665158820897748, + "grad_norm": 0.529629111289978, + "learning_rate": 0.0001367753874165687, + "loss": 0.255, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7710, + "tokens_per_second_per_gpu": 357.08 + }, + { + "epoch": 0.7675100661132376, + "grad_norm": 0.36684682965278625, + "learning_rate": 0.0001366293942506769, + "loss": 0.2128, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 7720, + "tokens_per_second_per_gpu": 318.04 + }, + { + "epoch": 0.7685042501367003, + "grad_norm": 0.40612316131591797, + "learning_rate": 0.00013648331085884527, + "loss": 0.2159, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 7730, + "tokens_per_second_per_gpu": 364.09 + }, + { + "epoch": 0.7694984341601631, + "grad_norm": 0.13119497895240784, + "learning_rate": 0.0001363371376009081, + "loss": 0.2255, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 7740, + "tokens_per_second_per_gpu": 336.24 + }, + { + "epoch": 0.7704926181836258, + "grad_norm": 0.5006715655326843, + "learning_rate": 0.00013619087483692099, + "loss": 0.2595, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7750, + "tokens_per_second_per_gpu": 338.82 + }, + { + "epoch": 0.7714868022070885, + "grad_norm": 0.3994678258895874, + "learning_rate": 0.00013604452292716003, + "loss": 0.203, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 7760, + "tokens_per_second_per_gpu": 327.9 + }, + { + "epoch": 0.7724809862305513, + "grad_norm": 0.17447052896022797, + "learning_rate": 0.00013589808223212087, + "loss": 0.2537, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 7770, + "tokens_per_second_per_gpu": 376.09 + }, + { + "epoch": 0.773475170254014, + "grad_norm": 0.5262983441352844, + "learning_rate": 0.000135751553112518, + "loss": 0.2112, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 7780, + "tokens_per_second_per_gpu": 338.13 + }, + { + "epoch": 0.7744693542774768, + "grad_norm": 0.32633262872695923, + "learning_rate": 0.00013560493592928356, + "loss": 0.2235, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 7790, + "tokens_per_second_per_gpu": 331.86 + }, + { + "epoch": 0.7754635383009395, + "grad_norm": 0.4296337068080902, + "learning_rate": 0.00013545823104356663, + "loss": 0.297, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7800, + "tokens_per_second_per_gpu": 370.8 + }, + { + "epoch": 0.7764577223244022, + "grad_norm": 0.5057851672172546, + "learning_rate": 0.00013531143881673237, + "loss": 0.1952, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 7810, + "tokens_per_second_per_gpu": 321.52 + }, + { + "epoch": 0.777451906347865, + "grad_norm": 0.49617013335227966, + "learning_rate": 0.00013516455961036104, + "loss": 0.2589, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 7820, + "tokens_per_second_per_gpu": 299.57 + }, + { + "epoch": 0.7784460903713277, + "grad_norm": 0.3173094689846039, + "learning_rate": 0.00013501759378624722, + "loss": 0.2328, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 7830, + "tokens_per_second_per_gpu": 349.07 + }, + { + "epoch": 0.7794402743947905, + "grad_norm": 0.4631012976169586, + "learning_rate": 0.00013487054170639877, + "loss": 0.2472, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 7840, + "tokens_per_second_per_gpu": 361.13 + }, + { + "epoch": 0.7804344584182532, + "grad_norm": 0.3672430217266083, + "learning_rate": 0.000134723403733036, + "loss": 0.2183, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 7850, + "tokens_per_second_per_gpu": 324.51 + }, + { + "epoch": 0.781428642441716, + "grad_norm": 0.5141401886940002, + "learning_rate": 0.00013457618022859092, + "loss": 0.3104, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.47, + "memory/max_allocated (GiB)": 21.47, + "step": 7860, + "tokens_per_second_per_gpu": 384.61 + }, + { + "epoch": 0.7824228264651787, + "grad_norm": 0.43661215901374817, + "learning_rate": 0.00013442887155570607, + "loss": 0.2228, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.53, + "memory/max_allocated (GiB)": 20.53, + "step": 7870, + "tokens_per_second_per_gpu": 359.28 + }, + { + "epoch": 0.7834170104886414, + "grad_norm": 0.375987708568573, + "learning_rate": 0.00013428147807723387, + "loss": 0.2215, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 7880, + "tokens_per_second_per_gpu": 327.47 + }, + { + "epoch": 0.7844111945121042, + "grad_norm": 0.2800423204898834, + "learning_rate": 0.00013413400015623562, + "loss": 0.263, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 7890, + "tokens_per_second_per_gpu": 367.65 + }, + { + "epoch": 0.7854053785355669, + "grad_norm": 0.44610151648521423, + "learning_rate": 0.00013398643815598063, + "loss": 0.2533, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 7900, + "tokens_per_second_per_gpu": 364.29 + }, + { + "epoch": 0.7863995625590297, + "grad_norm": 0.6232859492301941, + "learning_rate": 0.0001338387924399452, + "loss": 0.2273, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 7910, + "tokens_per_second_per_gpu": 335.13 + }, + { + "epoch": 0.7873937465824924, + "grad_norm": 0.3155955374240875, + "learning_rate": 0.00013369106337181202, + "loss": 0.2007, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 7920, + "tokens_per_second_per_gpu": 357.92 + }, + { + "epoch": 0.7883879306059551, + "grad_norm": 0.47753843665122986, + "learning_rate": 0.00013354325131546902, + "loss": 0.1722, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 7930, + "tokens_per_second_per_gpu": 350.28 + }, + { + "epoch": 0.7893821146294179, + "grad_norm": 0.6098092198371887, + "learning_rate": 0.0001333953566350085, + "loss": 0.2172, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 7940, + "tokens_per_second_per_gpu": 328.34 + }, + { + "epoch": 0.7903762986528806, + "grad_norm": 0.40892454981803894, + "learning_rate": 0.00013324737969472628, + "loss": 0.2365, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 7950, + "tokens_per_second_per_gpu": 376.03 + }, + { + "epoch": 0.7913704826763434, + "grad_norm": 0.6622501015663147, + "learning_rate": 0.00013309932085912092, + "loss": 0.265, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 7960, + "tokens_per_second_per_gpu": 401.88 + }, + { + "epoch": 0.7923646666998061, + "grad_norm": 0.5111701488494873, + "learning_rate": 0.00013295118049289255, + "loss": 0.2164, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 7970, + "tokens_per_second_per_gpu": 359.69 + }, + { + "epoch": 0.7933588507232688, + "grad_norm": 0.5144445300102234, + "learning_rate": 0.00013280295896094224, + "loss": 0.2567, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 7980, + "tokens_per_second_per_gpu": 301.89 + }, + { + "epoch": 0.7943530347467316, + "grad_norm": 0.5570478439331055, + "learning_rate": 0.00013265465662837093, + "loss": 0.1934, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 7990, + "tokens_per_second_per_gpu": 336.55 + }, + { + "epoch": 0.7953472187701943, + "grad_norm": 0.33518052101135254, + "learning_rate": 0.00013250627386047866, + "loss": 0.2247, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 8000, + "tokens_per_second_per_gpu": 335.15 + }, + { + "epoch": 0.7963414027936571, + "grad_norm": 0.508229672908783, + "learning_rate": 0.0001323578110227635, + "loss": 0.1587, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.8, + "memory/max_allocated (GiB)": 19.8, + "step": 8010, + "tokens_per_second_per_gpu": 351.3 + }, + { + "epoch": 0.7973355868171198, + "grad_norm": 0.3688840866088867, + "learning_rate": 0.0001322092684809208, + "loss": 0.1929, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 8020, + "tokens_per_second_per_gpu": 289.68 + }, + { + "epoch": 0.7983297708405825, + "grad_norm": 0.31160444021224976, + "learning_rate": 0.00013206064660084227, + "loss": 0.2318, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 8030, + "tokens_per_second_per_gpu": 351.62 + }, + { + "epoch": 0.7993239548640453, + "grad_norm": 0.46203359961509705, + "learning_rate": 0.000131911945748615, + "loss": 0.2808, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 8040, + "tokens_per_second_per_gpu": 360.06 + }, + { + "epoch": 0.800318138887508, + "grad_norm": 0.4416126608848572, + "learning_rate": 0.00013176316629052054, + "loss": 0.2065, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 8050, + "tokens_per_second_per_gpu": 312.23 + }, + { + "epoch": 0.8013123229109709, + "grad_norm": 0.16777446866035461, + "learning_rate": 0.00013161430859303427, + "loss": 0.1713, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 8060, + "tokens_per_second_per_gpu": 347.86 + }, + { + "epoch": 0.8023065069344336, + "grad_norm": 0.3447447121143341, + "learning_rate": 0.0001314653730228241, + "loss": 0.1954, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 8070, + "tokens_per_second_per_gpu": 372.63 + }, + { + "epoch": 0.8033006909578964, + "grad_norm": 0.4045270085334778, + "learning_rate": 0.0001313163599467498, + "loss": 0.1929, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 8080, + "tokens_per_second_per_gpu": 355.89 + }, + { + "epoch": 0.8042948749813591, + "grad_norm": 0.462365984916687, + "learning_rate": 0.00013116726973186208, + "loss": 0.2551, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.45, + "memory/max_allocated (GiB)": 20.45, + "step": 8090, + "tokens_per_second_per_gpu": 361.83 + }, + { + "epoch": 0.8052890590048218, + "grad_norm": 0.5786636471748352, + "learning_rate": 0.00013101810274540168, + "loss": 0.2499, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 8100, + "tokens_per_second_per_gpu": 300.27 + }, + { + "epoch": 0.8062832430282846, + "grad_norm": 0.3487481474876404, + "learning_rate": 0.0001308688593547984, + "loss": 0.2186, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 17.11, + "memory/max_allocated (GiB)": 17.11, + "step": 8110, + "tokens_per_second_per_gpu": 312.52 + }, + { + "epoch": 0.8072774270517473, + "grad_norm": 0.3610248863697052, + "learning_rate": 0.00013071953992767015, + "loss": 0.2167, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 8120, + "tokens_per_second_per_gpu": 328.39 + }, + { + "epoch": 0.8082716110752101, + "grad_norm": 0.37153443694114685, + "learning_rate": 0.00013057014483182242, + "loss": 0.241, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 8130, + "tokens_per_second_per_gpu": 344.6 + }, + { + "epoch": 0.8092657950986728, + "grad_norm": 0.3705120086669922, + "learning_rate": 0.00013042067443524681, + "loss": 0.2749, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 8140, + "tokens_per_second_per_gpu": 332.87 + }, + { + "epoch": 0.8102599791221355, + "grad_norm": 0.3014324903488159, + "learning_rate": 0.00013027112910612052, + "loss": 0.1438, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 8150, + "tokens_per_second_per_gpu": 308.05 + }, + { + "epoch": 0.8112541631455983, + "grad_norm": 0.5011573433876038, + "learning_rate": 0.00013012150921280527, + "loss": 0.2032, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 8160, + "tokens_per_second_per_gpu": 296.89 + }, + { + "epoch": 0.812248347169061, + "grad_norm": 0.3287215530872345, + "learning_rate": 0.00012997181512384653, + "loss": 0.2055, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 8170, + "tokens_per_second_per_gpu": 405.75 + }, + { + "epoch": 0.8132425311925238, + "grad_norm": 0.7308348417282104, + "learning_rate": 0.00012982204720797245, + "loss": 0.2805, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 8180, + "tokens_per_second_per_gpu": 443.15 + }, + { + "epoch": 0.8142367152159865, + "grad_norm": 0.5808447599411011, + "learning_rate": 0.00012967220583409304, + "loss": 0.2066, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 8190, + "tokens_per_second_per_gpu": 401.43 + }, + { + "epoch": 0.8152308992394492, + "grad_norm": 0.31127744913101196, + "learning_rate": 0.0001295222913712993, + "loss": 0.2464, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 8200, + "tokens_per_second_per_gpu": 343.59 + }, + { + "epoch": 0.816225083262912, + "grad_norm": 0.2803351581096649, + "learning_rate": 0.00012937230418886224, + "loss": 0.1986, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 8210, + "tokens_per_second_per_gpu": 401.58 + }, + { + "epoch": 0.8172192672863747, + "grad_norm": 0.45312055945396423, + "learning_rate": 0.000129222244656232, + "loss": 0.1904, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.37, + "memory/max_allocated (GiB)": 21.37, + "step": 8220, + "tokens_per_second_per_gpu": 290.19 + }, + { + "epoch": 0.8182134513098375, + "grad_norm": 0.4169121980667114, + "learning_rate": 0.0001290721131430369, + "loss": 0.2074, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 8230, + "tokens_per_second_per_gpu": 320.72 + }, + { + "epoch": 0.8192076353333002, + "grad_norm": 0.5725305080413818, + "learning_rate": 0.0001289219100190826, + "loss": 0.2809, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 8240, + "tokens_per_second_per_gpu": 334.56 + }, + { + "epoch": 0.820201819356763, + "grad_norm": 0.4698241055011749, + "learning_rate": 0.00012877163565435114, + "loss": 0.1873, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.66, + "memory/max_allocated (GiB)": 19.66, + "step": 8250, + "tokens_per_second_per_gpu": 319.13 + }, + { + "epoch": 0.8211960033802257, + "grad_norm": 0.33453720808029175, + "learning_rate": 0.000128621290419, + "loss": 0.2172, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 8260, + "tokens_per_second_per_gpu": 306.65 + }, + { + "epoch": 0.8221901874036884, + "grad_norm": 0.20304298400878906, + "learning_rate": 0.00012847087468336135, + "loss": 0.2102, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 8270, + "tokens_per_second_per_gpu": 311.33 + }, + { + "epoch": 0.8231843714271512, + "grad_norm": 0.27367445826530457, + "learning_rate": 0.00012832038881794086, + "loss": 0.2437, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 8280, + "tokens_per_second_per_gpu": 417.4 + }, + { + "epoch": 0.8241785554506139, + "grad_norm": 0.3914351463317871, + "learning_rate": 0.00012816983319341712, + "loss": 0.2692, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 8290, + "tokens_per_second_per_gpu": 342.88 + }, + { + "epoch": 0.8251727394740767, + "grad_norm": 0.5103800296783447, + "learning_rate": 0.00012801920818064034, + "loss": 0.2341, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 8300, + "tokens_per_second_per_gpu": 362.66 + }, + { + "epoch": 0.8261669234975394, + "grad_norm": 0.1988827884197235, + "learning_rate": 0.00012786851415063185, + "loss": 0.2141, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.37, + "memory/max_allocated (GiB)": 21.37, + "step": 8310, + "tokens_per_second_per_gpu": 404.61 + }, + { + "epoch": 0.8271611075210021, + "grad_norm": 0.482526570558548, + "learning_rate": 0.00012771775147458288, + "loss": 0.2341, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 8320, + "tokens_per_second_per_gpu": 366.94 + }, + { + "epoch": 0.8281552915444649, + "grad_norm": 0.5179364085197449, + "learning_rate": 0.0001275669205238537, + "loss": 0.2458, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 8330, + "tokens_per_second_per_gpu": 308.79 + }, + { + "epoch": 0.8291494755679276, + "grad_norm": 0.4961225390434265, + "learning_rate": 0.00012741602166997288, + "loss": 0.2324, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 8340, + "tokens_per_second_per_gpu": 312.63 + }, + { + "epoch": 0.8301436595913904, + "grad_norm": 0.6281317472457886, + "learning_rate": 0.0001272650552846362, + "loss": 0.2808, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 8350, + "tokens_per_second_per_gpu": 365.38 + }, + { + "epoch": 0.8311378436148531, + "grad_norm": 0.3268338739871979, + "learning_rate": 0.00012711402173970574, + "loss": 0.2125, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 8360, + "tokens_per_second_per_gpu": 381.45 + }, + { + "epoch": 0.8321320276383158, + "grad_norm": 0.5050214529037476, + "learning_rate": 0.00012696292140720907, + "loss": 0.3039, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.67, + "memory/max_allocated (GiB)": 19.67, + "step": 8370, + "tokens_per_second_per_gpu": 394.88 + }, + { + "epoch": 0.8331262116617786, + "grad_norm": 0.3680170178413391, + "learning_rate": 0.00012681175465933822, + "loss": 0.1876, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 8380, + "tokens_per_second_per_gpu": 310.52 + }, + { + "epoch": 0.8341203956852413, + "grad_norm": 0.6084402799606323, + "learning_rate": 0.00012666052186844883, + "loss": 0.2137, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 8390, + "tokens_per_second_per_gpu": 308.84 + }, + { + "epoch": 0.8351145797087041, + "grad_norm": 0.41174978017807007, + "learning_rate": 0.00012650922340705925, + "loss": 0.2423, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 8400, + "tokens_per_second_per_gpu": 389.04 + }, + { + "epoch": 0.8361087637321668, + "grad_norm": 0.3895050883293152, + "learning_rate": 0.0001263578596478496, + "loss": 0.2144, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.99, + "memory/max_allocated (GiB)": 19.99, + "step": 8410, + "tokens_per_second_per_gpu": 325.56 + }, + { + "epoch": 0.8371029477556295, + "grad_norm": 0.2492019683122635, + "learning_rate": 0.00012620643096366077, + "loss": 0.2292, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 8420, + "tokens_per_second_per_gpu": 312.76 + }, + { + "epoch": 0.8380971317790923, + "grad_norm": 0.5276951789855957, + "learning_rate": 0.0001260549377274936, + "loss": 0.2566, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 8430, + "tokens_per_second_per_gpu": 341.75 + }, + { + "epoch": 0.839091315802555, + "grad_norm": 0.3949599266052246, + "learning_rate": 0.00012590338031250796, + "loss": 0.2108, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 8440, + "tokens_per_second_per_gpu": 370.88 + }, + { + "epoch": 0.8400854998260178, + "grad_norm": 0.3476475179195404, + "learning_rate": 0.00012575175909202186, + "loss": 0.1811, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 8450, + "tokens_per_second_per_gpu": 336.59 + }, + { + "epoch": 0.8410796838494805, + "grad_norm": 0.26099368929862976, + "learning_rate": 0.00012560007443951032, + "loss": 0.2144, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 8460, + "tokens_per_second_per_gpu": 353.23 + }, + { + "epoch": 0.8420738678729432, + "grad_norm": 0.4799298346042633, + "learning_rate": 0.00012544832672860474, + "loss": 0.1781, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 8470, + "tokens_per_second_per_gpu": 360.11 + }, + { + "epoch": 0.843068051896406, + "grad_norm": 0.46986958384513855, + "learning_rate": 0.0001252965163330918, + "loss": 0.209, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 8480, + "tokens_per_second_per_gpu": 393.35 + }, + { + "epoch": 0.8440622359198687, + "grad_norm": 1.179408311843872, + "learning_rate": 0.00012514464362691258, + "loss": 0.2061, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 8490, + "tokens_per_second_per_gpu": 377.29 + }, + { + "epoch": 0.8450564199433315, + "grad_norm": 0.2782179117202759, + "learning_rate": 0.0001249927089841617, + "loss": 0.2643, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 8500, + "tokens_per_second_per_gpu": 352.65 + }, + { + "epoch": 0.8460506039667942, + "grad_norm": 0.5103908181190491, + "learning_rate": 0.00012484071277908622, + "loss": 0.2086, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 8510, + "tokens_per_second_per_gpu": 313.7 + }, + { + "epoch": 0.847044787990257, + "grad_norm": 0.34362590312957764, + "learning_rate": 0.000124688655386085, + "loss": 0.2545, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 8520, + "tokens_per_second_per_gpu": 357.23 + }, + { + "epoch": 0.8480389720137197, + "grad_norm": 0.5372098088264465, + "learning_rate": 0.00012453653717970747, + "loss": 0.2191, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.62, + "memory/max_allocated (GiB)": 20.62, + "step": 8530, + "tokens_per_second_per_gpu": 275.82 + }, + { + "epoch": 0.8490331560371824, + "grad_norm": 0.21182872354984283, + "learning_rate": 0.00012438435853465296, + "loss": 0.2291, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 8540, + "tokens_per_second_per_gpu": 394.59 + }, + { + "epoch": 0.8500273400606452, + "grad_norm": 0.4921363890171051, + "learning_rate": 0.0001242321198257696, + "loss": 0.2142, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 8550, + "tokens_per_second_per_gpu": 342.56 + }, + { + "epoch": 0.8510215240841079, + "grad_norm": 0.3340144157409668, + "learning_rate": 0.00012407982142805356, + "loss": 0.2034, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 8560, + "tokens_per_second_per_gpu": 381.5 + }, + { + "epoch": 0.8520157081075707, + "grad_norm": 0.45434701442718506, + "learning_rate": 0.00012392746371664797, + "loss": 0.2031, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 8570, + "tokens_per_second_per_gpu": 337.22 + }, + { + "epoch": 0.8530098921310335, + "grad_norm": 0.2646021544933319, + "learning_rate": 0.00012377504706684206, + "loss": 0.1807, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 8580, + "tokens_per_second_per_gpu": 369.87 + }, + { + "epoch": 0.8540040761544962, + "grad_norm": 0.4125272333621979, + "learning_rate": 0.00012362257185407022, + "loss": 0.2258, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 8590, + "tokens_per_second_per_gpu": 374.23 + }, + { + "epoch": 0.854998260177959, + "grad_norm": 0.3133401572704315, + "learning_rate": 0.00012347003845391118, + "loss": 0.1624, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 19.98, + "memory/max_allocated (GiB)": 19.98, + "step": 8600, + "tokens_per_second_per_gpu": 336.61 + }, + { + "epoch": 0.8559924442014217, + "grad_norm": 0.23082919418811798, + "learning_rate": 0.00012331744724208694, + "loss": 0.207, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 8610, + "tokens_per_second_per_gpu": 316.78 + }, + { + "epoch": 0.8569866282248845, + "grad_norm": 0.45513761043548584, + "learning_rate": 0.00012316479859446187, + "loss": 0.2465, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 8620, + "tokens_per_second_per_gpu": 355.67 + }, + { + "epoch": 0.8579808122483472, + "grad_norm": 0.4129338562488556, + "learning_rate": 0.00012301209288704184, + "loss": 0.2563, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.38, + "memory/max_allocated (GiB)": 21.38, + "step": 8630, + "tokens_per_second_per_gpu": 323.02 + }, + { + "epoch": 0.85897499627181, + "grad_norm": 0.37343230843544006, + "learning_rate": 0.00012285933049597335, + "loss": 0.154, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 8640, + "tokens_per_second_per_gpu": 326.66 + }, + { + "epoch": 0.8599691802952727, + "grad_norm": 0.32739633321762085, + "learning_rate": 0.00012270651179754243, + "loss": 0.2135, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 8650, + "tokens_per_second_per_gpu": 340.36 + }, + { + "epoch": 0.8609633643187354, + "grad_norm": 0.5008605718612671, + "learning_rate": 0.0001225536371681738, + "loss": 0.2221, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 8660, + "tokens_per_second_per_gpu": 295.75 + }, + { + "epoch": 0.8619575483421982, + "grad_norm": 2.8313498497009277, + "learning_rate": 0.00012240070698443, + "loss": 0.2402, + "memory/device_reserved (GiB)": 22.45, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 8670, + "tokens_per_second_per_gpu": 343.07 + }, + { + "epoch": 0.8629517323656609, + "grad_norm": 3.499013662338257, + "learning_rate": 0.00012224772162301042, + "loss": 0.2588, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.58, + "memory/max_allocated (GiB)": 21.58, + "step": 8680, + "tokens_per_second_per_gpu": 348.94 + }, + { + "epoch": 0.8639459163891237, + "grad_norm": 0.4069509208202362, + "learning_rate": 0.0001220946814607503, + "loss": 0.2485, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 8690, + "tokens_per_second_per_gpu": 365.05 + }, + { + "epoch": 0.8649401004125864, + "grad_norm": 0.5977867841720581, + "learning_rate": 0.00012194158687461992, + "loss": 0.2119, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 8700, + "tokens_per_second_per_gpu": 326.01 + }, + { + "epoch": 0.8659342844360491, + "grad_norm": 6.747653961181641, + "learning_rate": 0.00012178843824172361, + "loss": 0.2719, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.53, + "memory/max_allocated (GiB)": 20.53, + "step": 8710, + "tokens_per_second_per_gpu": 390.15 + }, + { + "epoch": 0.8669284684595119, + "grad_norm": 0.46154770255088806, + "learning_rate": 0.00012163523593929884, + "loss": 0.1836, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 8720, + "tokens_per_second_per_gpu": 343.33 + }, + { + "epoch": 0.8679226524829746, + "grad_norm": 0.272504985332489, + "learning_rate": 0.00012148198034471524, + "loss": 0.2419, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 8730, + "tokens_per_second_per_gpu": 347.62 + }, + { + "epoch": 0.8689168365064374, + "grad_norm": 0.4848499596118927, + "learning_rate": 0.00012132867183547372, + "loss": 0.2379, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 8740, + "tokens_per_second_per_gpu": 315.42 + }, + { + "epoch": 0.8699110205299001, + "grad_norm": 0.42282259464263916, + "learning_rate": 0.00012117531078920556, + "loss": 0.2358, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 8750, + "tokens_per_second_per_gpu": 404.19 + }, + { + "epoch": 0.8709052045533628, + "grad_norm": 0.5138429999351501, + "learning_rate": 0.00012102189758367142, + "loss": 0.2602, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 8760, + "tokens_per_second_per_gpu": 350.33 + }, + { + "epoch": 0.8718993885768256, + "grad_norm": 0.2951951324939728, + "learning_rate": 0.00012086843259676041, + "loss": 0.303, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 8770, + "tokens_per_second_per_gpu": 396.42 + }, + { + "epoch": 0.8728935726002883, + "grad_norm": 0.4597817361354828, + "learning_rate": 0.00012071491620648934, + "loss": 0.2519, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.68, + "memory/max_allocated (GiB)": 19.68, + "step": 8780, + "tokens_per_second_per_gpu": 338.59 + }, + { + "epoch": 0.8738877566237511, + "grad_norm": 0.13618378341197968, + "learning_rate": 0.00012056134879100138, + "loss": 0.2235, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.43, + "memory/max_allocated (GiB)": 20.43, + "step": 8790, + "tokens_per_second_per_gpu": 325.39 + }, + { + "epoch": 0.8748819406472138, + "grad_norm": 0.5435792207717896, + "learning_rate": 0.00012040773072856566, + "loss": 0.2563, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 8800, + "tokens_per_second_per_gpu": 346.95 + }, + { + "epoch": 0.8758761246706765, + "grad_norm": 0.24607907235622406, + "learning_rate": 0.00012025406239757588, + "loss": 0.1721, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 8810, + "tokens_per_second_per_gpu": 305.91 + }, + { + "epoch": 0.8768703086941393, + "grad_norm": 0.3879210650920868, + "learning_rate": 0.00012010034417654962, + "loss": 0.2026, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 8820, + "tokens_per_second_per_gpu": 305.29 + }, + { + "epoch": 0.877864492717602, + "grad_norm": 0.4256599545478821, + "learning_rate": 0.00011994657644412734, + "loss": 0.1985, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 8830, + "tokens_per_second_per_gpu": 330.52 + }, + { + "epoch": 0.8788586767410648, + "grad_norm": 0.2512848973274231, + "learning_rate": 0.00011979275957907146, + "loss": 0.2153, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 8840, + "tokens_per_second_per_gpu": 321.37 + }, + { + "epoch": 0.8798528607645275, + "grad_norm": 0.38337549567222595, + "learning_rate": 0.00011963889396026547, + "loss": 0.2383, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 8850, + "tokens_per_second_per_gpu": 359.62 + }, + { + "epoch": 0.8808470447879903, + "grad_norm": 0.3447147309780121, + "learning_rate": 0.00011948497996671286, + "loss": 0.2304, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 8860, + "tokens_per_second_per_gpu": 332.81 + }, + { + "epoch": 0.881841228811453, + "grad_norm": 0.40846845507621765, + "learning_rate": 0.00011933101797753637, + "loss": 0.2297, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 8870, + "tokens_per_second_per_gpu": 384.1 + }, + { + "epoch": 0.8828354128349157, + "grad_norm": 0.5454410910606384, + "learning_rate": 0.0001191770083719769, + "loss": 0.2242, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 8880, + "tokens_per_second_per_gpu": 387.99 + }, + { + "epoch": 0.8838295968583785, + "grad_norm": 0.4706827700138092, + "learning_rate": 0.00011902295152939262, + "loss": 0.2381, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 8890, + "tokens_per_second_per_gpu": 304.64 + }, + { + "epoch": 0.8848237808818412, + "grad_norm": 0.6073552370071411, + "learning_rate": 0.00011886884782925816, + "loss": 0.2417, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 8900, + "tokens_per_second_per_gpu": 315.52 + }, + { + "epoch": 0.885817964905304, + "grad_norm": 0.3200027644634247, + "learning_rate": 0.00011871469765116346, + "loss": 0.2117, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 8910, + "tokens_per_second_per_gpu": 306.02 + }, + { + "epoch": 0.8868121489287667, + "grad_norm": 0.42333486676216125, + "learning_rate": 0.00011856050137481301, + "loss": 0.2552, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 8920, + "tokens_per_second_per_gpu": 355.23 + }, + { + "epoch": 0.8878063329522294, + "grad_norm": 0.2578692138195038, + "learning_rate": 0.00011840625938002481, + "loss": 0.1743, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 8930, + "tokens_per_second_per_gpu": 356.78 + }, + { + "epoch": 0.8888005169756922, + "grad_norm": 0.3321487605571747, + "learning_rate": 0.00011825197204672952, + "loss": 0.2637, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 8940, + "tokens_per_second_per_gpu": 424.07 + }, + { + "epoch": 0.8897947009991549, + "grad_norm": 0.3663140833377838, + "learning_rate": 0.00011809763975496944, + "loss": 0.2272, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 8950, + "tokens_per_second_per_gpu": 360.15 + }, + { + "epoch": 0.8907888850226177, + "grad_norm": 0.2619111239910126, + "learning_rate": 0.00011794326288489761, + "loss": 0.2723, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.03, + "memory/max_allocated (GiB)": 20.03, + "step": 8960, + "tokens_per_second_per_gpu": 357.64 + }, + { + "epoch": 0.8917830690460804, + "grad_norm": 0.4359273612499237, + "learning_rate": 0.0001177888418167769, + "loss": 0.2633, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.79, + "memory/max_allocated (GiB)": 19.79, + "step": 8970, + "tokens_per_second_per_gpu": 406.63 + }, + { + "epoch": 0.8927772530695431, + "grad_norm": 0.29716983437538147, + "learning_rate": 0.00011763437693097903, + "loss": 0.2789, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 8980, + "tokens_per_second_per_gpu": 386.97 + }, + { + "epoch": 0.8937714370930059, + "grad_norm": 0.6009801626205444, + "learning_rate": 0.00011747986860798368, + "loss": 0.2057, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.43, + "memory/max_allocated (GiB)": 20.43, + "step": 8990, + "tokens_per_second_per_gpu": 318.77 + }, + { + "epoch": 0.8947656211164686, + "grad_norm": 0.4606180191040039, + "learning_rate": 0.0001173253172283775, + "loss": 0.2232, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 9000, + "tokens_per_second_per_gpu": 283.3 + }, + { + "epoch": 0.8957598051399314, + "grad_norm": 0.3887878358364105, + "learning_rate": 0.00011717072317285318, + "loss": 0.2175, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 9010, + "tokens_per_second_per_gpu": 313.78 + }, + { + "epoch": 0.8967539891633941, + "grad_norm": 0.3322821855545044, + "learning_rate": 0.0001170160868222086, + "loss": 0.2491, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 9020, + "tokens_per_second_per_gpu": 339.47 + }, + { + "epoch": 0.8977481731868568, + "grad_norm": 0.5119565725326538, + "learning_rate": 0.00011686140855734571, + "loss": 0.2568, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.66, + "memory/max_allocated (GiB)": 19.66, + "step": 9030, + "tokens_per_second_per_gpu": 322.62 + }, + { + "epoch": 0.8987423572103196, + "grad_norm": 0.32013964653015137, + "learning_rate": 0.00011670668875926982, + "loss": 0.3019, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9040, + "tokens_per_second_per_gpu": 356.77 + }, + { + "epoch": 0.8997365412337823, + "grad_norm": 0.19132547080516815, + "learning_rate": 0.00011655192780908849, + "loss": 0.1927, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.16, + "memory/max_allocated (GiB)": 18.16, + "step": 9050, + "tokens_per_second_per_gpu": 306.79 + }, + { + "epoch": 0.9007307252572451, + "grad_norm": 0.4459245502948761, + "learning_rate": 0.00011639712608801059, + "loss": 0.2013, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 9060, + "tokens_per_second_per_gpu": 369.24 + }, + { + "epoch": 0.9017249092807078, + "grad_norm": 0.3540112376213074, + "learning_rate": 0.00011624228397734556, + "loss": 0.1513, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 9070, + "tokens_per_second_per_gpu": 323.78 + }, + { + "epoch": 0.9027190933041706, + "grad_norm": 0.3951474130153656, + "learning_rate": 0.00011608740185850219, + "loss": 0.2055, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 9080, + "tokens_per_second_per_gpu": 358.92 + }, + { + "epoch": 0.9037132773276333, + "grad_norm": 0.3915760815143585, + "learning_rate": 0.00011593248011298791, + "loss": 0.2148, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 9090, + "tokens_per_second_per_gpu": 397.61 + }, + { + "epoch": 0.9047074613510961, + "grad_norm": 0.49453213810920715, + "learning_rate": 0.00011577751912240771, + "loss": 0.187, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9100, + "tokens_per_second_per_gpu": 379.24 + }, + { + "epoch": 0.9057016453745589, + "grad_norm": 0.5185866951942444, + "learning_rate": 0.00011562251926846326, + "loss": 0.219, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 9110, + "tokens_per_second_per_gpu": 307.18 + }, + { + "epoch": 0.9066958293980216, + "grad_norm": 0.44589415192604065, + "learning_rate": 0.00011546748093295195, + "loss": 0.2127, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 9120, + "tokens_per_second_per_gpu": 360.35 + }, + { + "epoch": 0.9076900134214844, + "grad_norm": 0.4594690501689911, + "learning_rate": 0.00011531240449776594, + "loss": 0.2057, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 9130, + "tokens_per_second_per_gpu": 351.45 + }, + { + "epoch": 0.9086841974449471, + "grad_norm": 0.4069642722606659, + "learning_rate": 0.00011515729034489133, + "loss": 0.2213, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 9140, + "tokens_per_second_per_gpu": 331.15 + }, + { + "epoch": 0.9096783814684098, + "grad_norm": 0.5103911757469177, + "learning_rate": 0.00011500213885640705, + "loss": 0.2258, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 9150, + "tokens_per_second_per_gpu": 323.85 + }, + { + "epoch": 0.9106725654918726, + "grad_norm": 0.5270497798919678, + "learning_rate": 0.00011484695041448399, + "loss": 0.2709, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 9160, + "tokens_per_second_per_gpu": 363.69 + }, + { + "epoch": 0.9116667495153353, + "grad_norm": 0.48765021562576294, + "learning_rate": 0.00011469172540138407, + "loss": 0.1935, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.39, + "memory/max_allocated (GiB)": 21.39, + "step": 9170, + "tokens_per_second_per_gpu": 329.09 + }, + { + "epoch": 0.9126609335387981, + "grad_norm": 0.4249947667121887, + "learning_rate": 0.00011453646419945934, + "loss": 0.2296, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.77, + "memory/max_allocated (GiB)": 19.77, + "step": 9180, + "tokens_per_second_per_gpu": 412.86 + }, + { + "epoch": 0.9136551175622608, + "grad_norm": 0.36251431703567505, + "learning_rate": 0.00011438116719115089, + "loss": 0.2361, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 9190, + "tokens_per_second_per_gpu": 343.29 + }, + { + "epoch": 0.9146493015857236, + "grad_norm": 0.3858913481235504, + "learning_rate": 0.00011422583475898814, + "loss": 0.2446, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 9200, + "tokens_per_second_per_gpu": 331.61 + }, + { + "epoch": 0.9156434856091863, + "grad_norm": 0.43422338366508484, + "learning_rate": 0.00011407046728558768, + "loss": 0.1683, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 9210, + "tokens_per_second_per_gpu": 303.63 + }, + { + "epoch": 0.916637669632649, + "grad_norm": 0.43746358156204224, + "learning_rate": 0.00011391506515365245, + "loss": 0.1423, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 9220, + "tokens_per_second_per_gpu": 337.04 + }, + { + "epoch": 0.9176318536561118, + "grad_norm": 0.30726879835128784, + "learning_rate": 0.00011375962874597073, + "loss": 0.179, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9230, + "tokens_per_second_per_gpu": 312.89 + }, + { + "epoch": 0.9186260376795745, + "grad_norm": 0.5088397264480591, + "learning_rate": 0.00011360415844541523, + "loss": 0.2571, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9240, + "tokens_per_second_per_gpu": 359.77 + }, + { + "epoch": 0.9196202217030373, + "grad_norm": 0.545219898223877, + "learning_rate": 0.00011344865463494219, + "loss": 0.2228, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 9250, + "tokens_per_second_per_gpu": 337.28 + }, + { + "epoch": 0.9206144057265, + "grad_norm": 0.5700411796569824, + "learning_rate": 0.00011329311769759035, + "loss": 0.2236, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 9260, + "tokens_per_second_per_gpu": 340.23 + }, + { + "epoch": 0.9216085897499627, + "grad_norm": 0.3975035548210144, + "learning_rate": 0.00011313754801648003, + "loss": 0.2487, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 9270, + "tokens_per_second_per_gpu": 345.97 + }, + { + "epoch": 0.9226027737734255, + "grad_norm": 0.5616829991340637, + "learning_rate": 0.00011298194597481226, + "loss": 0.2511, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9280, + "tokens_per_second_per_gpu": 331.41 + }, + { + "epoch": 0.9235969577968882, + "grad_norm": 0.38982534408569336, + "learning_rate": 0.00011282631195586777, + "loss": 0.2809, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 9290, + "tokens_per_second_per_gpu": 361.76 + }, + { + "epoch": 0.924591141820351, + "grad_norm": 0.531318187713623, + "learning_rate": 0.00011267064634300603, + "loss": 0.2608, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9300, + "tokens_per_second_per_gpu": 393.15 + }, + { + "epoch": 0.9255853258438137, + "grad_norm": 0.5956375002861023, + "learning_rate": 0.00011251494951966437, + "loss": 0.2229, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 9310, + "tokens_per_second_per_gpu": 284.92 + }, + { + "epoch": 0.9265795098672764, + "grad_norm": 0.42666998505592346, + "learning_rate": 0.0001123592218693569, + "loss": 0.1985, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 9320, + "tokens_per_second_per_gpu": 362.3 + }, + { + "epoch": 0.9275736938907392, + "grad_norm": 0.4182765781879425, + "learning_rate": 0.00011220346377567381, + "loss": 0.2535, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 9330, + "tokens_per_second_per_gpu": 348.83 + }, + { + "epoch": 0.9285678779142019, + "grad_norm": 0.5867879390716553, + "learning_rate": 0.00011204767562228017, + "loss": 0.2309, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 9340, + "tokens_per_second_per_gpu": 334.46 + }, + { + "epoch": 0.9295620619376647, + "grad_norm": 0.27041056752204895, + "learning_rate": 0.00011189185779291515, + "loss": 0.232, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 9350, + "tokens_per_second_per_gpu": 331.56 + }, + { + "epoch": 0.9305562459611274, + "grad_norm": 0.5081501603126526, + "learning_rate": 0.00011173601067139099, + "loss": 0.2399, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 9360, + "tokens_per_second_per_gpu": 336.84 + }, + { + "epoch": 0.9315504299845901, + "grad_norm": 0.7966908812522888, + "learning_rate": 0.00011158013464159208, + "loss": 0.2606, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.39, + "memory/max_allocated (GiB)": 21.39, + "step": 9370, + "tokens_per_second_per_gpu": 313.66 + }, + { + "epoch": 0.9325446140080529, + "grad_norm": 0.45302364230155945, + "learning_rate": 0.00011142423008747403, + "loss": 0.1581, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 9380, + "tokens_per_second_per_gpu": 322.72 + }, + { + "epoch": 0.9335387980315156, + "grad_norm": 0.5959784984588623, + "learning_rate": 0.00011126829739306271, + "loss": 0.2115, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.1, + "memory/max_allocated (GiB)": 19.1, + "step": 9390, + "tokens_per_second_per_gpu": 304.85 + }, + { + "epoch": 0.9345329820549784, + "grad_norm": 0.5530646443367004, + "learning_rate": 0.00011111233694245328, + "loss": 0.1854, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 9400, + "tokens_per_second_per_gpu": 347.73 + }, + { + "epoch": 0.9355271660784411, + "grad_norm": 0.4127480387687683, + "learning_rate": 0.00011095634911980933, + "loss": 0.2307, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 9410, + "tokens_per_second_per_gpu": 325.27 + }, + { + "epoch": 0.9365213501019038, + "grad_norm": 0.4038192629814148, + "learning_rate": 0.0001108003343093618, + "loss": 0.1953, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 9420, + "tokens_per_second_per_gpu": 327.3 + }, + { + "epoch": 0.9375155341253666, + "grad_norm": 0.4245215654373169, + "learning_rate": 0.00011064429289540821, + "loss": 0.2505, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9430, + "tokens_per_second_per_gpu": 337.4 + }, + { + "epoch": 0.9385097181488293, + "grad_norm": 0.4768611192703247, + "learning_rate": 0.00011048822526231148, + "loss": 0.1584, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.83, + "memory/max_allocated (GiB)": 18.83, + "step": 9440, + "tokens_per_second_per_gpu": 314.23 + }, + { + "epoch": 0.9395039021722921, + "grad_norm": 0.3262840807437897, + "learning_rate": 0.00011033213179449917, + "loss": 0.2287, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.21, + "memory/max_allocated (GiB)": 19.21, + "step": 9450, + "tokens_per_second_per_gpu": 358.18 + }, + { + "epoch": 0.9404980861957548, + "grad_norm": 0.5882810354232788, + "learning_rate": 0.00011017601287646251, + "loss": 0.206, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9460, + "tokens_per_second_per_gpu": 405.83 + }, + { + "epoch": 0.9414922702192176, + "grad_norm": 0.4905533492565155, + "learning_rate": 0.0001100198688927554, + "loss": 0.2209, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 9470, + "tokens_per_second_per_gpu": 327.29 + }, + { + "epoch": 0.9424864542426803, + "grad_norm": 0.5608656406402588, + "learning_rate": 0.00010986370022799346, + "loss": 0.2418, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 9480, + "tokens_per_second_per_gpu": 335.39 + }, + { + "epoch": 0.943480638266143, + "grad_norm": 0.4696904718875885, + "learning_rate": 0.00010970750726685309, + "loss": 0.2742, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 9490, + "tokens_per_second_per_gpu": 399.98 + }, + { + "epoch": 0.9444748222896058, + "grad_norm": 0.423969030380249, + "learning_rate": 0.00010955129039407062, + "loss": 0.2259, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 9500, + "tokens_per_second_per_gpu": 336.84 + }, + { + "epoch": 0.9454690063130685, + "grad_norm": 0.438812792301178, + "learning_rate": 0.0001093950499944412, + "loss": 0.1855, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.22, + "memory/max_allocated (GiB)": 19.22, + "step": 9510, + "tokens_per_second_per_gpu": 318.59 + }, + { + "epoch": 0.9464631903365313, + "grad_norm": 0.5361111164093018, + "learning_rate": 0.00010923878645281794, + "loss": 0.2713, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9520, + "tokens_per_second_per_gpu": 354.0 + }, + { + "epoch": 0.947457374359994, + "grad_norm": 0.39556553959846497, + "learning_rate": 0.000109082500154111, + "loss": 0.2539, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.44, + "memory/max_allocated (GiB)": 20.44, + "step": 9530, + "tokens_per_second_per_gpu": 342.96 + }, + { + "epoch": 0.9484515583834567, + "grad_norm": 0.42129939794540405, + "learning_rate": 0.00010892619148328654, + "loss": 0.2282, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9540, + "tokens_per_second_per_gpu": 375.18 + }, + { + "epoch": 0.9494457424069195, + "grad_norm": 0.49444761872291565, + "learning_rate": 0.00010876986082536584, + "loss": 0.2342, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.68, + "memory/max_allocated (GiB)": 19.68, + "step": 9550, + "tokens_per_second_per_gpu": 394.82 + }, + { + "epoch": 0.9504399264303822, + "grad_norm": 0.3940083682537079, + "learning_rate": 0.0001086135085654244, + "loss": 0.1993, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.52, + "memory/max_allocated (GiB)": 21.52, + "step": 9560, + "tokens_per_second_per_gpu": 346.47 + }, + { + "epoch": 0.951434110453845, + "grad_norm": 0.40422961115837097, + "learning_rate": 0.00010845713508859088, + "loss": 0.2479, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 9570, + "tokens_per_second_per_gpu": 338.94 + }, + { + "epoch": 0.9524282944773077, + "grad_norm": 0.5008521676063538, + "learning_rate": 0.00010830074078004615, + "loss": 0.2217, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 9580, + "tokens_per_second_per_gpu": 390.26 + }, + { + "epoch": 0.9534224785007704, + "grad_norm": 0.37146544456481934, + "learning_rate": 0.00010814432602502246, + "loss": 0.2178, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 18.17, + "memory/max_allocated (GiB)": 18.17, + "step": 9590, + "tokens_per_second_per_gpu": 347.55 + }, + { + "epoch": 0.9544166625242332, + "grad_norm": 0.4229485094547272, + "learning_rate": 0.00010798789120880246, + "loss": 0.2183, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 9600, + "tokens_per_second_per_gpu": 311.66 + }, + { + "epoch": 0.9554108465476959, + "grad_norm": 0.38394778966903687, + "learning_rate": 0.00010783143671671813, + "loss": 0.2663, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.57, + "memory/max_allocated (GiB)": 20.57, + "step": 9610, + "tokens_per_second_per_gpu": 329.51 + }, + { + "epoch": 0.9564050305711588, + "grad_norm": 0.4587366580963135, + "learning_rate": 0.00010767496293414996, + "loss": 0.2776, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9620, + "tokens_per_second_per_gpu": 464.41 + }, + { + "epoch": 0.9573992145946215, + "grad_norm": 0.5589081645011902, + "learning_rate": 0.0001075184702465259, + "loss": 0.2387, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.47, + "memory/max_allocated (GiB)": 21.47, + "step": 9630, + "tokens_per_second_per_gpu": 358.44 + }, + { + "epoch": 0.9583933986180843, + "grad_norm": 0.6210941672325134, + "learning_rate": 0.0001073619590393206, + "loss": 0.2587, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9640, + "tokens_per_second_per_gpu": 373.16 + }, + { + "epoch": 0.959387582641547, + "grad_norm": 0.6241095066070557, + "learning_rate": 0.0001072054296980542, + "loss": 0.2378, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 9650, + "tokens_per_second_per_gpu": 385.04 + }, + { + "epoch": 0.9603817666650097, + "grad_norm": 0.40504932403564453, + "learning_rate": 0.00010704888260829156, + "loss": 0.2767, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 9660, + "tokens_per_second_per_gpu": 384.99 + }, + { + "epoch": 0.9613759506884725, + "grad_norm": 0.4956580400466919, + "learning_rate": 0.0001068923181556412, + "loss": 0.2388, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 9670, + "tokens_per_second_per_gpu": 378.96 + }, + { + "epoch": 0.9623701347119352, + "grad_norm": 0.2848256230354309, + "learning_rate": 0.00010673573672575454, + "loss": 0.2238, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 9680, + "tokens_per_second_per_gpu": 359.23 + }, + { + "epoch": 0.963364318735398, + "grad_norm": 0.38930168747901917, + "learning_rate": 0.00010657913870432468, + "loss": 0.2305, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.78, + "memory/max_allocated (GiB)": 19.78, + "step": 9690, + "tokens_per_second_per_gpu": 309.48 + }, + { + "epoch": 0.9643585027588607, + "grad_norm": 0.4527674913406372, + "learning_rate": 0.00010642252447708563, + "loss": 0.2731, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 9700, + "tokens_per_second_per_gpu": 349.69 + }, + { + "epoch": 0.9653526867823234, + "grad_norm": 0.4257194697856903, + "learning_rate": 0.00010626589442981138, + "loss": 0.2635, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.63, + "memory/max_allocated (GiB)": 20.63, + "step": 9710, + "tokens_per_second_per_gpu": 414.72 + }, + { + "epoch": 0.9663468708057862, + "grad_norm": 0.3962489068508148, + "learning_rate": 0.00010610924894831483, + "loss": 0.2862, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 9720, + "tokens_per_second_per_gpu": 365.4 + }, + { + "epoch": 0.9673410548292489, + "grad_norm": 0.29012176394462585, + "learning_rate": 0.00010595258841844688, + "loss": 0.2353, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.72, + "memory/max_allocated (GiB)": 20.72, + "step": 9730, + "tokens_per_second_per_gpu": 364.81 + }, + { + "epoch": 0.9683352388527117, + "grad_norm": 0.5301884412765503, + "learning_rate": 0.00010579591322609559, + "loss": 0.1947, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.61, + "memory/max_allocated (GiB)": 20.61, + "step": 9740, + "tokens_per_second_per_gpu": 369.66 + }, + { + "epoch": 0.9693294228761744, + "grad_norm": 0.41399112343788147, + "learning_rate": 0.000105639223757185, + "loss": 0.1859, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.98, + "memory/max_allocated (GiB)": 19.98, + "step": 9750, + "tokens_per_second_per_gpu": 314.55 + }, + { + "epoch": 0.9703236068996371, + "grad_norm": 0.3453201353549957, + "learning_rate": 0.00010548252039767443, + "loss": 0.1971, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 9760, + "tokens_per_second_per_gpu": 368.71 + }, + { + "epoch": 0.9713177909230999, + "grad_norm": 0.5453227162361145, + "learning_rate": 0.00010532580353355734, + "loss": 0.3006, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.47, + "memory/max_allocated (GiB)": 21.47, + "step": 9770, + "tokens_per_second_per_gpu": 420.65 + }, + { + "epoch": 0.9723119749465626, + "grad_norm": 0.42438754439353943, + "learning_rate": 0.00010516907355086055, + "loss": 0.2027, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.51, + "memory/max_allocated (GiB)": 21.51, + "step": 9780, + "tokens_per_second_per_gpu": 396.2 + }, + { + "epoch": 0.9733061589700254, + "grad_norm": 0.5116370916366577, + "learning_rate": 0.00010501233083564306, + "loss": 0.1968, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 9790, + "tokens_per_second_per_gpu": 345.38 + }, + { + "epoch": 0.9743003429934881, + "grad_norm": 0.23473972082138062, + "learning_rate": 0.00010485557577399536, + "loss": 0.1676, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 9800, + "tokens_per_second_per_gpu": 386.72 + }, + { + "epoch": 0.9752945270169509, + "grad_norm": 0.5427513718605042, + "learning_rate": 0.00010469880875203827, + "loss": 0.2066, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 9810, + "tokens_per_second_per_gpu": 372.28 + }, + { + "epoch": 0.9762887110404136, + "grad_norm": 0.4717273712158203, + "learning_rate": 0.00010454203015592214, + "loss": 0.2097, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 9820, + "tokens_per_second_per_gpu": 317.9 + }, + { + "epoch": 0.9772828950638763, + "grad_norm": 0.2797994017601013, + "learning_rate": 0.00010438524037182573, + "loss": 0.3073, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9830, + "tokens_per_second_per_gpu": 371.67 + }, + { + "epoch": 0.9782770790873391, + "grad_norm": 0.3694850504398346, + "learning_rate": 0.00010422843978595542, + "loss": 0.2309, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.2, + "memory/max_allocated (GiB)": 19.2, + "step": 9840, + "tokens_per_second_per_gpu": 426.94 + }, + { + "epoch": 0.9792712631108018, + "grad_norm": 0.3863958716392517, + "learning_rate": 0.00010407162878454423, + "loss": 0.2697, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 9850, + "tokens_per_second_per_gpu": 338.41 + }, + { + "epoch": 0.9802654471342646, + "grad_norm": 0.22021318972110748, + "learning_rate": 0.00010391480775385078, + "loss": 0.1866, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.5, + "memory/max_allocated (GiB)": 21.5, + "step": 9860, + "tokens_per_second_per_gpu": 331.0 + }, + { + "epoch": 0.9812596311577273, + "grad_norm": 0.3698722720146179, + "learning_rate": 0.00010375797708015844, + "loss": 0.1992, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.76, + "memory/max_allocated (GiB)": 20.76, + "step": 9870, + "tokens_per_second_per_gpu": 328.05 + }, + { + "epoch": 0.98225381518119, + "grad_norm": 0.45455843210220337, + "learning_rate": 0.00010360113714977428, + "loss": 0.2508, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.39, + "memory/max_allocated (GiB)": 21.39, + "step": 9880, + "tokens_per_second_per_gpu": 409.92 + }, + { + "epoch": 0.9832479992046528, + "grad_norm": 0.3562302887439728, + "learning_rate": 0.00010344428834902822, + "loss": 0.187, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 9890, + "tokens_per_second_per_gpu": 262.76 + }, + { + "epoch": 0.9842421832281155, + "grad_norm": 0.5735921263694763, + "learning_rate": 0.00010328743106427197, + "loss": 0.2517, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.62, + "memory/max_allocated (GiB)": 20.62, + "step": 9900, + "tokens_per_second_per_gpu": 338.79 + }, + { + "epoch": 0.9852363672515783, + "grad_norm": 0.47819098830223083, + "learning_rate": 0.00010313056568187818, + "loss": 0.2298, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 9910, + "tokens_per_second_per_gpu": 351.88 + }, + { + "epoch": 0.986230551275041, + "grad_norm": 0.29087749123573303, + "learning_rate": 0.00010297369258823948, + "loss": 0.1617, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 9920, + "tokens_per_second_per_gpu": 311.57 + }, + { + "epoch": 0.9872247352985037, + "grad_norm": 0.5746156573295593, + "learning_rate": 0.00010281681216976742, + "loss": 0.2215, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 9930, + "tokens_per_second_per_gpu": 360.68 + }, + { + "epoch": 0.9882189193219665, + "grad_norm": 0.5306881666183472, + "learning_rate": 0.00010265992481289164, + "loss": 0.2165, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.56, + "memory/max_allocated (GiB)": 20.56, + "step": 9940, + "tokens_per_second_per_gpu": 350.01 + }, + { + "epoch": 0.9892131033454292, + "grad_norm": 0.5628572106361389, + "learning_rate": 0.00010250303090405886, + "loss": 0.2246, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.75, + "memory/max_allocated (GiB)": 20.75, + "step": 9950, + "tokens_per_second_per_gpu": 331.08 + }, + { + "epoch": 0.990207287368892, + "grad_norm": 0.38296106457710266, + "learning_rate": 0.00010234613082973195, + "loss": 0.2294, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 9960, + "tokens_per_second_per_gpu": 310.12 + }, + { + "epoch": 0.9912014713923547, + "grad_norm": 0.5048671960830688, + "learning_rate": 0.00010218922497638893, + "loss": 0.2263, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 9970, + "tokens_per_second_per_gpu": 360.03 + }, + { + "epoch": 0.9921956554158174, + "grad_norm": 0.2620450258255005, + "learning_rate": 0.00010203231373052205, + "loss": 0.2391, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.37, + "memory/max_allocated (GiB)": 21.37, + "step": 9980, + "tokens_per_second_per_gpu": 397.95 + }, + { + "epoch": 0.9931898394392802, + "grad_norm": 0.37807098031044006, + "learning_rate": 0.00010187539747863693, + "loss": 0.2654, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.54, + "memory/max_allocated (GiB)": 20.54, + "step": 9990, + "tokens_per_second_per_gpu": 413.64 + }, + { + "epoch": 0.9941840234627429, + "grad_norm": 0.3059203624725342, + "learning_rate": 0.00010171847660725147, + "loss": 0.2081, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.48, + "memory/max_allocated (GiB)": 21.48, + "step": 10000, + "tokens_per_second_per_gpu": 317.3 + }, + { + "epoch": 0.9951782074862057, + "grad_norm": 0.38892289996147156, + "learning_rate": 0.0001015615515028949, + "loss": 0.2826, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 21.49, + "memory/max_allocated (GiB)": 21.49, + "step": 10010, + "tokens_per_second_per_gpu": 379.68 + }, + { + "epoch": 0.9961723915096684, + "grad_norm": 0.22751818597316742, + "learning_rate": 0.00010140462255210696, + "loss": 0.2674, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.55, + "memory/max_allocated (GiB)": 20.55, + "step": 10020, + "tokens_per_second_per_gpu": 362.08 + }, + { + "epoch": 0.9971665755331311, + "grad_norm": 0.3578786551952362, + "learning_rate": 0.00010124769014143678, + "loss": 0.2125, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 19.77, + "memory/max_allocated (GiB)": 19.77, + "step": 10030, + "tokens_per_second_per_gpu": 367.81 + }, + { + "epoch": 0.9981607595565939, + "grad_norm": 0.32971230149269104, + "learning_rate": 0.00010109075465744208, + "loss": 0.1599, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.74, + "memory/max_allocated (GiB)": 20.74, + "step": 10040, + "tokens_per_second_per_gpu": 336.7 + }, + { + "epoch": 0.9991549435800566, + "grad_norm": 0.4897179901599884, + "learning_rate": 0.00010093381648668813, + "loss": 0.2485, + "memory/device_reserved (GiB)": 22.49, + "memory/max_active (GiB)": 20.73, + "memory/max_allocated (GiB)": 20.73, + "step": 10050, + "tokens_per_second_per_gpu": 303.41 + } + ], + "logging_steps": 10, + "max_steps": 20117, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1187739692210586e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}