diff --git "a/slurm.out" "b/slurm.out" new file mode 100644--- /dev/null +++ "b/slurm.out" @@ -0,0 +1,3023 @@ +1: [2025-08-06 23:16:07,347] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:118] [PID:2862582] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing` +3: [2025-08-06 23:16:07,347] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:118] [PID:391404] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing` +2: [2025-08-06 23:16:07,347] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:118] [PID:1823156] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing` +0: [2025-08-06 23:16:07,349] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:118] [PID:411490] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing` +1: [2025-08-06 23:16:07,347] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:217] [PID:2862582] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing +3: [2025-08-06 23:16:07,347] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:217] [PID:391404] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing +2: [2025-08-06 23:16:07,348] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:217] [PID:1823156] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing +0: [2025-08-06 23:16:07,349] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:217] [PID:411490] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing +0: [2025-08-06 23:16:16,186] [INFO] [axolotl.cli.config.load_cfg:244] [PID:411490] [RANK:0] config: +0: { +0: "activation_offloading": false, +0: "auto_resume_from_checkpoints": true, +0: "axolotl_config_path": "/lustre/fswork/projects/rech/dgo/udv55np/train/tmp/1754476677918612596.yaml", +0: "base_model": "/lustre/fswork/projects/rech/qwv/udv55np/Qwen/Qwen2.5-0.5B", +0: "base_model_config": "/lustre/fswork/projects/rech/qwv/udv55np/Qwen/Qwen2.5-0.5B", +0: "batch_size": 16, +0: "bf16": true, +0: "capabilities": { +0: "bf16": true, +0: "compute_capability": "sm_90", +0: "fp8": false, +0: "n_gpu": 4, +0: "n_node": 1 +0: }, +0: "chat_template": "qwen_25", +0: "dataloader_num_workers": 4, +0: "dataloader_pin_memory": true, +0: "dataloader_prefetch_factor": 256, +0: "dataset_prepared_path": "/lustre/fsn1/projects/rech/dgo/udv55np/dataset/Qwen3-235B-A22B/Qwen2.5-0.5B/1", +0: "dataset_processes": 192, +0: "datasets": [ +0: { +0: "chat_template": "tokenizer_default", +0: "data_files": [ +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0007.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0009.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0005.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0006.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0014.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0010.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0012.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0008.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0001.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0002.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0013.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0015.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0004.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0011.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0000.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking/0003.jsonl" +0: ], +0: "ds_type": "json", +0: "field_messages": "conversations", +0: "message_property_mappings": { +0: "content": "content", +0: "role": "role" +0: }, +0: "path": "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Qwen3-235B-A22B/thinking", +0: "trust_remote_code": false, +0: "type": "chat_template" +0: } +0: ], +0: "ddp": true, +0: "deepspeed": { +0: "bf16": { +0: "enabled": true +0: }, +0: "gradient_accumulation_steps": "auto", +0: "gradient_clipping": "auto", +0: "train_batch_size": "auto", +0: "train_micro_batch_size_per_gpu": "auto", +0: "wall_clock_breakdown": false, +0: "zero_optimization": { +0: "contiguous_gradients": true, +0: "overlap_comm": true, +0: "reduce_bucket_size": "auto", +0: "stage": 3, +0: "stage3_gather_16bit_weights_on_model_save": true, +0: "stage3_param_persistence_threshold": "auto", +0: "stage3_prefetch_bucket_size": "auto", +0: "sub_group_size": 0 +0: } +0: }, +0: "device": "cuda:0", +0: "device_map": { +0: "": 0 +0: }, +0: "env_capabilities": { +0: "torch_version": "2.6.0" +0: }, +0: "eval_batch_size": 1, +0: "eval_causal_lm_metrics": [ +0: "sacrebleu", +0: "comet", +0: "ter", +0: "chrf" +0: ], +0: "eval_max_new_tokens": 128, +0: "eval_sample_packing": true, +0: "eval_table_size": 0, +0: "evals_per_epoch": 0, +0: "flash_attention": true, +0: "fp16": false, +0: "gradient_accumulation_steps": 4, +0: "gradient_checkpointing": true, +0: "gradient_checkpointing_kwargs": { +0: "use_reentrant": true +0: }, +0: "learning_rate": 2e-05, +0: "lisa_layers_attribute": "model.layers", +0: "load_best_model_at_end": false, +0: "load_in_4bit": false, +0: "load_in_8bit": false, +0: "local_rank": 0, +0: "logging_steps": 10, +0: "lora_dropout": 0.0, +0: "loraplus_lr_embedding": 1e-06, +0: "lr_scheduler": "warmup_stable_decay", +0: "lr_scheduler_kwargs": { +0: "min_lr_ratio": 0.1, +0: "num_decay_steps": 300 +0: }, +0: "max_prompt_len": 512, +0: "mean_resizing_embeddings": false, +0: "micro_batch_size": 1, +0: "model_config_type": "qwen2", +0: "num_epochs": 1.0, +0: "optimizer": "adamw_torch_fused", +0: "output_dir": "/lustre/fswork/projects/rech/dgo/udv55np/ift/Qwen3-235B-A22B/Qwen2.5-0.5B/1", +0: "pad_to_sequence_len": true, +0: "pretrain_multipack_attn": true, +0: "pretrain_multipack_buffer_size": 10000, +0: "profiler_steps_start": 0, +0: "qlora_sharded_model_loading": false, +0: "ray_num_workers": 1, +0: "resources_per_worker": { +0: "GPU": 1 +0: }, +0: "sample_packing": true, +0: "sample_packing_bin_size": 200, +0: "sample_packing_group_size": 100000, +0: "save_only_model": false, +0: "save_safetensors": true, +0: "save_steps": 0.2, +0: "save_total_limit": 20, +0: "sequence_len": 16384, +0: "sequence_parallel_degree": 1, +0: "shuffle_merged_datasets": true, +0: "skip_prepare_dataset": false, +0: "special_tokens": { +0: "bos_token": "<|im_start|>", +0: "eos_token": "<|im_end|>", +0: "pad_token": "<|endoftext|>" +0: }, +0: "strict": false, +0: "tensor_parallel_size": 1, +0: "tf32": false, +0: "tiled_mlp_use_original_mlp": true, +0: "tokenizer_config": "/lustre/fswork/projects/rech/qwv/udv55np/Qwen/Qwen2.5-0.5B", +0: "torch_dtype": "torch.bfloat16", +0: "train_on_inputs": false, +0: "trl": { +0: "log_completions": false, +0: "mask_truncated_completions": false, +0: "ref_model_mixup_alpha": 0.9, +0: "ref_model_sync_steps": 64, +0: "scale_rewards": true, +0: "sync_ref_model": false, +0: "use_vllm": false, +0: "vllm_server_host": "0.0.0.0", +0: "vllm_server_port": 8000 +0: }, +0: "use_ray": false, +0: "use_tensorboard": true, +0: "val_set_size": 0.0, +0: "vllm": { +0: "device": "auto", +0: "dtype": "auto", +0: "gpu_memory_utilization": 0.9, +0: "host": "0.0.0.0", +0: "port": 8000 +0: }, +0: "warmup_steps": 150, +0: "weight_decay": 0.0, +0: "world_size": 4 +0: } +0: [2025-08-06 23:16:16,187] [INFO] [axolotl.cli.checks.check_user_token:35] [PID:411490] [RANK:0] Skipping HuggingFace token verification because HF_HUB_OFFLINE is set to True. Only local files will be used. +1: [2025-08-06 23:16:16,465] [INFO] [axolotl.utils.data.sft._load_raw_datasets:310] [PID:2862582] [RANK:0] Loading raw datasets... +1: Downloading data: 0%| | 0/16 [00:0016384) (num_proc=192): 0%| | 0/1393783 [00:0016384) (num_proc=192): 0%| | 1000/1393783 [00:03<1:15:21, 308.01 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 1%| | 9000/1393783 [00:03<06:18, 3663.31 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 1%| | 17000/1393783 [00:03<02:53, 7956.95 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 2%|▏ | 28000/1393783 [00:03<01:28, 15457.07 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 3%|▎ | 39000/1393783 [00:03<00:55, 24352.33 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 3%|▎ | 48000/1393783 [00:03<00:44, 30345.14 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 4%|▍ | 56000/1393783 [00:04<00:40, 33304.18 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 5%|▍ | 63000/1393783 [00:04<00:35, 37685.02 examples +1: /s] Dropping Long Sequences (>16384) (num_proc=192): 5%|▌ | 70000/1393783 [00:04<00:37, 35123.55 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 5%|▌ | 76000/1393783 [00:04<00:34, 38499.55 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 6%|▌ | 82000/1393783 [00:04<00:36, 36050.94 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 7%|▋ | 91000/1393783 [00:04<00:28, 45807.76 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 7%|▋ | 100000/1393783 [00:04<00:24, 52347.60 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 8%|▊ | 110000/1393783 [00:05<00:20, 61515.77 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 9%|▊ | 119000/1393783 [00:05<00:18, 67561.43 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 9%|▉ | 127000/1393783 [00:05<00:19, 66665.42 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 10%|▉ | 135000/13937 +1: 83 [00:05<00:18, 69231.58 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 10%|█ | 143000/1393783 [00:05<00:18, 68328.69 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 11%|█▏ | 158000/1393783 [00:05<00:13, 89344.28 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 12%|█▏ | 172000/1393783 [00:05<00:12, 100685.92 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 13%|█▎ | 186000/1393783 [00:05<00:11, 108263.37 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 14%|█▍ | 198000/1393783 [00:05<00:10, 110210.28 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 15%|█▌ | 216000/1393783 [00:06<00:09, 125808.58 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 16%|█▋ | 229000/1393783 [00:06<00:10, 115097.03 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 17%|█▋ | 241000/1393783 [00:06<00:12, 91900.09 examples/s] Dropping Long Sequence +1: s (>16384) (num_proc=192): 18%|█▊ | 254000/1393783 [00:06<00:14, 81309.70 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 19%|█▉ | 263000/1393783 [00:06<00:14, 75891.65 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 20%|█▉ | 272000/1393783 [00:06<00:18, 60345.67 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 20%|██ | 279000/1393783 [00:07<00:19, 57297.43 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 21%|██ | 286000/1393783 [00:07<00:19, 57839.51 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 21%|██ | 295000/1393783 [00:07<00:17, 64434.57 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 22%|██▏ | 311000/1393783 [00:07<00:13, 80963.23 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 23%|██▎ | 320000/1393783 [00:07<00:14, 72986.29 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 24%|██▎ | 330000/139 +1: 3783 [00:07<00:14, 75242.70 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 25%|██▍ | 342000/1393783 [00:07<00:12, 83033.20 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 26%|██▌ | 358000/1393783 [00:07<00:10, 94675.54 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 27%|██▋ | 373000/1393783 [00:08<00:09, 106771.83 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 28%|██▊ | 390000/1393783 [00:08<00:08, 121905.94 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 29%|██▉ | 403000/1393783 [00:08<00:08, 113675.16 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 30%|██▉ | 415000/1393783 [00:08<00:08, 108820.98 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 31%|███ | 427000/1393783 [00:08<00:11, 85227.38 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 31%|███▏ | 437000/1393783 [00:08<00:11, 82652.19 examples/s] D +1: ropping Long Sequences (>16384) (num_proc=192): 32%|███▏ | 446000/1393783 [00:08<00:11, 79575.10 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 33%|███▎ | 455000/1393783 [00:09<00:13, 68759.10 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 33%|███▎ | 463000/1393783 [00:09<00:16, 55618.54 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 34%|███▎ | 470000/1393783 [00:09<00:16, 57171.59 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 34%|███▍ | 477000/1393783 [00:09<00:16, 57097.10 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 35%|███▌ | 491000/1393783 [00:09<00:12, 75187.18 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 36%|███▌ | 500000/1393783 [00:09<00:11, 77037.41 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 37%|███▋ | 510000/1393783 [00:09<00:11, 78896.28 examples/s] Dropping Long Sequences (>16384) +1: (num_proc=192): 37%|███▋ | 519000/1393783 [00:09<00:10, 79867.25 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 38%|███▊ | 532000/1393783 [00:10<00:10, 85446.67 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 39%|███▉ | 545000/1393783 [00:10<00:08, 94496.65 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 40%|███▉ | 557000/1393783 [00:10<00:08, 100077.59 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 41%|████ | 568000/1393783 [00:10<00:08, 101379.29 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 42%|████▏ | 587000/1393783 [00:10<00:06, 125445.32 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 43%|████▎ | 600000/1393783 [00:10<00:06, 115282.11 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 44%|████▍ | 612000/1393783 [00:10<00:07, 104360.58 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 45%| +1: ████▍ | 623000/1393783 [00:11<00:08, 87372.10 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 45%|████▌ | 633000/1393783 [00:11<00:09, 78317.99 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 46%|████▌ | 642000/1393783 [00:11<00:11, 66998.57 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 47%|████▋ | 650000/1393783 [00:11<00:11, 65844.89 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 47%|████▋ | 657000/1393783 [00:11<00:13, 56535.17 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 48%|████▊ | 667000/1393783 [00:11<00:11, 64515.88 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 49%|████▊ | 677000/1393783 [00:11<00:10, 71071.21 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 49%|████▉ | 685000/1393783 [00:12<00:10, 66745.48 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 50%|████▉ +1: | 693000/1393783 [00:12<00:10, 69505.37 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 50%|█████ | 703000/1393783 [00:12<00:09, 72489.92 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 51%|█████▏ | 715000/1393783 [00:12<00:08, 83749.96 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 52%|█████▏ | 726000/1393783 [00:12<00:07, 88686.74 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 53%|█████▎ | 737000/1393783 [00:12<00:07, 92980.40 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 54%|█████▍ | 751000/1393783 [00:12<00:06, 105704.07 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 55%|█████▍ | 762000/1393783 [00:12<00:06, 99562.26 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 56%|█████▌ | 778000/1393783 [00:12<00:05, 113264.42 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 57%|█████� +1: �� | 790000/1393783 [00:13<00:05, 103555.06 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 57%|█████▋ | 801000/1393783 [00:13<00:05, 104945.89 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 58%|█████▊ | 812000/1393783 [00:13<00:07, 75410.39 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 59%|█████▉ | 823000/1393783 [00:13<00:07, 81258.83 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 60%|█████▉ | 833000/1393783 [00:13<00:07, 79615.55 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 60%|██████ | 842000/1393783 [00:13<00:08, 61948.14 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 61%|██████ | 850000/1393783 [00:14<00:08, 61522.13 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 62%|██████▏ | 859000/1393783 [00:14<00:07, 67154.54 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 62%|███� +1: �██▏ | 868000/1393783 [00:14<00:07, 66619.19 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 63%|██████▎ | 876000/1393783 [00:14<00:07, 66747.09 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 63%|██████▎ | 884000/1393783 [00:14<00:07, 67771.17 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 64%|██████▍ | 898000/1393783 [00:14<00:05, 85049.65 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 65%|██████▌ | 911000/1393783 [00:14<00:05, 95032.42 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 66%|██████▌ | 921000/1393783 [00:14<00:05, 90200.73 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 67%|██████▋ | 932000/1393783 [00:14<00:04, 94727.78 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 68%|██████▊ | 948000/1393783 [00:15<00:04, 104801.28 examples/s] Dropping Long Sequences (>16384) (num_proc=192): +1: 69%|██████▉ | 962000/1393783 [00:15<00:03, 111277.57 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 70%|██████▉ | 974000/1393783 [00:15<00:04, 95782.78 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 71%|███████ | 987000/1393783 [00:15<00:04, 97175.25 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 72%|███████▏ | 998000/1393783 [00:15<00:04, 83098.07 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 72%|███████▏ | 1007000/1393783 [00:15<00:04, 80162.22 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 73%|███████▎ | 1016000/1393783 [00:15<00:04, 77947.37 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 73%|███████▎ | 1024000/1393783 [00:16<00:04, 77771.92 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 74%|███████▍ | 1032000/1393783 [00:16<00:05, 64961.47 examples/s] Dropping Long Se +1: quences (>16384) (num_proc=192): 75%|███████▍ | 1039000/1393783 [00:16<00:05, 61622.26 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 75%|███████▌ | 1048000/1393783 [00:16<00:05, 67548.42 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 76%|███████▌ | 1057000/1393783 [00:16<00:04, 70105.04 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 76%|███████▋ | 1065000/1393783 [00:16<00:04, 69123.15 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 77%|███████▋ | 1073000/1393783 [00:16<00:05, 61376.05 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 78%|███████▊ | 1088000/1393783 [00:16<00:03, 80620.61 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 79%|███████▉ | 1100000/1393783 [00:17<00:03, 89906.55 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 80%|███████▉ | 1112000/1393783 [00:17<00:0 +1: 3, 90223.55 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 81%|████████ | 1129000/1393783 [00:17<00:02, 108153.46 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 82%|████████▏ | 1141000/1393783 [00:17<00:02, 101542.29 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 83%|████████▎ | 1153259/1393783 [00:17<00:02, 101163.42 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 84%|████████▎ | 1164297/1393783 [00:17<00:02, 98999.73 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 84%|████████▍ | 1174594/1393783 [00:17<00:02, 92686.96 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 85%|████████▌ | 1186191/1393783 [00:17<00:02, 98558.06 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 86%|████████▌ | 1197710/1393783 [00:18<00:01, 98391.10 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 87% +1: |████████▋ | 1207784/1393783 [00:18<00:01, 98985.52 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 87%|████████▋ | 1218341/1393783 [00:18<00:01, 96713.93 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 88%|████████▊ | 1228640/1393783 [00:18<00:01, 91407.24 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 89%|████████▉ | 1240457/1393783 [00:18<00:01, 97396.87 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 90%|████████▉ | 1252273/1393783 [00:18<00:01, 102134.09 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 91%|█████████ | 1264611/1393783 [00:18<00:01, 104116.28 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 92%|█████████▏| 1280166/1393783 [00:18<00:00, 116974.21 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 93%|█████████▎| 1294795/1393783 [00:18<00:00, 124889.40 e +1: xamples/s] Dropping Long Sequences (>16384) (num_proc=192): 94%|█████████▍| 1308909/1393783 [00:18<00:00, 129451.85 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 95%|█████████▍| 1322019/1393783 [00:19<00:00, 126984.21 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 96%|█████████▌| 1335868/1393783 [00:19<00:00, 128478.69 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 97%|█████████▋| 1349237/1393783 [00:19<00:00, 120699.29 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 98%|█████████▊| 1362865/1393783 [00:19<00:00, 123995.91 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 99%|█████████▊| 1375531/1393783 [00:19<00:00, 106603.46 examples/s] Dropping Long Sequences (>16384) (num_proc=192): 99%|█████████▉| 1386675/1393783 [00:19<00:00, 89054.18 examples/s] Dropping Long Sequences (>16384) (num_proc=192) +1: : 100%|██████████| 1393783/1393783 [00:20<00:00, 67644.96 examples/s] +1: Drop Samples with Zero Trainable Tokens (num_proc=192): 0%| | 0/1386595 [00:00