---
# Training run configuration: LoRA (rank 128) attached to audio and
# audio/video attention modules, strategy `audio_ref_only_ic`, 3000 steps.
# Keys are kept in alphabetical order within each mapping.

# Precision and quantization settings.
acceleration:
  load_text_encoder_in_8bit: false
  mixed_precision_mode: bf16
  quantization: int8-quanto

# Checkpoint cadence and retention.
# NOTE(review): `interval` is presumably counted in training steps - confirm.
checkpoints:
  interval: 100
  keep_last_n: 2
  precision: bfloat16

# Location of the precomputed dataset and dataloader parallelism.
data:
  num_dataloader_workers: 2
  preprocessed_data_root: /fast/aviad/github/LTX-2/datasets/celebv-hq-step9-v2/.precomputed_celebvhq_ltxv2.3

flow_matching:
  timestep_sampling_mode: shifted_logit_normal
  timestep_sampling_params: {}

# Hub upload is disabled for this run.
hub:
  hub_model_id: null
  push_to_hub: false

# LoRA adapter hyper-parameters and the module names it is applied to.
lora:
  alpha: 128
  dropout: 0.0
  rank: 128
  target_modules:
    - audio_attn1.to_k
    - audio_attn1.to_q
    - audio_attn1.to_v
    - audio_attn1.to_out.0
    - audio_attn2.to_k
    - audio_attn2.to_q
    - audio_attn2.to_v
    - audio_attn2.to_out.0
    - video_to_audio_attn.to_k
    - video_to_audio_attn.to_q
    - video_to_audio_attn.to_v
    - video_to_audio_attn.to_out.0
    - audio_to_video_attn.to_k
    - audio_to_video_attn.to_q
    - audio_to_video_attn.to_v
    - audio_to_video_attn.to_out.0
    - audio_ff.net.0.proj
    - audio_ff.net.2

# Base model weights and text encoder snapshot; no checkpoint is resumed.
model:
  load_checkpoint: null
  model_path: /fast/aviad/github/LTX-2/model/ltx-2.3-22b-dev.safetensors
  text_encoder_path: /scratch/aviad/cache/hub/models--google--gemma-3-12b-it-qat-q4_0-unquantized/snapshots/68f7ee4fbd59087436ada77ed2d62f373fdd4482
  training_mode: lora

# Optimizer and schedule.
# NOTE(review): batch_size 1 with gradient_accumulation_steps 4 presumably
# gives an effective batch of 4 per device - confirm against the trainer.
optimization:
  batch_size: 1
  enable_gradient_checkpointing: true
  gradient_accumulation_steps: 4
  learning_rate: 0.0002
  max_grad_norm: 1.0
  optimizer_type: adamw8bit
  scheduler_params: {}
  scheduler_type: linear
  steps: 3000

output_dir: /fast/aviad/github/LTX-2/experiments/week4_training/outputs/2026-03-16/celebvhq_audio_ref_only_ic_negpos_r128_3k_ltxv2.3

seed: 42

# Strategy-specific options for `audio_ref_only_ic`.
# NOTE(review): the two `*_dir` values look like sub-directory names relative
# to `data.preprocessed_data_root` - confirm.
training_strategy:
  audio_latents_dir: audio_latents
  first_frame_conditioning_p: 0.9
  mask_cross_attention_to_reference: true
  mask_reference_from_text_attention: true
  name: audio_ref_only_ic
  reference_audio_latents_dir: reference_audio_latents
  use_negative_ref_positions: true

# Validation sampling and metrics.
# `images`, `prompts`, `reference_videos` and `target_videos` each hold four
# entries; presumably they are matched by position - confirm before
# reordering any one of them.
validation:
  captions: null
  clap_metric: null
  compute_metrics: true
  face_metric: arcface
  frame_rate: 25.0
  generate_audio: true
  guidance_scale: 4.0
  images:
    - /fast/aviad/github/LTX-2/datasets/celebv-hq-step9/first_frames/IgKQ8z_xNhk_6_seg1.png
    - /fast/aviad/github/LTX-2/datasets/celebv-hq-step9/first_frames/oBvB83S_U4w_3_seg0.png
    - /fast/aviad/github/LTX-2/datasets/celebv-hq-step9/first_frames/teoyewW1bUY_1_seg1.png
    - /fast/aviad/github/LTX-2/datasets/celebv-hq-step9/first_frames/cJyhEAxnQ-U_2_seg1.png
  include_reference_in_output: true
  inference_steps: 30
  interval: 100
  negative_prompt: worst quality, inconsistent motion, blurry, jittery, distorted
  # Each prompt is a single-line string; the folded style (`>-`) joins the
  # wrapped lines below with single spaces and strips the trailing newline.
  prompts:
    - >-
      [VISUAL]: A close-up shot of actor Brad Pitt, who has medium-length,
      brownish hair styled to fall over his forehead. He appears to be in a
      dimly lit environment, possibly indoors, with some light source coming
      from the side, highlighting his features. He is wearing a dark jacket
      with a zipper partially undone over a gray shirt. His expression shifts
      from slightly closed eyes and a mouth slightly open as if speaking, to
      his eyes opening and looking slightly upward with a subtle, knowing
      smile or smirk. [SPEECH]: We'll change the game. And that's what I
      want. [SOUNDS]: A low, ambient soundscape is present, consistent with a
      dramatic movie scene. [TEXT]: None
    - >-
      [VISUAL]: A close-up shot features a blonde woman with her hair styled
      in soft waves, wearing a wide-brimmed black hat and a black dress or top
      adorned with a sparkling, ornate brooch near the neckline. She is
      holding the handle of a black umbrella that has a white patterned edge
      visible at the top of the frame. Her expression is focused and somewhat
      serious as she looks slightly down and to her right. The background is a
      pale, slightly greenish-tinged sky. [SPEECH]: If you come with me, maybe
      we can find [SOUNDS]: Faint, ethereal music or background ambiance with
      a slightly dramatic tone. [TEXT]: None
    - >-
      [VISUAL]: A close-up shot features a pale, bald man with what appear to
      be milky or obscured eyes, suggesting blindness or an unusual condition.
      He has a somewhat stern or intense expression and is wearing a dark,
      possibly hooded, garment that looks rough or textured. In the blurred
      background to the left, a person wearing what looks like a military
      uniform and a dark beret is partially visible. The lighting is dim,
      highlighting the subject's face. [SPEECH]: I would use reason. Your
      planet will burn. [SOUNDS]: Low, dramatic background music is present,
      consistent with a scene of tension or confrontation. [TEXT]: None
    - >-
      [VISUAL]: A close-up shot of a woman with dark, wavy hair styled
      partially up. She has striking features, including defined eyebrows and
      bright red lipstick. She is wearing large, gold hoop earrings. Her skin
      is illuminated with a soft, purplish-pink light, creating a dramatic and
      somewhat ethereal look against a dark background, possibly indoors or at
      night. She appears distressed, with her eyes looking down and slightly
      to the side, and her mouth slightly open as if speaking or crying
      softly. Her left hand, with painted red nails, is visible in the lower
      left, resting on something dark that she is leaning into. The overall
      image has a slightly soft focus or grainy quality typical of older film
      footage. [SPEECH]: I need to be loved while I'm hiding and afraid.
      [SOUNDS]: Soft, melancholic, dramatic orchestral music swells gently in
      the background. [TEXT]: None
  reference_downscale_factor: 1
  reference_videos:
    - /fast/aviad/github/LTX-2/datasets/celebv-hq-step9/videos_segmented_25fps_121_clean/zeO0kdrR4EU_0_seg1.mp4
    - /fast/aviad/github/LTX-2/datasets/celebv-hq-step9/videos_segmented_25fps_121_clean/K5H6UBXFuBY_0_seg0.mp4
    - /fast/aviad/github/LTX-2/datasets/celebv-hq-step9/videos_segmented_25fps_121_clean/teoyewW1bUY_1_seg2.mp4
    - /fast/aviad/github/LTX-2/datasets/celebv-hq-step9/videos_segmented_25fps_121_clean/cJyhEAxnQ-U_2_seg0.mp4
  seed: 42
  skip_initial_validation: false
  speaker_metric: wavlm_ecapa
  stg_blocks:
    - 29
  stg_mode: stg_av
  stg_scale: 1.0
  target_videos:
    - /fast/aviad/github/LTX-2/datasets/celebv-hq-step9/videos_segmented_25fps_121/IgKQ8z_xNhk_6_seg1.mp4
    - /fast/aviad/github/LTX-2/datasets/celebv-hq-step9/videos_segmented_25fps_121/oBvB83S_U4w_3_seg0.mp4
    - /fast/aviad/github/LTX-2/datasets/celebv-hq-step9/videos_segmented_25fps_121/teoyewW1bUY_1_seg1.mp4
    - /fast/aviad/github/LTX-2/datasets/celebv-hq-step9/videos_segmented_25fps_121/cJyhEAxnQ-U_2_seg1.mp4
  # Plain sequence rather than a Python-specific tuple tag, so the file can be
  # read by a safe YAML loader.
  # NOTE(review): presumably (width, height, num_frames); the consumer is
  # assumed to accept a list or coerce it to a tuple - confirm.
  video_dims: [512, 512, 121]
  videos_per_prompt: 1

# Weights & Biases logging.
wandb:
  enabled: true
  entity: null
  log_validation_videos: true
  project: av-ic-lora
  tags:
    - ltx2.3
    - 22b
    - audio-ref-only-ic
    - r128
    - celebvhq
    - envsplit_diverse
    - negative_pos
    - masked_xattn
    - text_audio_mask
    - week4
    - 25fps
    - i2v
    - 3k_steps