# Precision and quantization options for the run.
# String-valued options are quoted so they always load as strings.
acceleration:
  load_text_encoder_in_8bit: false
  mixed_precision_mode: 'bf16'
  quantization: 'int8-quanto'
# Checkpoint saving options.
# NOTE(review): interval is presumably counted in training steps (it equals
# validation.interval) - confirm against the trainer.
checkpoints:
  interval: 100
  keep_last_n: 2
  precision: 'bfloat16'
# Dataset input settings.
data:
  num_dataloader_workers: 2
  # Absolute, machine-specific path.
  preprocessed_data_root: '/fast/aviad/github/LTX-2/datasets/celebv-hq-step9-v2/.precomputed_celebvhq_ltxv2.3'
# Timestep sampling settings; timestep_sampling_params is an empty mapping
# (no overrides are given in this file).
flow_matching:
  timestep_sampling_mode: 'shifted_logit_normal'
  timestep_sampling_params: {}
# Hub publishing options: pushing is disabled and no model id is set.
hub:
  hub_model_id: null
  push_to_hub: false
# LoRA settings: rank and alpha are both 128, dropout is 0.
# target_modules lists the to_k/to_q/to_v/to_out.0 projections of audio_attn1,
# audio_attn2, video_to_audio_attn and audio_to_video_attn, plus two audio_ff
# layers. Names are quoted so they always load as strings.
# NOTE(review): presumably matched against module names in the model - confirm.
lora:
  alpha: 128
  dropout: 0.0
  rank: 128
  target_modules:
  - 'audio_attn1.to_k'
  - 'audio_attn1.to_q'
  - 'audio_attn1.to_v'
  - 'audio_attn1.to_out.0'
  - 'audio_attn2.to_k'
  - 'audio_attn2.to_q'
  - 'audio_attn2.to_v'
  - 'audio_attn2.to_out.0'
  - 'video_to_audio_attn.to_k'
  - 'video_to_audio_attn.to_q'
  - 'video_to_audio_attn.to_v'
  - 'video_to_audio_attn.to_out.0'
  - 'audio_to_video_attn.to_k'
  - 'audio_to_video_attn.to_q'
  - 'audio_to_video_attn.to_v'
  - 'audio_to_video_attn.to_out.0'
  - 'audio_ff.net.0.proj'
  - 'audio_ff.net.2'
# Model sources. No resume checkpoint is set (load_checkpoint is null).
# Both paths are absolute and machine-specific.
model:
  load_checkpoint: null
  model_path: '/fast/aviad/github/LTX-2/model/ltx-2.3-22b-dev.safetensors'
  text_encoder_path: '/scratch/aviad/cache/hub/models--google--gemma-3-12b-it-qat-q4_0-unquantized/snapshots/68f7ee4fbd59087436ada77ed2d62f373fdd4482'
  training_mode: 'lora'
# Optimizer and schedule settings for a 3000-step run.
# NOTE(review): batch_size 1 with gradient_accumulation_steps 4 presumably
# gives an effective batch of 4 per device - confirm against the trainer.
optimization:
  batch_size: 1
  enable_gradient_checkpointing: true
  gradient_accumulation_steps: 4
  learning_rate: 0.0002
  max_grad_norm: 1.0
  optimizer_type: 'adamw8bit'
  # Empty mapping: no scheduler overrides are given in this file.
  scheduler_params: {}
  scheduler_type: 'linear'
  steps: 3000
# Run output location (absolute, machine-specific path) and the global seed.
output_dir: '/fast/aviad/github/LTX-2/experiments/week4_training/outputs/2026-03-16/celebvhq_audio_ref_only_ic_negpos_r128_3k_ltxv2.3'
seed: 42
# Training strategy selection (name: audio_ref_only_ic) and its options.
# NOTE(review): the two *_dir values are relative names, presumably resolved
# under data.preprocessed_data_root - confirm against the trainer.
training_strategy:
  audio_latents_dir: 'audio_latents'
  first_frame_conditioning_p: 0.9
  mask_cross_attention_to_reference: true
  mask_reference_from_text_attention: true
  name: 'audio_ref_only_ic'
  reference_audio_latents_dir: 'reference_audio_latents'
  use_negative_ref_positions: true
# Validation settings. captions and clap_metric are unset (null).
validation:
  captions: null
  clap_metric: null
  compute_metrics: true
  face_metric: 'arcface'
  frame_rate: 25.0
  generate_audio: true
  guidance_scale: 4.0
  # Four first-frame images; their basenames match the target_videos entries
  # of this mapping, in the same order.
  # NOTE(review): these live under datasets/celebv-hq-step9, while
  # data.preprocessed_data_root points at celebv-hq-step9-v2 - confirm intended.
  images:
  - '/fast/aviad/github/LTX-2/datasets/celebv-hq-step9/first_frames/IgKQ8z_xNhk_6_seg1.png'
  - '/fast/aviad/github/LTX-2/datasets/celebv-hq-step9/first_frames/oBvB83S_U4w_3_seg0.png'
  - '/fast/aviad/github/LTX-2/datasets/celebv-hq-step9/first_frames/teoyewW1bUY_1_seg1.png'
  - '/fast/aviad/github/LTX-2/datasets/celebv-hq-step9/first_frames/cJyhEAxnQ-U_2_seg1.png'
  include_reference_in_output: true
  inference_steps: 30
  interval: 100
  negative_prompt: 'worst quality, inconsistent motion, blurry, jittery, distorted'
  # Four validation prompts, each with [VISUAL], [SPEECH], [SOUNDS] and [TEXT]
  # sections. They are multi-line single-quoted scalars: a single line break
  # folds into a space, a blank line becomes a newline, and '' is an escaped
  # apostrophe. Keep the continuation indentation and blank lines intact when
  # editing, or the loaded text changes.
  prompts:
  - '[VISUAL]: A close-up shot of actor Brad Pitt, who has medium-length, brownish
    hair styled to fall over his forehead. He appears to be in a dimly lit environment,
    possibly indoors, with some light source coming from the side, highlighting his
    features. He is wearing a dark jacket with a zipper partially undone over a gray
    shirt. His expression shifts from slightly closed eyes and a mouth slightly open
    as if speaking, to his eyes opening and looking slightly upward with a subtle,
    knowing smile or smirk.

    [SPEECH]: We''ll change the game. And that''s what I want.

    [SOUNDS]: A low, ambient soundscape is present, consistent with a dramatic movie
    scene.

    [TEXT]: None'
  - '[VISUAL]: A close-up shot features a blonde woman with her hair styled in soft
    waves, wearing a wide-brimmed black hat and a black dress or top adorned with
    a sparkling, ornate brooch near the neckline. She is holding the handle of a black
    umbrella that has a white patterned edge visible at the top of the frame. Her
    expression is focused and somewhat serious as she looks slightly down and to her
    right. The background is a pale, slightly greenish-tinged sky.

    [SPEECH]: If you come with me, maybe we can find

    [SOUNDS]: Faint, ethereal music or background ambiance with a slightly dramatic
    tone.

    [TEXT]: None'
  - '[VISUAL]: A close-up shot features a pale, bald man with what appear to be milky
    or obscured eyes, suggesting blindness or an unusual condition. He has a somewhat
    stern or intense expression and is wearing a dark, possibly hooded, garment that
    looks rough or textured. In the blurred background to the left, a person wearing
    what looks like a military uniform and a dark beret is partially visible. The
    lighting is dim, highlighting the subject''s face.

    [SPEECH]: I would use reason. Your planet will burn.

    [SOUNDS]: Low, dramatic background music is present, consistent with a scene of
    tension or confrontation.

    [TEXT]: None'
  - '[VISUAL]: A close-up shot of a woman with dark, wavy hair styled partially up.
    She has striking features, including defined eyebrows and bright red lipstick.
    She is wearing large, gold hoop earrings. Her skin is illuminated with a soft,
    purplish-pink light, creating a dramatic and somewhat ethereal look against a
    dark background, possibly indoors or at night. She appears distressed, with her
    eyes looking down and slightly to the side, and her mouth slightly open as if
    speaking or crying softly. Her left hand, with painted red nails, is visible in
    the lower left, resting on something dark that she is leaning into. The overall
    image has a slightly soft focus or grainy quality typical of older film footage.

    [SPEECH]: I need to be loved while I''m hiding and afraid.

    [SOUNDS]: Soft, melancholic, dramatic orchestral music swells gently in the background.

    [TEXT]: None'
  # Reference/target clips and sampling options for validation.
  # NOTE(review): reference_videos come from videos_segmented_25fps_121_clean,
  # while target_videos come from videos_segmented_25fps_121 - confirm intended.
  reference_downscale_factor: 1
  reference_videos:
  - '/fast/aviad/github/LTX-2/datasets/celebv-hq-step9/videos_segmented_25fps_121_clean/zeO0kdrR4EU_0_seg1.mp4'
  - '/fast/aviad/github/LTX-2/datasets/celebv-hq-step9/videos_segmented_25fps_121_clean/K5H6UBXFuBY_0_seg0.mp4'
  - '/fast/aviad/github/LTX-2/datasets/celebv-hq-step9/videos_segmented_25fps_121_clean/teoyewW1bUY_1_seg2.mp4'
  - '/fast/aviad/github/LTX-2/datasets/celebv-hq-step9/videos_segmented_25fps_121_clean/cJyhEAxnQ-U_2_seg0.mp4'
  seed: 42
  skip_initial_validation: false
  speaker_metric: 'wavlm_ecapa'
  stg_blocks:
  - 29
  stg_mode: 'stg_av'
  stg_scale: 1.0
  target_videos:
  - '/fast/aviad/github/LTX-2/datasets/celebv-hq-step9/videos_segmented_25fps_121/IgKQ8z_xNhk_6_seg1.mp4'
  - '/fast/aviad/github/LTX-2/datasets/celebv-hq-step9/videos_segmented_25fps_121/oBvB83S_U4w_3_seg0.mp4'
  - '/fast/aviad/github/LTX-2/datasets/celebv-hq-step9/videos_segmented_25fps_121/teoyewW1bUY_1_seg1.mp4'
  - '/fast/aviad/github/LTX-2/datasets/celebv-hq-step9/videos_segmented_25fps_121/cJyhEAxnQ-U_2_seg1.mp4'
  # Plain three-element sequence. The previous PyYAML-specific `!!python/tuple`
  # tag is rejected by safe loaders (e.g. yaml.safe_load), which made this file
  # unloadable outside an unsafe/full PyYAML loader.
  # NOTE(review): this now loads as a list rather than a tuple - confirm the
  # consumer accepts any sequence.
  # NOTE(review): presumably width, height, frame count (121 matches the
  # `_121` suffix of the video directories) - confirm the order.
  video_dims:
  - 512
  - 512
  - 121
  videos_per_prompt: 1
# Weights & Biases logging: enabled, no entity set, validation videos logged.
# Tags are quoted so number-like labels (22b, 25fps, ltx2.3) always load as
# strings.
wandb:
  enabled: true
  entity: null
  log_validation_videos: true
  project: 'av-ic-lora'
  tags:
  - 'ltx2.3'
  - '22b'
  - 'audio-ref-only-ic'
  - 'r128'
  - 'celebvhq'
  - 'envsplit_diverse'
  - 'negative_pos'
  - 'masked_xattn'
  - 'text_audio_mask'
  - 'week4'
  - '25fps'
  - 'i2v'
  - '3k_steps'