"""Gradio Space demo for LTX-2 19B distilled: image+audio→video and text→video.

Loads two pipelines at import time (requires a CUDA GPU):
  * a custom audio-conditioned image-to-video pipeline, with a detailer
    IC-LoRA fused in at scale 0.8;
  * the stock LTX-2 text-to-video pipeline plus a latent upsampler for a
    two-stage (base → upscale → refine) generation.
"""
import spaces
import gradio as gr
import torch
from diffusers import DiffusionPipeline
from diffusers.utils import load_image, export_to_video
from diffusers.pipelines.ltx2 import LTX2Pipeline, LTX2LatentUpsamplePipeline
from diffusers.pipelines.ltx2.latent_upsampler import LTX2LatentUpsamplerModel
from diffusers.pipelines.ltx2.utils import (
    DISTILLED_SIGMA_VALUES,
    STAGE_2_DISTILLED_SIGMA_VALUES,
)
from diffusers.pipelines.ltx2.export_utils import encode_video
import random
# NOTE(review): numpy, moviepy and PIL are imported but unused below — kept in
# case other tooling relies on them; confirm before removing.
import numpy as np
from moviepy import ImageSequenceClip, AudioFileClip, VideoFileClip
from PIL import Image, ImageOps

# ============================================================
# 🔥 GLOBAL PERFORMANCE SETTINGS (H200 OPTIMIZED)
# ============================================================
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.set_grad_enabled(False)  # inference-only process; never build autograd graphs
torch.backends.cuda.enable_flash_sdp(True)
torch.backends.cuda.enable_mem_efficient_sdp(True)

DEVICE = "cuda"
DTYPE = torch.bfloat16
MODEL_ID = "rootonchair/LTX-2-19b-distilled"

# Output resolution and frame rate shared by both generation paths.
WIDTH = 768
HEIGHT = 512
FPS = 24

# ============================================================
# 🚀 LOAD IMAGE + AUDIO PIPELINE
# ============================================================
print("🚀 Loading Audio-to-Video pipeline...")
pipe_i2v = DiffusionPipeline.from_pretrained(
    MODEL_ID,
    custom_pipeline="multimodalart/ltx2-audio-to-video",
    torch_dtype=DTYPE,
).to(DEVICE)

# xFormers is optional; PyTorch's built-in SDPA (enabled above) is the fallback.
try:
    pipe_i2v.enable_xformers_memory_efficient_attention()
    print("✅ xFormers enabled")
except Exception as e:
    print("⚠️ xFormers not available, using PyTorch SDPA instead.")
    print(e)

# Fuse the detailer LoRA permanently into the weights, then drop the adapter
# bookkeeping so inference runs at plain-model speed.
pipe_i2v.load_lora_weights(
    "Lightricks/LTX-2-19b-IC-LoRA-Detailer",
    adapter_name="camera_control",
)
pipe_i2v.fuse_lora(lora_scale=0.8)
pipe_i2v.unload_lora_weights()
print("✅ Image+Audio pipeline loaded")

# ============================================================
# 🚀 LOAD TEXT TO VIDEO PIPELINE
# ============================================================
print("🚀 Loading Text-to-Video pipeline...")
pipe_t2v = LTX2Pipeline.from_pretrained(
    MODEL_ID,
    torch_dtype=DTYPE,
).to(DEVICE)

latent_upsampler = LTX2LatentUpsamplerModel.from_pretrained(
    MODEL_ID,
    subfolder="latent_upsampler",
    torch_dtype=DTYPE,
).to(DEVICE)

# The upsample pipeline reuses the t2v VAE rather than loading a second copy.
upsample_pipe = LTX2LatentUpsamplePipeline(
    vae=pipe_t2v.vae,
    latent_upsampler=latent_upsampler,
).to(DEVICE)
print("✅ Text-to-Video pipeline loaded")


def _resolve_seed(seed):
    """Return `seed` unchanged, or a fresh random seed when the sentinel -1 is given."""
    if seed == -1:
        return random.randint(0, 1_000_000)
    return seed


def _snap_num_frames(duration, fps):
    """Snap `duration` seconds at `fps` to the nearest valid LTX frame count.

    The model expects num_frames ≡ 1 (mod 8); the minimum usable count is 9.
    """
    total_frames = int(duration * fps)
    return max((round(total_frames / 8) * 8) + 1, 9)


# ============================================================
# 🎬 IMAGE → VIDEO GENERATION
# ============================================================
@spaces.GPU(duration=85, size='xlarge')
def generate_i2v(image_path, audio_path, prompt, negative_prompt, duration, seed):
    """Generate a talking video from a still image plus a driving audio track.

    Args:
        image_path: path to the conditioning image (required).
        audio_path: path to the driving audio file (required).
        prompt / negative_prompt: text conditioning.
        duration: requested clip length in seconds.
        seed: RNG seed; -1 picks a random one.

    Returns:
        (output_video_path, seed_actually_used)

    Raises:
        gr.Error: if either the image or the audio input is missing.
    """
    if not image_path or not audio_path:
        raise gr.Error("Please provide both image and audio.")

    seed = _resolve_seed(seed)
    generator = torch.Generator(device=DEVICE).manual_seed(seed)

    image = load_image(image_path)
    num_frames = _snap_num_frames(duration, FPS)

    with torch.inference_mode():
        video_output, _ = pipe_i2v(
            image=image,
            audio=audio_path,
            prompt=prompt,
            negative_prompt=negative_prompt,
            width=WIDTH,
            height=HEIGHT,
            num_frames=num_frames,
            frame_rate=FPS,
            num_inference_steps=8,  # distilled model: few-step schedule
            sigmas=DISTILLED_SIGMA_VALUES,
            guidance_scale=1.0,  # distilled models run without CFG
            generator=generator,
            return_dict=False,
        )

    output_path = f"i2v_{seed}.mp4"
    export_to_video(video_output, output_path, fps=FPS)
    return output_path, seed


# ============================================================
# 🎬 TEXT → VIDEO GENERATION
# ============================================================
@spaces.GPU(duration=85, size='xlarge')
def generate_t2v(prompt, negative_prompt, duration, seed):
    """Generate a video (with model-generated audio) from a text prompt.

    Three-stage process: base generation in latent space, latent upsampling,
    then a short refinement pass that also decodes video + audio.

    Args:
        prompt / negative_prompt: text conditioning.
        duration: requested clip length in seconds.
        seed: RNG seed; -1 picks a random one.

    Returns:
        (output_video_path, seed_actually_used)
    """
    seed = _resolve_seed(seed)
    generator = torch.Generator(device=DEVICE).manual_seed(seed)

    num_frames = _snap_num_frames(duration, FPS)

    with torch.inference_mode():
        # Stage 1: base generation, kept in latent space.
        video_latent, audio_latent = pipe_t2v(
            prompt=prompt,
            negative_prompt=negative_prompt,
            width=WIDTH,
            height=HEIGHT,
            num_frames=num_frames,
            frame_rate=FPS,
            num_inference_steps=8,
            sigmas=DISTILLED_SIGMA_VALUES,
            guidance_scale=1.0,
            generator=generator,
            output_type="latent",
            return_dict=False,
        )

        # Latent upscale (video only; audio latents pass through unchanged).
        upscaled_video_latent = upsample_pipe(
            latents=video_latent,
            output_type="latent",
            return_dict=False,
        )[0]

        # Stage 2: short refinement pass on the upscaled latents, decoding to
        # numpy video frames and an audio waveform.
        video, audio = pipe_t2v(
            latents=upscaled_video_latent,
            audio_latents=audio_latent,
            prompt=prompt,
            negative_prompt=negative_prompt,
            num_inference_steps=3,
            noise_scale=STAGE_2_DISTILLED_SIGMA_VALUES[0],
            sigmas=STAGE_2_DISTILLED_SIGMA_VALUES,
            generator=generator,
            guidance_scale=1.0,
            output_type="np",
            return_dict=False,
        )

    # Convert float frames in [0, 1] to uint8 and hand off to the muxer.
    video = (video * 255).round().astype("uint8")
    video = torch.from_numpy(video)

    output_path = f"t2v_{seed}.mp4"
    encode_video(
        video[0],  # drop the batch dimension
        fps=FPS,
        audio=audio[0].float().cpu(),
        audio_sample_rate=pipe_t2v.vocoder.config.output_sampling_rate,
        output_path=output_path,
    )
    return output_path, seed


# ============================================================
# 🖥️ GRADIO UI (2 TABS)
# ============================================================
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ⚡ LTX-2 Distilled (H200 Optimized)")

    with gr.Tabs():
        # ------------------------------------------------
        # TAB 1 — IMAGE TO VIDEO
        # ------------------------------------------------
        with gr.Tab("🖼️ Image to Video"):
            with gr.Row():
                input_image = gr.Image(type="filepath")
                input_audio = gr.Audio(type="filepath")
            result_i2v = gr.Video()
            prompt_i2v = gr.Textbox(
                value="A person speaking naturally",
                lines=2,
            )
            duration_i2v = gr.Slider(1, 12, value=4)
            negative_i2v = gr.Textbox(value="low quality")
            seed_i2v = gr.Number(value=-1)
            btn_i2v = gr.Button("Generate I2V", variant="primary")
            btn_i2v.click(
                generate_i2v,
                inputs=[
                    input_image,
                    input_audio,
                    prompt_i2v,
                    negative_i2v,
                    duration_i2v,
                    seed_i2v,
                ],
                outputs=[result_i2v, seed_i2v],
            )

        # ------------------------------------------------
        # TAB 2 — TEXT TO VIDEO
        # ------------------------------------------------
        with gr.Tab("🎬 Text to Video"):
            result_t2v = gr.Video()
            prompt_t2v = gr.Textbox(
                value="A cinematic sunset over the ocean",
                lines=3,
            )
            duration_t2v = gr.Slider(1, 12, value=4)
            negative_t2v = gr.Textbox(
                value="low quality, distorted, glitchy",
            )
            seed_t2v = gr.Number(value=-1)
            btn_t2v = gr.Button("Generate T2V", variant="primary")
            btn_t2v.click(
                generate_t2v,
                inputs=[prompt_t2v, negative_t2v, duration_t2v, seed_t2v],
                outputs=[result_t2v, seed_t2v],
            )

if __name__ == "__main__":
    demo.queue().launch()