"""Gradio Space demo for LTX-2 19B distilled: image+audio→video and text→video.

Loads two pipelines at import time (requires a CUDA GPU):
  * a custom audio-conditioned image-to-video pipeline, with a detailer
    IC-LoRA fused in at scale 0.8;
  * the stock LTX-2 text-to-video pipeline plus a latent upsampler for a
    two-stage (base → upscale → refine) generation.
"""
import spaces
import gradio as gr
import torch
from diffusers import DiffusionPipeline
from diffusers.utils import load_image, export_to_video
from diffusers.pipelines.ltx2 import LTX2Pipeline, LTX2LatentUpsamplePipeline
from diffusers.pipelines.ltx2.latent_upsampler import LTX2LatentUpsamplerModel
from diffusers.pipelines.ltx2.utils import (
    DISTILLED_SIGMA_VALUES,
    STAGE_2_DISTILLED_SIGMA_VALUES,
)
from diffusers.pipelines.ltx2.export_utils import encode_video
import random
# NOTE(review): numpy, moviepy and PIL are imported but unused below — kept in
# case other tooling relies on them; confirm before removing.
import numpy as np
from moviepy import ImageSequenceClip, AudioFileClip, VideoFileClip
from PIL import Image, ImageOps

# ============================================================
# 🔥 GLOBAL PERFORMANCE SETTINGS (H200 OPTIMIZED)
# ============================================================
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.set_grad_enabled(False)  # inference-only process; never build autograd graphs
torch.backends.cuda.enable_flash_sdp(True)
torch.backends.cuda.enable_mem_efficient_sdp(True)

DEVICE = "cuda"
DTYPE = torch.bfloat16
MODEL_ID = "rootonchair/LTX-2-19b-distilled"

# Output resolution and frame rate shared by both generation paths.
WIDTH = 768
HEIGHT = 512
FPS = 24

# ============================================================
# 🚀 LOAD IMAGE + AUDIO PIPELINE
# ============================================================
print("🚀 Loading Audio-to-Video pipeline...")
pipe_i2v = DiffusionPipeline.from_pretrained(
    MODEL_ID,
    custom_pipeline="multimodalart/ltx2-audio-to-video",
    torch_dtype=DTYPE,
).to(DEVICE)

# xFormers is optional; PyTorch's built-in SDPA (enabled above) is the fallback.
try:
    pipe_i2v.enable_xformers_memory_efficient_attention()
    print("✅ xFormers enabled")
except Exception as e:
    print("⚠️ xFormers not available, using PyTorch SDPA instead.")
    print(e)

# Fuse the detailer LoRA permanently into the weights, then drop the adapter
# bookkeeping so inference runs at plain-model speed.
pipe_i2v.load_lora_weights(
    "Lightricks/LTX-2-19b-IC-LoRA-Detailer",
    adapter_name="camera_control",
)
pipe_i2v.fuse_lora(lora_scale=0.8)
pipe_i2v.unload_lora_weights()
print("✅ Image+Audio pipeline loaded")

# ============================================================
# 🚀 LOAD TEXT TO VIDEO PIPELINE
# ============================================================
print("🚀 Loading Text-to-Video pipeline...")
pipe_t2v = LTX2Pipeline.from_pretrained(
    MODEL_ID,
    torch_dtype=DTYPE,
).to(DEVICE)

latent_upsampler = LTX2LatentUpsamplerModel.from_pretrained(
    MODEL_ID,
    subfolder="latent_upsampler",
    torch_dtype=DTYPE,
).to(DEVICE)

# The upsample pipeline reuses the t2v VAE rather than loading a second copy.
upsample_pipe = LTX2LatentUpsamplePipeline(
    vae=pipe_t2v.vae,
    latent_upsampler=latent_upsampler,
).to(DEVICE)
print("✅ Text-to-Video pipeline loaded")


def _resolve_seed(seed):
    """Return `seed` unchanged, or a fresh random seed when the sentinel -1 is given."""
    if seed == -1:
        return random.randint(0, 1_000_000)
    return seed


def _snap_num_frames(duration, fps):
    """Snap `duration` seconds at `fps` to the nearest valid LTX frame count.

    The model expects num_frames ≡ 1 (mod 8); the minimum usable count is 9.
    """
    total_frames = int(duration * fps)
    return max((round(total_frames / 8) * 8) + 1, 9)


# ============================================================
# 🎬 IMAGE → VIDEO GENERATION
# ============================================================
@spaces.GPU(duration=85, size='xlarge')
def generate_i2v(image_path, audio_path, prompt, negative_prompt, duration, seed):
    """Generate a talking video from a still image plus a driving audio track.

    Args:
        image_path: path to the conditioning image (required).
        audio_path: path to the driving audio file (required).
        prompt / negative_prompt: text conditioning.
        duration: requested clip length in seconds.
        seed: RNG seed; -1 picks a random one.

    Returns:
        (output_video_path, seed_actually_used)

    Raises:
        gr.Error: if either the image or the audio input is missing.
    """
    if not image_path or not audio_path:
        raise gr.Error("Please provide both image and audio.")

    seed = _resolve_seed(seed)
    generator = torch.Generator(device=DEVICE).manual_seed(seed)

    image = load_image(image_path)
    num_frames = _snap_num_frames(duration, FPS)

    with torch.inference_mode():
        video_output, _ = pipe_i2v(
            image=image,
            audio=audio_path,
            prompt=prompt,
            negative_prompt=negative_prompt,
            width=WIDTH,
            height=HEIGHT,
            num_frames=num_frames,
            frame_rate=FPS,
            num_inference_steps=8,  # distilled model: few-step schedule
            sigmas=DISTILLED_SIGMA_VALUES,
            guidance_scale=1.0,  # distilled models run without CFG
            generator=generator,
            return_dict=False,
        )

    output_path = f"i2v_{seed}.mp4"
    export_to_video(video_output, output_path, fps=FPS)
    return output_path, seed


# ============================================================
# 🎬 TEXT → VIDEO GENERATION
# ============================================================
@spaces.GPU(duration=85, size='xlarge')
def generate_t2v(prompt, negative_prompt, duration, seed):
    """Generate a video (with model-generated audio) from a text prompt.

    Three-stage process: base generation in latent space, latent upsampling,
    then a short refinement pass that also decodes video + audio.

    Args:
        prompt / negative_prompt: text conditioning.
        duration: requested clip length in seconds.
        seed: RNG seed; -1 picks a random one.

    Returns:
        (output_video_path, seed_actually_used)
    """
    seed = _resolve_seed(seed)
    generator = torch.Generator(device=DEVICE).manual_seed(seed)

    num_frames = _snap_num_frames(duration, FPS)

    with torch.inference_mode():
        # Stage 1: base generation, kept in latent space.
        video_latent, audio_latent = pipe_t2v(
            prompt=prompt,
            negative_prompt=negative_prompt,
            width=WIDTH,
            height=HEIGHT,
            num_frames=num_frames,
            frame_rate=FPS,
            num_inference_steps=8,
            sigmas=DISTILLED_SIGMA_VALUES,
            guidance_scale=1.0,
            generator=generator,
            output_type="latent",
            return_dict=False,
        )

        # Latent upscale (video only; audio latents pass through unchanged).
        upscaled_video_latent = upsample_pipe(
            latents=video_latent,
            output_type="latent",
            return_dict=False,
        )[0]

        # Stage 2: short refinement pass on the upscaled latents, decoding to
        # numpy video frames and an audio waveform.
        video, audio = pipe_t2v(
            latents=upscaled_video_latent,
            audio_latents=audio_latent,
            prompt=prompt,
            negative_prompt=negative_prompt,
            num_inference_steps=3,
            noise_scale=STAGE_2_DISTILLED_SIGMA_VALUES[0],
            sigmas=STAGE_2_DISTILLED_SIGMA_VALUES,
            generator=generator,
            guidance_scale=1.0,
            output_type="np",
            return_dict=False,
        )

    # Convert float frames in [0, 1] to uint8 and hand off to the muxer.
    video = (video * 255).round().astype("uint8")
    video = torch.from_numpy(video)

    output_path = f"t2v_{seed}.mp4"
    encode_video(
        video[0],  # drop the batch dimension
        fps=FPS,
        audio=audio[0].float().cpu(),
        audio_sample_rate=pipe_t2v.vocoder.config.output_sampling_rate,
        output_path=output_path,
    )
    return output_path, seed


# ============================================================
# 🖥️ GRADIO UI (2 TABS)
# ============================================================
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ⚡ LTX-2 Distilled (H200 Optimized)")

    with gr.Tabs():
        # ------------------------------------------------
        # TAB 1 — IMAGE TO VIDEO
        # ------------------------------------------------
        with gr.Tab("🖼️ Image to Video"):
            with gr.Row():
                input_image = gr.Image(type="filepath")
                input_audio = gr.Audio(type="filepath")
            result_i2v = gr.Video()
            prompt_i2v = gr.Textbox(
                value="A person speaking naturally",
                lines=2,
            )
            duration_i2v = gr.Slider(1, 12, value=4)
            negative_i2v = gr.Textbox(value="low quality")
            seed_i2v = gr.Number(value=-1)
            btn_i2v = gr.Button("Generate I2V", variant="primary")
            btn_i2v.click(
                generate_i2v,
                inputs=[
                    input_image,
                    input_audio,
                    prompt_i2v,
                    negative_i2v,
                    duration_i2v,
                    seed_i2v,
                ],
                outputs=[result_i2v, seed_i2v],
            )

        # ------------------------------------------------
        # TAB 2 — TEXT TO VIDEO
        # ------------------------------------------------
        with gr.Tab("🎬 Text to Video"):
            result_t2v = gr.Video()
            prompt_t2v = gr.Textbox(
                value="A cinematic sunset over the ocean",
                lines=3,
            )
            duration_t2v = gr.Slider(1, 12, value=4)
            negative_t2v = gr.Textbox(
                value="low quality, distorted, glitchy",
            )
            seed_t2v = gr.Number(value=-1)
            btn_t2v = gr.Button("Generate T2V", variant="primary")
            btn_t2v.click(
                generate_t2v,
                inputs=[prompt_t2v, negative_t2v, duration_t2v, seed_t2v],
                outputs=[result_t2v, seed_t2v],
            )

if __name__ == "__main__":
    demo.queue().launch()