Video-Reason
/

VBVR-Wan2.2

Model card Files Files and versions

VBVR-Wan2.2 / example.py

wruisi's picture

Upload example.py with huggingface_hub

0970deb verified 4 months ago

history blame contribute delete

3.53 kB

	#!/usr/bin/env python3
	"""
	VBVR-Wan2.2 Image-to-Video Inference Example

	Generate a video from a reference image using the VBVR-Wan2.2 model.

	The reference image is read from <model_path>/assets/first_frame.png and
	the generated video is written to output.mp4 in the current directory.

	Usage:
	    python example.py --model_path /path/to/VBVR-Wan2.2
	"""

	import argparse
	import os

	import torch
	from diffusers import WanImageToVideoPipeline, AutoencoderKLWan
	from diffusers.utils import export_to_video
	from PIL import Image

	# ─────────────── Configuration (only change model_path) ───────────────
	# Everything in this section is a user-editable setting. Normally only
	# --model_path has to be supplied on the command line; the values below
	# describe the sample task that ships with the model.

	# Where the generated video is written (relative to the current directory).
	output_path = "output.mp4"

	# Prompt
	prompt = (
	    "The scene contains two types of shapes, each type has three shapes of "
	    "different sizes arranged randomly. Keep all shapes unchanged in appearance "
	    "(type, size, and color). Only rearrange their positions: first group the "
	    "shapes by type, then within each group, sort the shapes from smallest to "
	    "largest (left to right), and arrange all shapes in a single horizontal "
	    "line from left to right."
	)
	# Negative prompt (Chinese). Rough gloss: garish colours, overexposed, static,
	# blurry details, subtitles, style/artwork/painting, still frame, greyish
	# overall, worst/low quality, JPEG artifacts, ugly, mutilated, extra fingers,
	# badly drawn hands/faces, deformed, disfigured, malformed limbs, fused
	# fingers, motionless picture, cluttered background, three legs, crowded
	# background, walking backwards.
	negative_prompt = (
	    "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，"
	    "整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，"
	    "画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，"
	    "静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走"
	)

	# Generation settings
	# NOTE(review): the Wan pipelines work on frame counts of the form 4k + 1;
	# 96 is presumably rounded (to 97) inside the pipeline - confirm against the
	# diffusers version in use before changing it.
	num_frames = 96
	num_inference_steps = 50
	guidance_scale = 5.0
	seed = 1
	# Frame rate of the exported video file.
	fps = 16
	# The Wan image-to-video pipeline rejects heights/widths that are not
	# divisible by 16, so the input image is snapped down to this multiple.
	size_multiple = 16

	# ──────────────────────────────────────────────────────────────────────


	def parse_args(argv=None):
	    """Parse the command line.

	    Args:
	        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

	    Returns:
	        The parsed namespace; ``model_path`` is the checkpoint directory.
	    """
	    parser = argparse.ArgumentParser(
	        description="VBVR-Wan2.2 image-to-video inference example."
	    )
	    parser.add_argument(
	        "--model_path",
	        type=str,
	        default="VBVR-Wan2.2",
	        help="Directory containing the VBVR-Wan2.2 checkpoint and assets/.",
	    )
	    return parser.parse_args(argv)


	def fit_size(width, height, multiple=size_multiple):
	    """Round a frame size down to the nearest multiple of *multiple*.

	    Sizes that already satisfy the constraint are returned unchanged.

	    Args:
	        width: Image width in pixels.
	        height: Image height in pixels.
	        multiple: Required divisor of both dimensions (must be positive).

	    Returns:
	        A ``(width, height)`` tuple whose entries are divisible by *multiple*.

	    Raises:
	        ValueError: If *multiple* is not positive, or if either dimension is
	            smaller than *multiple* (it would round down to zero).
	    """
	    if multiple <= 0:
	        raise ValueError(f"multiple must be positive, got {multiple}")
	    fitted = (width // multiple * multiple, height // multiple * multiple)
	    if 0 in fitted:
	        raise ValueError(
	            f"Image size {width}x{height} is too small; each side must be "
	            f"at least {multiple} pixels."
	        )
	    return fitted


	# ──────────────────────── Load Pipeline ────────────────────────


	def load_pipeline(model_path):
	    """Load the image-to-video pipeline from *model_path*.

	    The VAE is loaded separately in float32 while the rest of the pipeline
	    is loaded in bfloat16, and model CPU offload is enabled on the result.

	    Args:
	        model_path: Checkpoint directory (must contain a ``vae`` subfolder).

	    Returns:
	        The ready-to-call ``WanImageToVideoPipeline``.
	    """
	    print(f"Loading model from: {model_path}")

	    vae = AutoencoderKLWan.from_pretrained(
	        model_path, subfolder="vae", torch_dtype=torch.float32
	    )
	    pipe = WanImageToVideoPipeline.from_pretrained(
	        model_path,
	        vae=vae,
	        torch_dtype=torch.bfloat16,
	    )
	    pipe.enable_model_cpu_offload()

	    print(f"Pipeline loaded. boundary_ratio = {pipe.config.boundary_ratio}")
	    return pipe


	# ──────────────────────── Load Image ────────────────────────


	def load_image(image_path):
	    """Load the first frame as RGB, resized if needed to a valid frame size.

	    Args:
	        image_path: Path of the reference image.

	    Returns:
	        An RGB ``PIL.Image.Image`` whose width and height are multiples of
	        ``size_multiple``.

	    Raises:
	        ValueError: If the image is smaller than ``size_multiple`` on a side.
	    """
	    print(f"Loading image: {image_path}")
	    # convert() yields an independent in-memory image, so the file handle can
	    # be released as soon as the with-block ends.
	    with Image.open(image_path) as source:
	        image = source.convert("RGB")
	    width, height = image.size
	    print(f"Image size: {width}x{height}")

	    target = fit_size(width, height)
	    if target != (width, height):
	        # Only touch the pixels when the original size would be rejected.
	        print(
	            f"Resizing image to {target[0]}x{target[1]} "
	            f"(sizes must be multiples of {size_multiple})"
	        )
	        image = image.resize(target)
	    return image


	# ──────────────────────── Generate Video ────────────────────────


	def main(argv=None):
	    """Run the example: load model and image, generate, and save the video.

	    Args:
	        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

	    Raises:
	        FileNotFoundError: If the reference image does not exist.
	    """
	    args = parse_args(argv)
	    model_path = args.model_path

	    # Paths derived from model_path
	    image_path = os.path.join(model_path, "assets", "first_frame.png")
	    # Fail fast: loading the pipeline is slow, so check the input first.
	    if not os.path.isfile(image_path):
	        raise FileNotFoundError(f"Reference image not found: {image_path}")

	    pipe = load_pipeline(model_path)
	    image = load_image(image_path)
	    width, height = image.size

	    print(f"Generating video: {num_frames} frames @ {width}x{height}, {num_inference_steps} steps")
	    # A seeded CPU generator keeps the sampling noise reproducible.
	    generator = torch.Generator(device="cpu").manual_seed(seed)

	    output = pipe(
	        image=image,
	        prompt=prompt,
	        negative_prompt=negative_prompt,
	        height=height,
	        width=width,
	        num_frames=num_frames,
	        num_inference_steps=num_inference_steps,
	        guidance_scale=guidance_scale,
	        generator=generator,
	    )

	    # frames[0] is the first (and only) video of the batch.
	    export_to_video(output.frames[0], output_path, fps=fps)
	    print(f"Video saved to: {output_path}")


	if __name__ == "__main__":
	    main()