Instructions to use Video-Reason/VBVR-Wan2.2 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Diffusers
How to use Video-Reason/VBVR-Wan2.2 with Diffusers:
```bash
pip install -U diffusers transformers accelerate
```

```python
import torch
from diffusers import DiffusionPipeline
from diffusers.utils import load_image, export_to_video

# switch to "mps" for apple devices
pipe = DiffusionPipeline.from_pretrained("Video-Reason/VBVR-Wan2.2", dtype=torch.bfloat16, device_map="cuda")
pipe.to("cuda")

prompt = "A man with short gray hair plays a red electric guitar."
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/guitar-man.png"
)

output = pipe(image=image, prompt=prompt).frames[0]
export_to_video(output, "output.mp4")
```

- Notebooks
- Google Colab
- Kaggle
| #!/usr/bin/env python3 | |
| """ | |
| VBVR-Wan2.2 Image-to-Video Inference Example | |
| Generate a video from a reference image using the VBVR-Wan2.2 model. | |
| Usage: | |
| python inference.py --model_path /path/to/VBVR-Wan2.2 | |
| """ | |
| import os | |
| import torch | |
| from PIL import Image | |
| from diffusers import WanImageToVideoPipeline, AutoencoderKLWan | |
| from diffusers.utils import export_to_video | |
| # โโโโโโโโโโโโโโโ Configuration (only change model_path) โโโโโโโโโโโโโโโ | |
| import argparse | |
| parser = argparse.ArgumentParser() | |
| parser.add_argument("--model_path", type=str, default="VBVR-Wan2.2") | |
| args = parser.parse_args() | |
| model_path = args.model_path | |
| # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
| # Paths derived from model_path | |
| image_path = os.path.join(model_path, "assets", "first_frame.png") | |
| output_path = "output.mp4" | |
| # Prompt | |
| prompt = ( | |
| "The scene contains two types of shapes, each type has three shapes of " | |
| "different sizes arranged randomly. Keep all shapes unchanged in appearance " | |
| "(type, size, and color). Only rearrange their positions: first group the " | |
| "shapes by type, then within each group, sort the shapes from smallest to " | |
| "largest (left to right), and arrange all shapes in a single horizontal " | |
| "line from left to right." | |
| ) | |
| negative_prompt = ( | |
| "่ฒ่ฐ่ณไธฝ๏ผ่ฟๆ๏ผ้ๆ๏ผ็ป่ๆจก็ณไธๆธ ๏ผๅญๅน๏ผ้ฃๆ ผ๏ผไฝๅ๏ผ็ปไฝ๏ผ็ป้ข๏ผ้ๆญข๏ผ" | |
| "ๆดไฝๅ็ฐ๏ผๆๅทฎ่ดจ้๏ผไฝ่ดจ้๏ผJPEGๅ็ผฉๆฎ็๏ผไธ้็๏ผๆฎ็ผบ็๏ผๅคไฝ็ๆๆ๏ผ" | |
| "็ปๅพไธๅฅฝ็ๆ้จ๏ผ็ปๅพไธๅฅฝ็่ธ้จ๏ผ็ธๅฝข็๏ผๆฏๅฎน็๏ผๅฝขๆ็ธๅฝข็่ขไฝ๏ผๆๆ่ๅ๏ผ" | |
| "้ๆญขไธๅจ็็ป้ข๏ผๆไนฑ็่ๆฏ๏ผไธๆก่ ฟ๏ผ่ๆฏไบบๅพๅค๏ผๅ็่ตฐ" | |
| ) | |
| # Generation settings | |
| num_frames = 96 | |
| num_inference_steps = 50 | |
| guidance_scale = 5.0 | |
| seed = 1 | |
| # โโโโโโโโโโโโโโโโโโโโโโโโ Load Pipeline โโโโโโโโโโโโโโโโโโโโโโโโ | |
| print(f"Loading model from: {model_path}") | |
| vae = AutoencoderKLWan.from_pretrained( | |
| model_path, subfolder="vae", torch_dtype=torch.float32 | |
| ) | |
| pipe = WanImageToVideoPipeline.from_pretrained( | |
| model_path, | |
| vae=vae, | |
| torch_dtype=torch.bfloat16, | |
| ) | |
| pipe.enable_model_cpu_offload() | |
| print(f"Pipeline loaded. boundary_ratio = {pipe.config.boundary_ratio}") | |
| # โโโโโโโโโโโโโโโโโโโโโโโโ Load Image โโโโโโโโโโโโโโโโโโโโโโโโ | |
| print(f"Loading image: {image_path}") | |
| image = Image.open(image_path).convert("RGB") | |
| width, height = image.size | |
| print(f"Image size: {width}x{height}") | |
| # โโโโโโโโโโโโโโโโโโโโโโโโ Generate Video โโโโโโโโโโโโโโโโโโโโโโโโ | |
| print(f"Generating video: {num_frames} frames @ {width}x{height}, {num_inference_steps} steps") | |
| generator = torch.Generator(device="cpu").manual_seed(seed) | |
| output = pipe( | |
| image=image, | |
| prompt=prompt, | |
| negative_prompt=negative_prompt, | |
| height=height, | |
| width=width, | |
| num_frames=num_frames, | |
| num_inference_steps=num_inference_steps, | |
| guidance_scale=guidance_scale, | |
| generator=generator, | |
| ) | |
| export_to_video(output.frames[0], output_path, fps=16) | |
| print(f"Video saved to: {output_path}") | |