"""
model.py
Pretrained deepfake detector using:
  prithivMLmods/Deep-Fake-Detector-v2-Model
  Architecture : ViT (vit-base-patch16-224-in21k fine-tuned)
  Accuracy     : ~92%  (56,001-image test set)
  Labels       : "Realism" (real) | "Deepfake"

Weights download automatically on first run (~330 MB).
Cached in ~/.cache/huggingface/ by default — only downloaded once (until the cache is cleared).
"""

import torch
from transformers import ViTForImageClassification, ViTImageProcessor
from PIL import Image

# Identifier handed to from_pretrained() for both the processor and the model.
MODEL_ID = "prithivMLmods/Deep-Fake-Detector-v2-Model"

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Maps the checkpoint's label names to the names this module reports;
# "Realism" is the model's word for a real (non-fake) image.
LABEL_MAP = dict(Realism="Real", Deepfake="Deepfake")

# Lazily-populated singletons; load_model() fills them in on first use.
_processor = None
_model = None


def load_model():
    """
    Load model + processor into memory.
    Safe to call multiple times — only loads once (singleton).

    The module-level singletons are assigned only after every step
    (processor load, model load, move to DEVICE, eval mode) has succeeded.
    If any step raises, they stay unset and the next call retries from
    scratch instead of handing back a half-initialised model.

    Returns:
        (model, processor) — the cached ViTForImageClassification instance
        and its ViTImageProcessor.
    """
    global _model, _processor
    if _model is not None:
        return _model, _processor

    print(f"[model] Downloading / loading  {MODEL_ID}")
    print("[model] First run: ~330 MB download, takes ~1 min on a normal connection.")

    # Build everything in locals first so a failure cannot leave the globals
    # pointing at a model that was never moved to DEVICE.
    processor = ViTImageProcessor.from_pretrained(MODEL_ID)
    model = ViTForImageClassification.from_pretrained(MODEL_ID)
    model.to(DEVICE).eval()

    # Publish both singletons together now that initialisation is complete.
    _model, _processor = model, processor

    print(f"[model] Ready on {DEVICE}")
    return _model, _processor


def _infer(pil_images: list) -> tuple:
    """
    Run inference on a list of PIL Images.
    Averages softmax probabilities across all images (useful for video frames).

    Args:
        pil_images: Non-empty list (or tuple) of PIL Images. PIL images whose
            mode is not "RGB" (e.g. "RGBA", "L", "P") are converted to RGB
            before preprocessing. Any other kind of input is handed to the
            processor unchanged.

    Returns:
        (label: str, confidence: float)   e.g. ("Deepfake", 94.2)

    Raises:
        ValueError: if pil_images is an empty list or tuple.
    """
    if isinstance(pil_images, (list, tuple)):
        # Fail early with a clear message instead of letting an empty batch
        # reach the processor / model (there is nothing to average over).
        if not pil_images:
            raise ValueError("_infer() needs at least one image, got an empty sequence")

        # Alpha / palette / greyscale frames may otherwise reach the model
        # with a channel count it does not accept; RGB frames are untouched.
        pil_images = [
            img.convert("RGB")
            if isinstance(img, Image.Image) and img.mode != "RGB"
            else img
            for img in pil_images
        ]

    model, processor = load_model()

    # processor resizes to 224×224 and normalises — no manual transforms needed
    inputs = processor(images=pil_images, return_tensors="pt").to(DEVICE)

    with torch.no_grad():
        logits = model(**inputs).logits       # (N, 2)
        probs  = torch.softmax(logits, dim=1) # (N, 2)
        avg    = probs.mean(dim=0)            # (2,)  average across frames

    class_idx  = avg.argmax().item()
    confidence = round(avg[class_idx].item() * 100, 2)
    raw_label  = model.config.id2label[class_idx]   # "Realism" or "Deepfake"
    # Unknown label names are passed through unchanged.
    label      = LABEL_MAP.get(raw_label, raw_label)

    return label, confidence


def predict_image(pil_img: Image.Image) -> tuple:
    """
    Classify one image.

    Wraps the image in a one-element list and hands it to _infer().

    Returns:
        ("Real" | "Deepfake", confidence %)
    """
    single_frame_batch = [pil_img]
    return _infer(single_frame_batch)


def predict_video(pil_frames: list) -> tuple:
    """
    Classify a video from a list of its frames (PIL Images).

    The frames are passed to _infer(), which averages the per-frame softmax
    probabilities into a single verdict.

    Returns:
        ("Real" | "Deepfake", confidence %)
    """
    verdict = _infer(pil_frames)
    return verdict