jiang-cc committed on
Commit
46a44c9
·
verified ·
1 Parent(s): 546d962

fix: simplify to single @spaces.GPU function, remove inference_mode decorator, fix type hint

Browse files
Files changed (2) hide show
  1. README.md +1 -1
  2. app.py +24 -24
README.md CHANGED
@@ -6,7 +6,7 @@ colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 5.23.0
8
  app_file: app.py
9
- hardware: a10g-small
10
  pinned: true
11
  license: apache-2.0
12
  models:
 
6
  sdk: gradio
7
  sdk_version: 5.23.0
8
  app_file: app.py
9
+ hardware: zero-a10g
10
  pinned: true
11
  license: apache-2.0
12
  models:
app.py CHANGED
@@ -4,6 +4,7 @@ AD-Copilot Demo: Comparison-Aware Anomaly Detection with Vision-Language Model
4
 
5
  import os
6
  import traceback
 
7
  import gradio as gr
8
  import torch
9
  from transformers import AutoModelForImageTextToText, AutoProcessor
@@ -11,8 +12,8 @@ from qwen_vl_utils import process_vision_info
11
  from PIL import Image
12
 
13
  # ---------------------------------------------------------------------------
14
- # Model loading use device_map="auto" to load directly on GPU
15
- # (ZeroGPU @spaces.GPU causes TypeError with custom trust_remote_code models)
16
  # ---------------------------------------------------------------------------
17
  MODEL_ID = "jiang-cc/AD-Copilot"
18
 
@@ -26,7 +27,6 @@ processor = AutoProcessor.from_pretrained(
26
  model = AutoModelForImageTextToText.from_pretrained(
27
  MODEL_ID,
28
  torch_dtype=torch.bfloat16,
29
- device_map="auto",
30
  trust_remote_code=True,
31
  ).eval()
32
 
@@ -34,11 +34,12 @@ model = AutoModelForImageTextToText.from_pretrained(
34
  # ---------------------------------------------------------------------------
35
  # Inference
36
  # ---------------------------------------------------------------------------
 
37
  def predict(
38
  reference_image: Image.Image,
39
  test_image: Image.Image,
40
  prompt: str,
41
- max_new_tokens: int,
42
  ):
43
  if reference_image is None or test_image is None:
44
  return "Please upload both a reference (good) image and a test image."
@@ -68,26 +69,25 @@ def predict(
68
  )
69
  image_inputs, video_inputs = process_vision_info(messages)
70
 
71
- with torch.inference_mode():
72
- inputs = processor(
73
- text=[text],
74
- images=image_inputs,
75
- videos=video_inputs,
76
- padding=True,
77
- return_tensors="pt",
78
- ).to(model.device)
79
-
80
- generated_ids = model.generate(
81
- **inputs, max_new_tokens=max_new_tokens, do_sample=False
82
- )
83
- generated_ids_trimmed = [
84
- out[len(inp) :] for inp, out in zip(inputs.input_ids, generated_ids)
85
- ]
86
- output = processor.batch_decode(
87
- generated_ids_trimmed,
88
- skip_special_tokens=True,
89
- clean_up_tokenization_spaces=False,
90
- )[0]
91
  return output
92
  except Exception as e:
93
  tb = traceback.format_exc()
 
4
 
5
  import os
6
  import traceback
7
+ import spaces
8
  import gradio as gr
9
  import torch
10
  from transformers import AutoModelForImageTextToText, AutoProcessor
 
12
  from PIL import Image
13
 
14
  # ---------------------------------------------------------------------------
15
+ # Model loading (happens once at Space startup; weights stay on CPU until
16
+ # @spaces.GPU moves them to GPU on demand)
17
  # ---------------------------------------------------------------------------
18
  MODEL_ID = "jiang-cc/AD-Copilot"
19
 
 
27
  model = AutoModelForImageTextToText.from_pretrained(
28
  MODEL_ID,
29
  torch_dtype=torch.bfloat16,
 
30
  trust_remote_code=True,
31
  ).eval()
32
 
 
34
  # ---------------------------------------------------------------------------
35
  # Inference
36
  # ---------------------------------------------------------------------------
37
+ @spaces.GPU(duration=120)
38
  def predict(
39
  reference_image: Image.Image,
40
  test_image: Image.Image,
41
  prompt: str,
42
+ max_new_tokens: float,
43
  ):
44
  if reference_image is None or test_image is None:
45
  return "Please upload both a reference (good) image and a test image."
 
69
  )
70
  image_inputs, video_inputs = process_vision_info(messages)
71
 
72
+ inputs = processor(
73
+ text=[text],
74
+ images=image_inputs,
75
+ videos=video_inputs,
76
+ padding=True,
77
+ return_tensors="pt",
78
+ ).to(model.device)
79
+
80
+ generated_ids = model.generate(
81
+ **inputs, max_new_tokens=max_new_tokens, do_sample=False
82
+ )
83
+ generated_ids_trimmed = [
84
+ out[len(inp) :] for inp, out in zip(inputs.input_ids, generated_ids)
85
+ ]
86
+ output = processor.batch_decode(
87
+ generated_ids_trimmed,
88
+ skip_special_tokens=True,
89
+ clean_up_tokenization_spaces=False,
90
+ )[0]
 
91
  return output
92
  except Exception as e:
93
  tb = traceback.format_exc()