jiang-cc committed on
Commit
46a44c9
·
verified ·
1 Parent(s): 546d962

fix: simplify to single @spaces.GPU function, remove inference_mode decorator, fix type hint

Browse files
Files changed (2) hide show
  1. README.md +1 -1
  2. app.py +24 -24
README.md CHANGED
@@ -6,7 +6,7 @@ colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 5.23.0
8
  app_file: app.py
9
- hardware: a10g-small
10
  pinned: true
11
  license: apache-2.0
12
  models:
 
6
  sdk: gradio
7
  sdk_version: 5.23.0
8
  app_file: app.py
9
+ hardware: zero-a10g
10
  pinned: true
11
  license: apache-2.0
12
  models:
app.py CHANGED
@@ -4,6 +4,7 @@ AD-Copilot Demo: Comparison-Aware Anomaly Detection with Vision-Language Model
4
 
5
  import os
6
  import traceback
 
7
  import gradio as gr
8
  import torch
9
  from transformers import AutoModelForImageTextToText, AutoProcessor
@@ -11,8 +12,8 @@ from qwen_vl_utils import process_vision_info
11
  from PIL import Image
12
 
13
  # ---------------------------------------------------------------------------
14
- # Model loading use device_map="auto" to load directly on GPU
15
- # (ZeroGPU @spaces.GPU causes TypeError with custom trust_remote_code models)
16
  # ---------------------------------------------------------------------------
17
  MODEL_ID = "jiang-cc/AD-Copilot"
18
 
@@ -26,7 +27,6 @@ processor = AutoProcessor.from_pretrained(
26
  model = AutoModelForImageTextToText.from_pretrained(
27
  MODEL_ID,
28
  torch_dtype=torch.bfloat16,
29
- device_map="auto",
30
  trust_remote_code=True,
31
  ).eval()
32
 
@@ -34,11 +34,12 @@ model = AutoModelForImageTextToText.from_pretrained(
34
  # ---------------------------------------------------------------------------
35
  # Inference
36
  # ---------------------------------------------------------------------------
 
37
  def predict(
38
  reference_image: Image.Image,
39
  test_image: Image.Image,
40
  prompt: str,
41
- max_new_tokens: int,
42
  ):
43
  if reference_image is None or test_image is None:
44
  return "Please upload both a reference (good) image and a test image."
@@ -68,26 +69,25 @@ def predict(
68
  )
69
  image_inputs, video_inputs = process_vision_info(messages)
70
 
71
- with torch.inference_mode():
72
- inputs = processor(
73
- text=[text],
74
- images=image_inputs,
75
- videos=video_inputs,
76
- padding=True,
77
- return_tensors="pt",
78
- ).to(model.device)
79
-
80
- generated_ids = model.generate(
81
- **inputs, max_new_tokens=max_new_tokens, do_sample=False
82
- )
83
- generated_ids_trimmed = [
84
- out[len(inp) :] for inp, out in zip(inputs.input_ids, generated_ids)
85
- ]
86
- output = processor.batch_decode(
87
- generated_ids_trimmed,
88
- skip_special_tokens=True,
89
- clean_up_tokenization_spaces=False,
90
- )[0]
91
  return output
92
  except Exception as e:
93
  tb = traceback.format_exc()
 
4
 
5
  import os
6
  import traceback
7
+ import spaces
8
  import gradio as gr
9
  import torch
10
  from transformers import AutoModelForImageTextToText, AutoProcessor
 
12
  from PIL import Image
13
 
14
  # ---------------------------------------------------------------------------
15
+ # Model loading (happens once at Space startup; weights stay on CPU until
16
+ # @spaces.GPU moves them to GPU on demand)
17
  # ---------------------------------------------------------------------------
18
  MODEL_ID = "jiang-cc/AD-Copilot"
19
 
 
27
  model = AutoModelForImageTextToText.from_pretrained(
28
  MODEL_ID,
29
  torch_dtype=torch.bfloat16,
 
30
  trust_remote_code=True,
31
  ).eval()
32
 
 
34
  # ---------------------------------------------------------------------------
35
  # Inference
36
  # ---------------------------------------------------------------------------
37
+ @spaces.GPU(duration=120)
38
  def predict(
39
  reference_image: Image.Image,
40
  test_image: Image.Image,
41
  prompt: str,
42
+ max_new_tokens: float,
43
  ):
44
  if reference_image is None or test_image is None:
45
  return "Please upload both a reference (good) image and a test image."
 
69
  )
70
  image_inputs, video_inputs = process_vision_info(messages)
71
 
72
+ inputs = processor(
73
+ text=[text],
74
+ images=image_inputs,
75
+ videos=video_inputs,
76
+ padding=True,
77
+ return_tensors="pt",
78
+ ).to(model.device)
79
+
80
+ generated_ids = model.generate(
81
+ **inputs, max_new_tokens=max_new_tokens, do_sample=False
82
+ )
83
+ generated_ids_trimmed = [
84
+ out[len(inp) :] for inp, out in zip(inputs.input_ids, generated_ids)
85
+ ]
86
+ output = processor.batch_decode(
87
+ generated_ids_trimmed,
88
+ skip_special_tokens=True,
89
+ clean_up_tokenization_spaces=False,
90
+ )[0]
 
91
  return output
92
  except Exception as e:
93
  tb = traceback.format_exc()