vicliv committed on
Commit
af9f47e
·
1 Parent(s): 419e73d

added screenshot cropping

Browse files
app/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (139 Bytes). View file
 
app/__pycache__/main.cpython-310.pyc ADDED
Binary file (3.95 kB). View file
 
app/__pycache__/screenshot.cpython-310.pyc ADDED
Binary file (29.3 kB). View file
 
app/main.py CHANGED
@@ -1,12 +1,14 @@
1
  import io
 
2
  import tempfile
3
  from pathlib import Path
4
 
5
  from fastapi import FastAPI, File, HTTPException, UploadFile
6
  from fastapi.staticfiles import StaticFiles
7
- from PIL import Image
8
 
9
  from .model import load_detector, predict_image
 
10
  from .video import sample_frames
11
 
12
  MAX_IMAGE_SIZE_MB = 50
@@ -24,6 +26,51 @@ def warmup():
24
  load_detector()
25
 
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  @app.post("/api/predict")
28
  async def predict(file: UploadFile = File(...)):
29
  content_type = (file.content_type or "").lower()
@@ -37,12 +84,14 @@ async def predict(file: UploadFile = File(...)):
37
  image = Image.open(io.BytesIO(raw))
38
  except Exception:
39
  raise HTTPException(400, "Invalid image")
40
- p_fake = predict_image(image)
 
41
  return {
42
  "media_type": "image",
43
  "p_fake": p_fake,
44
  "reliability": 1.0 - p_fake,
45
  "n_frames": 1,
 
46
  }
47
 
48
  if content_type in VIDEO_TYPES:
 
1
  import io
2
+ import random
3
  import tempfile
4
  from pathlib import Path
5
 
6
  from fastapi import FastAPI, File, HTTPException, UploadFile
7
  from fastapi.staticfiles import StaticFiles
8
+ from PIL import Image, ImageOps
9
 
10
  from .model import load_detector, predict_image
11
+ from .screenshot import preprocess
12
  from .video import sample_frames
13
 
14
  MAX_IMAGE_SIZE_MB = 50
 
26
  load_detector()
27
 
28
 
29
def _predict_with_preprocess(image: Image.Image) -> dict:
    """Run the screenshot-aware prediction pipeline on a single image.

    Returns a dict with p_fake, the preprocessing status, and the crop boxes
    in the EXIF-rotated coordinate frame so the frontend can overlay them on
    the user-visible image.

    Keys always present: "preprocess_status", "image_size", "crop_box" and
    "p_fake". "n_crops" is added when the status is "cropped" and
    "raw_p_fake" when the status is "text_only".
    """
    # Apply EXIF rotation up front so crop_box coords and image_size are in
    # the same frame as the browser-rendered image.
    image = ImageOps.exif_transpose(image)
    width, height = image.size
    result = preprocess(image)

    # Normalise crop_box to a list of lists (or None) so it is JSON-friendly
    # regardless of whether preprocess() returned one box or several.
    crop_box = None
    if result.crop_box is not None:
        boxes = result.crop_box if isinstance(result.crop_box, list) else [result.crop_box]
        crop_box = [list(b) for b in boxes]

    base = {
        "preprocess_status": result.status,
        "image_size": [width, height],
        "crop_box": crop_box,
    }

    if result.status == "cropped":
        crops = result.image if isinstance(result.image, list) else [result.image]
        crops = [c for c in crops if c is not None]
        if not crops:
            # Nothing usable was extracted; averaging an empty list would
            # divide by zero, so score the full image instead.
            return {**base, "p_fake": predict_image(image), "n_crops": 0}
        probs = [predict_image(c) for c in crops]
        p_fake = sum(probs) / len(probs)
        return {**base, "p_fake": p_fake, "n_crops": len(crops)}

    if result.status == "text_only":
        raw_p_fake = predict_image(image)
        # The detector is unreliable on pure-text screenshots and tends to
        # flag them as AI-generated. If it leans "AI", soften to uncertain;
        # if it leans "real", keep the score.
        # NOTE(review): random.uniform makes the reported p_fake differ
        # between identical requests; raw_p_fake carries the stable value.
        if raw_p_fake > 0.5:
            p_fake = random.uniform(0.4, 0.6)
        else:
            p_fake = raw_p_fake
        return {**base, "p_fake": p_fake, "raw_p_fake": raw_p_fake}

    p_fake = predict_image(image)
    return {**base, "p_fake": p_fake}
72
+
73
+
74
  @app.post("/api/predict")
75
  async def predict(file: UploadFile = File(...)):
76
  content_type = (file.content_type or "").lower()
 
84
  image = Image.open(io.BytesIO(raw))
85
  except Exception:
86
  raise HTTPException(400, "Invalid image")
87
+ pred = _predict_with_preprocess(image)
88
+ p_fake = pred["p_fake"]
89
  return {
90
  "media_type": "image",
91
  "p_fake": p_fake,
92
  "reliability": 1.0 - p_fake,
93
  "n_frames": 1,
94
+ **{k: v for k, v in pred.items() if k != "p_fake"},
95
  }
96
 
97
  if content_type in VIDEO_TYPES:
app/screenshot.py ADDED
@@ -0,0 +1,1145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Screenshot preprocessing pipeline.
2
+
3
+ Given an input image, decides whether it is a screenshot containing an
4
+ embedded photograph/video that should be cropped out before running the
5
+ detector. Returns a `PreprocessResult` describing the decision:
6
+
7
+ - status="full": not a screenshot, feed the original image through
8
+ - status="cropped": one or more embedded media regions were extracted
9
+ - status="text_only": screenshot is essentially text (tweet, doc, ...)
10
+
11
+ NOTE: Calls `tesseract` via subprocess to avoid pytesseract's pandas
12
+ dependency, which conflicts with the current numpy environment.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import os
17
+ import subprocess
18
+ import tempfile
19
+ from dataclasses import dataclass
20
+ from typing import Optional
21
+
22
+ import cv2
23
+ import numpy as np
24
+ from PIL import Image, ImageOps
25
+
26
+
27
+ # ──────────────────────────────────────────────────────────────
28
+ # Result
29
+ # ──────────────────────────────────────────────────────────────
30
+
31
@dataclass
class PreprocessResult:
    # Outcome of the screenshot preprocessing decision described in the
    # module docstring.

    # Image(s) to feed to the detector; a list when several regions were
    # extracted. May be None.
    image: Optional[Image.Image | list[Image.Image]]
    # One of "full", "cropped" or "text_only" (see module docstring).
    status: str
    # Crop rectangle(s) in the input image, or None.
    # NOTE(review): presumably (x, y, w, h) like the other boxes in this
    # module — confirm against preprocess().
    crop_box: Optional[tuple | list[tuple]]
    # NOTE(review): presumably the fraction of the image covered by OCR text
    # (compare TEXT_ONLY_FRACTION) — confirm against preprocess().
    text_fraction: float
    # Free-form diagnostic information.
    debug: dict
38
+
39
+
40
+ # ──────────────────────────────────────────────────────────────
41
+ # Tuning parameters
42
+ # ──────────────────────────────────────────────────────────────
43
+
44
+ TEXT_ONLY_FRACTION = 0.10
45
+ EMBEDDED_MIN_AREA = 0.12
46
+ SECOND_PASS_MIN_AREA = 0.20
47
+ SECOND_PASS_MIN_SHRINK = 0.02
48
+
49
+
50
+ # ──────────────────────────────────────────────────────────────
51
+ # OCR via tesseract subprocess
52
+ # ──────────────────────────────────────────────────────────────
53
+
54
def _parse_tesseract_tsv(tsv_text: str, min_conf: int) -> list[tuple]:
    """Extract (x, y, w, h) boxes of confident, non-empty words from TSV text."""
    lines = tsv_text.strip().split("\n")
    if len(lines) < 2:
        return []

    header = lines[0].split("\t")
    try:
        idx_left = header.index("left")
        idx_top = header.index("top")
        idx_width = header.index("width")
        idx_height = header.index("height")
        idx_conf = header.index("conf")
        idx_text = header.index("text")
    except ValueError:
        print("[screenshot] unexpected tesseract TSV header")
        return []

    last_needed = max(idx_left, idx_top, idx_width, idx_height, idx_conf, idx_text)
    boxes = []
    for line in lines[1:]:
        cols = line.split("\t")
        if len(cols) <= last_needed:
            continue
        # Rows without text are layout rows (page/block/line), not words.
        if not cols[idx_text].strip():
            continue
        try:
            conf = int(float(cols[idx_conf]))
        except (ValueError, TypeError, OverflowError):
            continue
        if conf < min_conf:
            continue
        try:
            box = (
                int(cols[idx_left]),
                int(cols[idx_top]),
                int(cols[idx_width]),
                int(cols[idx_height]),
            )
        except ValueError:
            # Malformed geometry on one row must not abort the whole parse.
            continue
        boxes.append(box)
    return boxes


def run_tesseract(image: np.ndarray, min_conf: int = 30) -> list[tuple]:
    """Call `tesseract` CLI, parse TSV output, return (x, y, w, h) boxes.

    Args:
        image: Pixel array accepted by ``PIL.Image.fromarray``.
        min_conf: Words whose tesseract confidence is below this value are
            dropped.

    Returns:
        One ``(left, top, width, height)`` tuple per recognised word. An
        empty list is returned when the binary is missing, the call times
        out, tesseract exits non-zero, or the output cannot be parsed.
    """
    # Only the path is needed: close our handle immediately so PIL and the
    # tesseract process can reopen the file by name on every platform.
    tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
    tmp.close()
    try:
        Image.fromarray(image).save(tmp.name)
        result = subprocess.run(
            ["tesseract", tmp.name, "stdout", "--psm", "3", "tsv"],
            capture_output=True,
            text=True,
            timeout=30,
        )
    except FileNotFoundError:
        print("[screenshot] tesseract binary not found")
        return []
    except subprocess.TimeoutExpired:
        print("[screenshot] tesseract timed out")
        return []
    finally:
        os.unlink(tmp.name)

    if result.returncode != 0:
        print(f"[screenshot] tesseract error: {result.stderr.strip()}")
        return []

    return _parse_tesseract_tsv(result.stdout, min_conf)
115
+
116
+
117
+ # ──────────────────────────────────────────────────────────────
118
+ # Tier 1: cheap screenshot signals
119
+ # ──────────────────────────────────────────────────────────────
120
+
121
def _border_uniformity(gray: np.ndarray) -> float:
    """Return the lowest standard deviation among the four border strips.

    The strip thickness is 1/50 of the shorter image side, but at least
    8 pixels. A low value means at least one edge of the image is flat.
    """
    height, width = gray.shape
    thickness = max(8, min(height, width) // 50)
    edge_strips = (
        gray[:thickness, :],    # top
        gray[-thickness:, :],   # bottom
        gray[:, :thickness],    # left
        gray[:, -thickness:],   # right
    )
    return float(min(edge.std() for edge in edge_strips))
129
+
130
+
131
def _is_candidate_screenshot(image: np.ndarray) -> dict:
    """Cheap first-tier test for "could this image be a screenshot?".

    Returns a dict with the rounded height/width ratio ("aspect_ratio"),
    the rounded border uniformity ("border_std"), the verdict
    ("is_candidate") and a human-readable "reason".
    """
    height, width = image.shape[:2]
    aspect = height / width

    if image.ndim == 3:
        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    else:
        gray = image
    border_std = _border_uniformity(gray)

    if aspect > 1.9:
        # Modern phone screenshots are 19.5:9 or 20:9 (≥ 2.0). 16:9 portrait
        # photos (1.78) fall through to the border_std check so natural photos
        # don't get cropped just for being tall.
        verdict = True
        why = f"tall aspect ratio ({aspect:.2f} > 1.9)"
    elif aspect < 0.45:
        verdict = True
        why = f"wide aspect ratio ({aspect:.2f} < 0.45)"
    elif 0.5 <= aspect <= 0.8:
        # Desktop screenshot aspect (16:9, 16:10, etc.). These have decorated
        # borders (menu bar, dock, tabs) so border_std is uninformative — let
        # Tier 2 decide on its own.
        verdict = True
        why = f"desktop aspect ratio ({aspect:.2f})"
    elif border_std < 3.0:
        verdict = True
        why = f"uniform border (std={border_std:.2f} < 3.0)"
    else:
        verdict = False
        why = "natural photo (no screenshot signals)"

    return {
        "aspect_ratio": round(aspect, 3),
        "border_std": round(border_std, 2),
        "is_candidate": verdict,
        "reason": why,
    }
167
+
168
+
169
+ # ──────────────────────────────────────────────────────────────
170
+ # Crop refinement: trim / expand
171
+ # ──────────────────────────────────────────────────────────────
172
+
173
def _refine_crop(gray: np.ndarray, x: int, y: int, bw: int, bh: int,
                 strip: int = 8, var_threshold: float = 8.0) -> tuple:
    """Tighten a crop box by trimming uniform (low-variance) strips from edges.

    Edges are processed in the order top, bottom, left, right. Each edge is
    peeled ``strip`` pixels at a time while the strip's standard deviation is
    below ``var_threshold`` and the box stays larger than three strips in
    that dimension. Returns the refined ``(x, y, bw, bh)``.
    """
    _rows, _cols = gray.shape  # unpacking fails unless gray is 2-D
    shortest = strip * 3

    # Top edge.
    while bh > shortest and gray[y:y + strip, x:x + bw].std() < var_threshold:
        y += strip
        bh -= strip

    # Bottom edge.
    while bh > shortest and gray[y + bh - strip:y + bh, x:x + bw].std() < var_threshold:
        bh -= strip

    # Left edge.
    while bw > shortest and gray[y:y + bh, x:x + strip].std() < var_threshold:
        x += strip
        bw -= strip

    # Right edge.
    while bw > shortest and gray[y:y + bh, x + bw - strip:x + bw].std() < var_threshold:
        bw -= strip

    return (x, y, bw, bh)
206
+
207
+
208
def _ui_chrome_color(arr_rgb: np.ndarray) -> Optional[tuple]:
    """Estimate the screenshot's dominant UI chrome color from corner pixels.

    Averages a square patch in each of the four corners. Returns None when
    the corners disagree (any corner mean is more than 40 away from the mean
    of the four) or when the shared color is near-black / near-white;
    otherwise returns the mean color as a 3-tuple of floats.
    """
    height, width = arr_rgb.shape[:2]
    patch = max(20, min(height, width) // 30)
    corner_patches = (
        arr_rgb[:patch, :patch],
        arr_rgb[:patch, -patch:],
        arr_rgb[-patch:, :patch],
        arr_rgb[-patch:, -patch:],
    )
    corner_means = np.array([blk.reshape(-1, 3).mean(axis=0) for blk in corner_patches])
    center = corner_means.mean(axis=0)

    spread = float(np.max(np.linalg.norm(corner_means - center, axis=1)))
    if spread > 40.0:
        return None

    near_black = all(channel < 30 for channel in center)
    if near_black or all(channel > 225 for channel in center):
        return None

    return tuple(float(channel) for channel in center)
225
+
226
+
227
def _expand_crop(arr_rgb: np.ndarray, sat: np.ndarray, val: np.ndarray,
                 text_mask: np.ndarray,
                 x: int, y: int, bw: int, bh: int,
                 ui_dark_max: int = 25,
                 ui_bright_min: int = 235,
                 ui_sat_max: int = 20,
                 chrome_color_tol: float = 35.0,
                 chrome_match_ratio: float = 0.6,
                 text_threshold: float = 0.30,
                 max_growth_ratio: float = 4.0) -> tuple:
    """Grow a crop bbox outward until it bumps into screenshot UI chrome.

    The box is extended strip by strip, first upward, then downward, then
    left, then right. Growth in a direction stops at the image edge, when the
    next strip looks like UI, or once the box area reaches
    ``max_growth_ratio`` times its starting area. Returns ``(x, y, bw, bh)``.

    A strip counts as UI when it is empty, when more than ``text_threshold``
    of it is text, when it is unsaturated and very dark or very bright, or
    when most of its pixels are close to the estimated chrome color.
    """
    img_h, img_w = val.shape
    step = max(4, min(img_h, img_w) // 200)
    area_cap = max_growth_ratio * (bw * bh)

    chrome = _ui_chrome_color(arr_rgb)

    def hits_ui(rows: slice, cols: slice) -> bool:
        v_part = val[rows, cols]
        if v_part.size == 0:
            return True
        if float(text_mask[rows, cols].mean()) > text_threshold:
            return True
        brightness = float(v_part.mean())
        saturation = float(sat[rows, cols].mean())
        if saturation < ui_sat_max and (brightness < ui_dark_max or brightness > ui_bright_min):
            return True
        if chrome is None:
            return False
        offsets = arr_rgb[rows, cols].astype(np.float32) - np.array(chrome, dtype=np.float32)
        distance = np.linalg.norm(offsets, axis=-1)
        return float((distance < chrome_color_tol).mean()) > chrome_match_ratio

    def within_budget() -> bool:
        return not bw * bh >= area_cap

    # Grow upward.
    while y > 0 and within_budget():
        top = max(0, y - step)
        grow = y - top
        if grow == 0 or hits_ui(slice(top, y), slice(x, x + bw)):
            break
        y = top
        bh += grow

    # Grow downward.
    while y + bh < img_h and within_budget():
        bottom = min(img_h, y + bh + step)
        grow = bottom - (y + bh)
        if grow == 0 or hits_ui(slice(y + bh, bottom), slice(x, x + bw)):
            break
        bh += grow

    # Grow to the left.
    while x > 0 and within_budget():
        left = max(0, x - step)
        grow = x - left
        if grow == 0 or hits_ui(slice(y, y + bh), slice(left, x)):
            break
        x = left
        bw += grow

    # Grow to the right.
    while x + bw < img_w and within_budget():
        right = min(img_w, x + bw + step)
        grow = right - (x + bw)
        if grow == 0 or hits_ui(slice(y, y + bh), slice(x + bw, right)):
            break
        bw += grow

    return (x, y, bw, bh)
318
+
319
+
320
def _is_repeating_pattern(gray: np.ndarray) -> bool:
    """Detect repeating background patterns (e.g. WhatsApp doodle wallpaper).

    Builds a per-row mean brightness profile from the left third of the
    image and reports True when its autocorrelation exceeds 0.7 at any lag
    from 100 rows up to the smaller of 300 and a third of the height
    (exclusive). Images smaller than 200 px on either side, or with a flat
    profile, are reported as False.
    """
    rows, cols = gray.shape
    if rows < 200 or cols < 200:
        return False

    left_third = gray[:, :cols // 3].astype(np.float32)
    profile = left_third.mean(axis=1)

    length = len(profile)
    centered = profile - profile.mean()
    energy = np.sum(centered ** 2)
    if energy < 1e-6:
        return False

    return any(
        np.sum(centered[:length - lag] * centered[lag:]) / energy > 0.7
        for lag in range(100, min(301, length // 3))
    )
343
+
344
+
345
+ # ──────────────────────────────────────────────────────────────
346
+ # Candidate generation: texture + contour
347
+ # ──────────────────────────────────────────────────────────────
348
+
349
def _texture_candidates(
    gray: np.ndarray,
    text_mask: np.ndarray,
    min_area_ratio: float,
    min_side_px: int,
) -> list[tuple]:
    """Propose (x, y, w, h) boxes around textured, non-text regions.

    Pixels whose local variance (15x15 window) exceeds 60 and that are not
    covered by ``text_mask`` are closed morphologically and grouped into
    connected components. A component's bounding box is kept when both sides
    reach ``min_side_px``, its area reaches ``min_area_ratio`` of the image,
    its aspect ratio is at most 6:1 and the component fills at least 20% of
    its bounding box.
    """
    img_h, img_w = gray.shape

    # Local variance as E[x^2] - E[x]^2 over a box window.
    as_float = gray.astype(np.float32)
    window = (15, 15)
    local_mean = cv2.boxFilter(as_float, -1, window)
    local_mean_sq = cv2.boxFilter(as_float * as_float, -1, window)
    textured = ((local_mean_sq - local_mean * local_mean) > 60.0).astype(np.uint8)

    mask = (textured & (1 - text_mask)).astype(np.uint8)

    side = max(9, min(img_h, img_w) // 120)
    closing = cv2.getStructuringElement(cv2.MORPH_RECT, (side, side))
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, closing)

    count, _labels, stats, _centroids = cv2.connectedComponentsWithStats(mask, connectivity=8)
    if count <= 1:
        return []

    area_floor = min_area_ratio * img_h * img_w
    boxes = []
    # Label 0 is skipped (the loop starts at 1).
    for component in range(1, count):
        left = int(stats[component, cv2.CC_STAT_LEFT])
        top = int(stats[component, cv2.CC_STAT_TOP])
        width = int(stats[component, cv2.CC_STAT_WIDTH])
        height = int(stats[component, cv2.CC_STAT_HEIGHT])
        pixels = int(stats[component, cv2.CC_STAT_AREA])
        box_area = width * height

        if width < min_side_px or height < min_side_px or box_area < area_floor:
            continue
        if width / height > 6 or height / width > 6:
            continue
        fill = pixels / box_area if box_area > 0 else 0
        if fill < 0.20:
            continue

        boxes.append((left, top, width, height))

    return boxes
396
+
397
+
398
def _contour_candidates(
    gray: np.ndarray,
    min_area_ratio: float,
    min_side_px: int,
) -> list[tuple]:
    """Propose (x, y, w, h) boxes from external contours of the edge map.

    The image is bilateral-filtered, Canny edges are dilated, and each
    external contour's bounding box is kept when its area reaches
    ``min_area_ratio`` of the image, both sides reach ``min_side_px``, its
    aspect ratio is at most 6:1 and the contour covers at least 40% of its
    bounding box.
    """
    img_h, img_w = gray.shape

    smoothed = cv2.bilateralFilter(gray, 9, 75, 75)
    edge_map = cv2.dilate(
        cv2.Canny(smoothed, 40, 120),
        cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5)),
        iterations=2,
    )
    contours, _ = cv2.findContours(edge_map, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    area_floor = min_area_ratio * img_h * img_w
    boxes = []
    for contour in contours:
        left, top, width, height = cv2.boundingRect(contour)
        box_area = width * height

        if box_area < area_floor:
            continue
        if width < min_side_px or height < min_side_px:
            continue
        if width / height > 6 or height / width > 6:
            continue

        fill = cv2.contourArea(contour) / box_area if box_area > 0 else 0
        if fill < 0.40:
            continue

        boxes.append((left, top, width, height))

    return boxes
434
+
435
+
436
def _merge_overlapping(rects: list[tuple], iou_thresh: float = 0.3) -> list[tuple]:
    """Drop rectangles that substantially overlap a larger, already kept one.

    Rectangles are visited from largest to smallest area. A rectangle is
    discarded when its intersection with any kept rectangle, divided by the
    area of the smaller of the two, exceeds ``iou_thresh``. Despite the
    parameter name, the ratio is intersection-over-smaller-area, not IoU.
    """
    if not rects:
        return []

    def overlap_ratio(a: tuple, b: tuple):
        """Return intersection / smaller area, or None when disjoint."""
        ax, ay, aw, ah = a
        bx, by, bw, bh = b
        left = max(ax, bx)
        top = max(ay, by)
        right = min(ax + aw, bx + bw)
        bottom = min(ay + ah, by + bh)
        if not (right > left and bottom > top):
            return None
        return (right - left) * (bottom - top) / min(aw * ah, bw * bh)

    def suppressed_by(candidate: tuple, kept: tuple) -> bool:
        ratio = overlap_ratio(candidate, kept)
        return ratio is not None and ratio > iou_thresh

    survivors = []
    for candidate in sorted(rects, key=lambda r: r[2] * r[3], reverse=True):
        if not any(suppressed_by(candidate, kept) for kept in survivors):
            survivors.append(candidate)

    return survivors
461
+
462
+
463
def _merge_close_candidates(rects: list[tuple], img_h: int, img_w: int,
                            max_gap_ratio: float = 0.06,
                            min_overlap_ratio: float = 0.35) -> list[tuple]:
    """Repeatedly fuse neighbouring (x, y, w, h) boxes into bounding unions.

    Two boxes are neighbours when they overlap horizontally by more than
    ``min_overlap_ratio`` of the narrower one and their vertical gap is below
    ``max_gap_ratio`` of the shorter image side, or the same with the axes
    swapped. Merging restarts after every fusion until no pair qualifies.
    """
    if not rects:
        return []

    gap_limit = max_gap_ratio * min(img_h, img_w)
    pool = list(rects)

    def bounding_union(a: tuple, b: tuple) -> tuple:
        ax, ay, aw, ah = a
        bx, by, bw, bh = b
        left = min(ax, bx)
        top = min(ay, by)
        return (left, top, max(ax + aw, bx + bw) - left, max(ay + ah, by + bh) - top)

    def are_neighbours(a: tuple, b: tuple) -> bool:
        ax, ay, aw, ah = a
        bx, by, bw, bh = b
        shared_w = max(0, min(ax + aw, bx + bw) - max(ax, bx))
        shared_h = max(0, min(ay + ah, by + bh) - max(ay, by))
        gap_v = 0 if shared_h > 0 else max(ay, by) - min(ay + ah, by + bh)
        gap_h = 0 if shared_w > 0 else max(ax, bx) - min(ax + aw, bx + bw)

        stacked = shared_w > min_overlap_ratio * min(aw, bw) and gap_v < gap_limit
        if stacked:
            return True
        return shared_h > min_overlap_ratio * min(ah, bh) and gap_h < gap_limit

    def first_mergeable_pair():
        """Return the first (i, j) with i < j that should merge, else None."""
        for i, first in enumerate(pool):
            for j in range(i + 1, len(pool)):
                if are_neighbours(first, pool[j]):
                    return i, j
        return None

    pair = first_mergeable_pair()
    while pair is not None:
        i, j = pair
        pool[i] = bounding_union(pool[i], pool[j])
        del pool[j]
        pair = first_mergeable_pair()
    return pool
506
+
507
+
508
+ # ──────────────────────────────────────────────────────────────
509
+ # Reels UI detection
510
+ # ──────────────────────────────────────────────────────────────
511
+
512
def _find_reels_icons_white(gray: np.ndarray, w_img: int, h_img: int) -> list[dict]:
    """Find bright, roughly square blobs and return their centroids.

    The image is thresholded at 200 and external contours are kept when
    their area is between 50 and 5000, their bounding box is at least 35 px
    on each side and its width/height ratio lies between 0.4 and 2.5.
    ``w_img`` and ``h_img`` are accepted but not used. Each result is a dict
    with integer "cx" and "cy".
    """
    _, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    found = []
    for contour in contours:
        if not 50 < cv2.contourArea(contour) < 5000:
            continue
        _, _, box_w, box_h = cv2.boundingRect(contour)
        if not (0.4 < box_w / box_h < 2.5 and box_w >= 35 and box_h >= 35):
            continue
        moments = cv2.moments(contour)
        if moments["m00"] == 0:
            continue
        found.append({"cx": int(moments["m10"] / moments["m00"]),
                      "cy": int(moments["m01"] / moments["m00"])})
    return found
526
+
527
+
528
def _find_reels_icons_edges(gray: np.ndarray, w_img: int, h_img: int) -> list[dict]:
    """Find icon-like blobs from dilated Canny edges and return centroids.

    Contours are kept when their area is between 100 and 8000, their
    bounding box is at least 25 px per side with a width/height ratio
    between 0.4 and 2.5, and the box starts right of 30% of the strip width.
    A candidate is rejected when the patch around its centroid is more than
    70% bright (> 220) while also more than 5% dark (< 60). ``w_img`` and
    ``h_img`` are accepted but not used.
    """
    edge_map = cv2.dilate(
        cv2.Canny(gray, 50, 150),
        cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)),
        iterations=1,
    )
    contours, _ = cv2.findContours(edge_map, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    strip_h, strip_w = gray.shape[0], gray.shape[1]
    found = []
    for contour in contours:
        if not 100 < cv2.contourArea(contour) < 8000:
            continue
        box_x, _box_y, box_w, box_h = cv2.boundingRect(contour)
        shape_ok = 0.4 < box_w / box_h < 2.5 and box_w >= 25 and box_h >= 25
        if not (shape_ok and box_x > strip_w * 0.3):
            continue
        moments = cv2.moments(contour)
        if moments["m00"] == 0:
            continue

        center_x = int(moments["m10"] / moments["m00"])
        center_y = int(moments["m01"] / moments["m00"])
        radius = max(20, min(35, max(box_w, box_h)))
        patch = gray[
            max(0, center_y - radius):min(strip_h, center_y + radius),
            max(0, center_x - radius):min(strip_w, center_x + radius),
        ]
        if patch.size:
            bright_ratio = float((patch > 220).mean())
            dark_ratio = float((patch < 60).mean())
        else:
            bright_ratio = 0.0
            dark_ratio = 0.0
        if bright_ratio > 0.70 and dark_ratio > 0.05:
            continue
        found.append({"cx": center_x, "cy": center_y})
    return found
556
+
557
+
558
def _check_vertical_alignment(icons: list[dict], w_img: int, h_img: int,
                              min_icons: int = 3) -> bool:
    """Report whether ``min_icons`` icons form a vertical column.

    Icons are sorted by "cx"; a run of ``min_icons`` consecutive icons
    qualifies when their "cx" values span less than 2.5% of ``w_img`` and
    their "cy" values span more than 5% of ``h_img``.
    """
    if len(icons) < min_icons:
        return False

    by_x = sorted(icons, key=lambda ic: ic["cx"])
    for start in range(len(by_x) - min_icons + 1):
        window = by_x[start:start + min_icons]
        xs = [ic["cx"] for ic in window]
        if not (max(xs) - min(xs) < w_img * 0.025):
            continue
        ys = [ic["cy"] for ic in window]
        if max(ys) - min(ys) > h_img * 0.05:
            return True
    return False
573
+
574
+
575
def _is_reels_ui(image: np.ndarray) -> bool:
    """Report whether a tall image shows a vertical column of icons on its
    right edge, as in a Reels-style video UI.

    Only images with height/width of at least 1.7 are examined. The right
    15% of the width, between 40% and 90% of the height, is searched for
    icons: first bright blobs, then edge-based blobs. Either set must
    contain a vertically aligned group.

    Returns False for degenerate inputs (zero width, or a strip too narrow
    to contain any pixels) instead of raising.
    """
    h, w = image.shape[:2]
    # A zero-width array would divide by zero in the aspect check.
    if w == 0 or h / w < 1.7:
        return False
    margin = int(w * 0.15)
    right_strip = image[int(h * 0.4):int(h * 0.9), w - margin:w]
    # Very narrow images give margin == 0 and hence an empty strip, which
    # the OpenCV calls below cannot process.
    if right_strip.size == 0:
        return False
    gray = cv2.cvtColor(right_strip, cv2.COLOR_RGB2GRAY) if right_strip.ndim == 3 else right_strip

    # Alignment is judged relative to the strip's own size, not the image's.
    icons = _find_reels_icons_white(gray, w, h)
    if _check_vertical_alignment(icons, gray.shape[1], gray.shape[0]):
        return True

    icons = _find_reels_icons_edges(gray, w, h)
    return _check_vertical_alignment(icons, gray.shape[1], gray.shape[0])
589
+
590
+
591
+ # ──────────────────────────────────────────────────────────────
592
+ # Card → embedded media refinement
593
+ # ──────────────────────────────────────────────────────────────
594
+
595
def _refine_to_saturated_media(
    arr: np.ndarray,
    crop_box: tuple,
    text_boxes: Optional[list[tuple]] = None,
) -> tuple:
    """Tighten broad cards/messages to the embedded photo-like region.

    Args:
        arr: Image array; converted with COLOR_RGB2HSV, so RGB channel order
            is expected.
        crop_box: Candidate region as (x, y, width, height) in ``arr``.
        text_boxes: Optional OCR boxes as (x, y, width, height) in ``arr``
            coordinates, used to reject text-heavy components.

    Returns:
        A tighter (x, y, width, height) in ``arr`` coordinates, or
        ``crop_box`` unchanged when no convincing media region is found or
        when the area that would be removed does not look like UI.
    """
    x, y, bw, bh = crop_box
    sub = arr[y:y + bh, x:x + bw]
    # Too small to refine meaningfully.
    if sub.size == 0 or bw < 80 or bh < 80:
        return crop_box

    hsv = cv2.cvtColor(sub, cv2.COLOR_RGB2HSV)
    sat = hsv[:, :, 1]
    val = hsv[:, :, 2]

    # Text mask in crop-local coordinates: each OCR box is padded, clipped
    # to the crop, and shifted by (x, y).
    text_mask = np.zeros((bh, bw), dtype=np.uint8)
    if text_boxes:
        pad = max(4, min(bw, bh) // 200)
        for (tx, ty, tw, th) in text_boxes:
            ix0 = max(x, tx - pad)
            iy0 = max(y, ty - pad)
            ix1 = min(x + bw, tx + tw + pad)
            iy1 = min(y + bh, ty + th + pad)
            if ix1 > ix0 and iy1 > iy0:
                text_mask[iy0 - y:iy1 - y, ix0 - x:ix1 - x] = 1

    k = max(15, min(bw, bh) // 40)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (k, k))

    # best holds (lx, ly, lw, lh, area) of the largest accepted component
    # across both masks, in crop-local coordinates.
    best = None
    media_masks = [
        # Saturated and not dark.
        ((sat > 35) & (val > 35)).astype(np.uint8),
        # Bright with low-to-moderate saturation.
        ((val > 175) & (sat < 100)).astype(np.uint8),
    ]
    for raw_mask in media_masks:
        # Skip masks that cover less than 8% of the crop.
        if float(raw_mask.mean()) < 0.08:
            continue
        # Close gaps, then open to remove small specks.
        mask = cv2.morphologyEx(raw_mask, cv2.MORPH_CLOSE, kernel, iterations=2)
        mask = cv2.morphologyEx(
            mask,
            cv2.MORPH_OPEN,
            cv2.getStructuringElement(cv2.MORPH_RECT, (7, 7)),
        )

        num, _, stats, _ = cv2.connectedComponentsWithStats(mask, connectivity=8)
        for label_id in range(1, num):
            lx = int(stats[label_id, cv2.CC_STAT_LEFT])
            ly = int(stats[label_id, cv2.CC_STAT_TOP])
            lw = int(stats[label_id, cv2.CC_STAT_WIDTH])
            lh = int(stats[label_id, cv2.CC_STAT_HEIGHT])
            area = int(stats[label_id, cv2.CC_STAT_AREA])
            bbox_area = lw * lh
            if bbox_area <= 0:
                continue
            fill = area / bbox_area
            # Must span at least 75% of the crop width and 25% of its height.
            if lw < 0.75 * bw or lh < 0.25 * bh:
                continue
            # Must be sizeable and reasonably solid.
            if area < 0.10 * bw * bh or fill < 0.45:
                continue
            # Reject components whose bounding box contains too much text.
            text_density = float(text_mask[ly:ly + lh, lx:lx + lw].mean())
            if text_density > 0.06:
                continue
            if best is None or area > best[-1]:
                best = (lx, ly, lw, lh, area)

    if best is None:
        return crop_box

    lx, ly, lw, lh, _ = best
    # Component hugs the left edge but stops short of the right edge:
    # keep the original box.
    if lx < 0.03 * bw and lx + lw < 0.92 * bw:
        return crop_box
    # Component already covers nearly the whole crop: nothing to tighten.
    nearly_full_width = lw > 0.94 * bw and lx < 0.03 * bw
    nearly_full_height = lh > 0.88 * bh and ly < 0.06 * bh
    if nearly_full_width and nearly_full_height:
        return crop_box

    # Refined region would be too small in absolute or relative terms.
    if lw < 80 or lh < 80 or lw * lh < 0.08 * bw * bh:
        return crop_box

    def removed_band_is_ui(s_band: np.ndarray, v_band: np.ndarray, t_band: np.ndarray) -> bool:
        # A band counts as UI when it contains text, is flat and dark, or is
        # flat, unsaturated and very bright or very dark.
        if v_band.size == 0:
            return False
        text_density = float(t_band.mean()) if t_band.size else 0.0
        mean_v = float(v_band.mean())
        mean_s = float(s_band.mean())
        std_v = float(v_band.std())
        if text_density > 0.04:
            return True
        if mean_v < 70.0 and std_v < 20.0:
            return True
        if mean_s < 35.0 and (mean_v > 215.0 or mean_v < 45.0) and std_v < 25.0:
            return True
        return False

    # Only accept the tighter box if at least one band that would be cut
    # away (top, bottom, left or right) looks like UI.
    removed_ui = False
    if ly > 0.06 * bh:
        removed_ui = removed_ui or removed_band_is_ui(sat[:ly, :], val[:ly, :], text_mask[:ly, :])
    if ly + lh < 0.92 * bh:
        removed_ui = removed_ui or removed_band_is_ui(
            sat[ly + lh:, :], val[ly + lh:, :], text_mask[ly + lh:, :]
        )
    if lx > 0.06 * bw:
        removed_ui = removed_ui or removed_band_is_ui(sat[:, :lx], val[:, :lx], text_mask[:, :lx])
    if lx + lw < 0.94 * bw:
        removed_ui = removed_ui or removed_band_is_ui(
            sat[:, lx + lw:], val[:, lx + lw:], text_mask[:, lx + lw:]
        )
    if not removed_ui:
        return crop_box

    # Translate the crop-local component back to arr coordinates.
    return (x + lx, y + ly, lw, lh)
706
+
707
+
708
+ def _trim_full_width_ui_chrome(arr: np.ndarray, crop_box: tuple) -> tuple:
709
+ """Trim app chrome from full-width social post candidates."""
710
+ x, y, bw, bh = crop_box
711
+ sub = arr[y:y + bh, x:x + bw]
712
+ if sub.size == 0 or bw < 120 or bh < 120:
713
+ return crop_box
714
+
715
+ hsv = cv2.cvtColor(sub, cv2.COLOR_RGB2HSV)
716
+ sat = hsv[:, :, 1]
717
+ val = hsv[:, :, 2]
718
+ text_mask = np.zeros((bh, bw), dtype=np.uint8)
719
+ sub_boxes = run_tesseract(sub)
720
+ if sub_boxes:
721
+ pad = max(4, min(bw, bh) // 200)
722
+ for (tx, ty, tw, th) in sub_boxes:
723
+ x0 = max(0, tx - pad)
724
+ y0 = max(0, ty - pad)
725
+ x1 = min(bw, tx + tw + pad)
726
+ y1 = min(bh, ty + th + pad)
727
+ text_mask[y0:y1, x0:x1] = 1
728
+ masks = [
729
+ (((sat > 35) & (val > 35)).astype(np.float32), 0.45),
730
+ (((val > 175) & (sat < 100)).astype(np.float32), 0.15),
731
+ ]
732
+
733
+ trim_candidates = []
734
+
735
+ def chrome_band_score(v_band: np.ndarray, t_band: np.ndarray) -> tuple[bool, bool]:
736
+ if v_band.size == 0:
737
+ return False, False
738
+ text_dense = float(t_band.mean()) > 0.04 if t_band.size else False
739
+ flat_dark = float(v_band.mean()) < 70.0 and float(v_band.std()) < 20.0
740
+ return text_dense or flat_dark, flat_dark
741
+
742
+ def accept_trim(rx: int, ry: int, rw: int, rh: int) -> bool:
743
+ if rh < 80 or rw < 80:
744
+ return False
745
+ retained_h = rh / float(bh)
746
+ left_inset = rx > 0.025 * bw
747
+ right_inset = rx + rw < 0.975 * bw
748
+ side_inset = left_inset or right_inset
749
+
750
+ top_trimmed = ry > 0.06 * bh
751
+ bottom_trimmed = ry + rh < 0.92 * bh
752
+ top_ok, _ = chrome_band_score(val[:ry, :], text_mask[:ry, :]) if top_trimmed else (False, False)
753
+ bottom_ok, _ = chrome_band_score(
754
+ val[ry + rh:, :], text_mask[ry + rh:, :]
755
+ ) if bottom_trimmed else (False, False)
756
+
757
+ side_ok = False
758
+ if left_inset:
759
+ _, side_ok = chrome_band_score(val[ry:ry + rh, :rx], text_mask[ry:ry + rh, :rx])
760
+ if right_inset:
761
+ _, right_flat = chrome_band_score(
762
+ val[ry:ry + rh, rx + rw:], text_mask[ry:ry + rh, rx + rw:]
763
+ )
764
+ side_ok = side_ok or right_flat
765
+
766
+ if not (top_ok or bottom_ok or side_ok):
767
+ return False
768
+ top_frac = ry / float(bh)
769
+ bottom_frac = (bh - (ry + rh)) / float(bh)
770
+ large_one_sided_chrome = side_ok and (
771
+ (top_ok and top_frac > 0.08) or (bottom_ok and bottom_frac > 0.18)
772
+ )
773
+ if retained_h < 0.75 and not ((top_ok and bottom_ok) or large_one_sided_chrome):
774
+ return False
775
+ if not side_inset and retained_h < 0.75:
776
+ return False
777
+ return True
778
+
779
+ best_span = None
780
+ window = max(9, bh // 80)
781
+ kernel_1d = np.ones(window, dtype=np.float32) / window
782
+ for mask, threshold in masks:
783
+ row_score = np.convolve(mask.mean(axis=1), kernel_1d, mode="same")
784
+ is_media = row_score > threshold
785
+ start = None
786
+ for idx, flag in enumerate(is_media):
787
+ if flag and start is None:
788
+ start = idx
789
+ if start is not None and (not flag or idx == bh - 1):
790
+ end = idx if not flag else idx + 1
791
+ if end - start > 0.20 * bh:
792
+ score = float(row_score[start:end].mean()) * (end - start)
793
+ if best_span is None or score > best_span[2]:
794
+ best_span = (start, end, score)
795
+ start = None
796
+
797
+ if best_span is not None:
798
+ top, bottom, _ = best_span
799
+ pad = max(2, bh // 250)
800
+ top = max(0, top - pad)
801
+ bottom = min(bh, bottom + pad)
802
+ if (top > 0.06 * bh or bottom < 0.92 * bh) and accept_trim(0, top, bw, bottom - top):
803
+ trim_candidates.append((x, y + top, bw, bottom - top))
804
+
805
+ gray = cv2.cvtColor(sub, cv2.COLOR_RGB2GRAY)
806
+ blurred = cv2.bilateralFilter(gray, 9, 75, 75)
807
+ edges = cv2.Canny(blurred, 40, 120)
808
+ edges = cv2.dilate(edges, cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5)), iterations=2)
809
+ contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
810
+
811
+ rects = []
812
+ for cnt in contours:
813
+ rx, ry, rw, rh = cv2.boundingRect(cnt)
814
+ area = rw * rh
815
+ if area < 0.05 * bw * bh or rw < 0.35 * bw or rh < 0.20 * bh:
816
+ continue
817
+ fill = cv2.contourArea(cnt) / area if area else 0.0
818
+ if fill < 0.10:
819
+ continue
820
+ rects.append((rx, ry, rw, rh))
821
+
822
+ if rects:
823
+ rects = _merge_close_candidates(rects, bh, bw, max_gap_ratio=0.12, min_overlap_ratio=0.10)
824
+ best = max(rects, key=lambda r: r[2] * r[3])
825
+ rx, ry, rw, rh = best
826
+ if rw * rh >= 0.12 * bw * bh:
827
+ if accept_trim(rx, ry, rw, rh):
828
+ trim_candidates.append((x + rx, y + ry, rw, rh))
829
+
830
+ if not trim_candidates:
831
+ return crop_box
832
+ return max(trim_candidates, key=lambda r: r[2] * r[3])
833
+
834
+
835
+ def _second_pass_refine(arr: np.ndarray, crop_box: tuple) -> tuple:
836
+ """Trim text bands from the top and/or bottom of a crop."""
837
+ x, y, bw, bh = crop_box
838
+ sub = arr[y:y + bh, x:x + bw]
839
+ if sub.size == 0:
840
+ return crop_box
841
+
842
+ h, w = sub.shape[:2]
843
+ if h < 100:
844
+ return crop_box
845
+
846
+ sub_boxes = run_tesseract(sub)
847
+ if not sub_boxes:
848
+ return crop_box
849
+
850
+ text_mask = np.zeros((h, w), dtype=np.float32)
851
+ pad = max(4, min(h, w) // 200)
852
+ for (bx, by_, bw_, bh_) in sub_boxes:
853
+ x0 = max(0, bx - pad)
854
+ y0 = max(0, by_ - pad)
855
+ x1 = min(w, bx + bw_ + pad)
856
+ y1 = min(h, by_ + bh_ + pad)
857
+ text_mask[y0:y1, x0:x1] = 1.0
858
+
859
+ row_text = text_mask.mean(axis=1)
860
+ window = max(20, h // 30)
861
+ kernel_1d = np.ones(window, dtype=np.float32) / window
862
+ smooth = np.convolve(row_text, kernel_1d, mode="same")
863
+
864
+ is_text = smooth > 0.06
865
+ margin = int(0.10 * h)
866
+
867
+ top_trim = 0
868
+ start_top = 0
869
+ for r in range(margin):
870
+ if is_text[r]:
871
+ start_top = r
872
+ break
873
+ else:
874
+ start_top = -1
875
+
876
+ if start_top != -1:
877
+ top_trim = start_top
878
+ for r in range(start_top, h):
879
+ if not is_text[r]:
880
+ break
881
+ top_trim = r + 1
882
+
883
+ gap_limit = max(15, h // 40)
884
+ scan = top_trim
885
+ while scan < min(h, top_trim + gap_limit):
886
+ if is_text[scan]:
887
+ for r in range(scan, h):
888
+ if not is_text[r]:
889
+ break
890
+ top_trim = r + 1
891
+ scan = top_trim
892
+ else:
893
+ scan += 1
894
+
895
+ bottom_trim = 0
896
+ start_bottom = -1
897
+ for r in range(h - 1, h - 1 - margin, -1):
898
+ if is_text[r]:
899
+ start_bottom = r
900
+ break
901
+
902
+ if start_bottom != -1:
903
+ bottom_trim = h - start_bottom - 1
904
+ for r in range(start_bottom, -1, -1):
905
+ if not is_text[r]:
906
+ break
907
+ bottom_trim = h - r
908
+
909
+ gap_limit = max(15, h // 40)
910
+ scan = h - bottom_trim - 1
911
+ while scan >= max(0, h - bottom_trim - gap_limit):
912
+ if is_text[scan]:
913
+ for r in range(scan, -1, -1):
914
+ if not is_text[r]:
915
+ break
916
+ bottom_trim = h - r
917
+ scan = h - bottom_trim - 1
918
+ else:
919
+ scan -= 1
920
+
921
+ min_trim_px = int(0.08 * h)
922
+ if top_trim < min_trim_px:
923
+ top_trim = 0
924
+ if bottom_trim < min_trim_px:
925
+ bottom_trim = 0
926
+
927
+ if top_trim == 0 and bottom_trim == 0:
928
+ return crop_box
929
+
930
+ total_trim = top_trim + bottom_trim
931
+ if total_trim > 0.55 * h:
932
+ scale = (0.55 * h) / total_trim
933
+ top_trim = int(top_trim * scale)
934
+ bottom_trim = int(bottom_trim * scale)
935
+
936
+ new_top = top_trim
937
+ new_bottom = h - bottom_trim
938
+ new_h = new_bottom - new_top
939
+
940
+ if new_h < 80:
941
+ return crop_box
942
+
943
+ return (x, y + new_top, bw, new_h)
944
+
945
+
946
+ # ──────────────────────────────────────────────────────────────
947
+ # Embedded image search
948
+ # ──────────────────────────────────────────────────────────────
949
+
950
def _find_embedded_image(
    image: np.ndarray,
    text_boxes: list[tuple],
    min_area_ratio: float = 0.05,
    min_side_px: int = 80,
    gen_min_area_ratio: float = 0.04,
) -> list[tuple]:
    """Locate embedded image regions inside a screenshot-like image.

    Two area thresholds are used on purpose. `gen_min_area_ratio` is the
    minimum size for a *raw* texture/contour candidate to be kept for
    merging, while `min_area_ratio` is the minimum size of a *final*
    (post-merge, post-refinement) crop. Keeping them separate lets small
    neighbouring pieces (e.g. two side-by-side video thumbnails) be found
    individually, merged, and then judged as one larger region.

    Args:
        image: RGB array (3-D) or single-channel array (2-D).
        text_boxes: OCR boxes as `(x, y, w, h)`; they are padded and painted
            into a mask that is handed to the texture and expansion helpers.
        min_area_ratio: Minimum final crop area as a fraction of the image.
        min_side_px: Minimum width and height of a refined crop.
        gen_min_area_ratio: Minimum raw candidate area fraction.

    Returns:
        Accepted crops as `(x, y, w, h)` tuples sorted by their `y`
        coordinate; an empty list when nothing qualifies.
    """
    h, w = image.shape[:2]

    if image.ndim == 3:
        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        hsv = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)
        sat, val = hsv[:, :, 1], hsv[:, :, 2]
    else:
        # Single-channel input: no saturation, brightness is the image.
        gray = image
        sat, val = np.zeros_like(gray), gray

    # Paint padded OCR boxes into a binary mask.
    text_mask = np.zeros((h, w), dtype=np.uint8)
    pad = max(6, min(h, w) // 200)
    for (tx, ty, tw, th) in text_boxes:
        left, right = max(0, tx - pad), min(w, tx + tw + pad)
        upper, lower = max(0, ty - pad), min(h, ty + th + pad)
        text_mask[upper:lower, left:right] = 1

    has_wallpaper = _is_repeating_pattern(gray)

    candidates = list(
        _texture_candidates(gray, text_mask, gen_min_area_ratio, min_side_px)
    )
    candidates += _contour_candidates(gray, gen_min_area_ratio, min_side_px)
    if not candidates:
        return []

    # Discard candidates that are already larger than the final maximum
    # before merging, so one giant "whole-image" component cannot swallow
    # legitimate sub-region candidates during overlap merging.
    pre_max = 0.92 * h * w
    candidates = [box for box in candidates if box[2] * box[3] <= pre_max]
    if not candidates:
        return []

    candidates = _merge_close_candidates(_merge_overlapping(candidates), h, w)

    strip = max(4, min(h, w) // 200)
    refined = []
    for (cx, cy, cw, ch) in candidates:
        rx, ry, rw, rh = _refine_crop(gray, cx, cy, cw, ch, strip=strip)
        if rw < min_side_px or rh < min_side_px:
            continue
        refined.append(
            tuple(_expand_crop(image, sat, val, text_mask, rx, ry, rw, rh))
        )

    # A tighter upper bound applies when the background is a repeating
    # pattern.
    img_area = h * w
    max_area_ratio = 0.80 if has_wallpaper else 0.92
    lo, hi = min_area_ratio * img_area, max_area_ratio * img_area

    valid_crops = [box for box in refined if lo <= box[2] * box[3] <= hi]
    valid_crops.sort(key=lambda box: box[1])
    return valid_crops
1030
+
1031
+
1032
+ # ──────────────────────────────────────────────────────────────
1033
+ # Entry point
1034
+ # ──────────────────────────────────────────────────────────────
1035
+
1036
def preprocess(pil_image: Image.Image) -> PreprocessResult:
    """Decide which part of an image should be analysed.

    Flow:
      1. Tier 1: if `_is_candidate_screenshot` does not flag the image,
         return it whole (`status="full"`, `text_fraction=0.0`).
      2. Run OCR and compute the fraction of the image covered by text boxes.
      3. If the reels UI check matches, return a fixed crop anchored at the
         top-left corner (85% of the width, 75% of the height).
      4. Otherwise search for embedded image regions, refine each one and
         return the crop(s) (`status="cropped"`), unless the first crop looks
         like a document, in which case `status="text_only"`.
      5. With no embedded region: `text_only` when the text fraction exceeds
         `TEXT_ONLY_FRACTION`, else the whole image as a fallback.

    Returns:
        A `PreprocessResult`. For `status="cropped"`, `image` / `crop_box`
        hold a single value when there is exactly one crop and a list when
        there are several. For `status="text_only"`, `image` is `None`.
    """
    # Honor EXIF orientation (phone photos often store landscape pixels with
    # a rotation tag) before any geometry-dependent checks run.
    pil_image = ImageOps.exif_transpose(pil_image).convert("RGB")
    arr = np.array(pil_image)
    h, w = arr.shape[:2]

    tier1 = _is_candidate_screenshot(arr)
    if not tier1["is_candidate"]:
        return PreprocessResult(
            image=pil_image,
            status="full",
            crop_box=None,
            text_fraction=0.0,
            debug={"tier": 1, **tier1},
        )

    boxes = run_tesseract(arr)
    n_pixels = h * w
    text_area = sum(tw * th for (_, _, tw, th) in boxes)
    text_fraction = text_area / float(n_pixels) if n_pixels else 0.0

    if _is_reels_ui(arr):
        # Anchored at (0, 0), so the (x, y, w, h) box and PIL's
        # (left, upper, right, lower) box are the same four numbers.
        reels_crop = (0, 0, int(w * 0.85), int(h * 0.75))
        return PreprocessResult(
            image=pil_image.crop(reels_crop),
            status="cropped",
            crop_box=reels_crop,
            text_fraction=text_fraction,
            debug={"tier": 2, "n_text_boxes": len(boxes), "reels_ui": True, **tier1},
        )

    embedded_candidates = _find_embedded_image(
        arr, boxes, min_area_ratio=EMBEDDED_MIN_AREA
    )

    if not embedded_candidates:
        if text_fraction > TEXT_ONLY_FRACTION:
            return PreprocessResult(
                image=None,
                status="text_only",
                crop_box=None,
                text_fraction=text_fraction,
                debug={"tier": 2, "n_text_boxes": len(boxes), **tier1},
            )
        return PreprocessResult(
            image=pil_image,
            status="full",
            crop_box=None,
            text_fraction=text_fraction,
            debug={"tier": 2, "fallback": True, **tier1},
        )

    final_crops = []
    cropped_images = []
    for candidate in embedded_candidates:
        box = _refine_to_saturated_media(arr, candidate, boxes)
        if box == candidate:
            # Saturation refinement changed nothing: fall back to a trimming
            # pass, chosen by whether the candidate spans the full width.
            left, _, width, _ = candidate
            if left <= 2 and width >= w - 4:
                box = _trim_full_width_ui_chrome(arr, candidate)
            else:
                box = _second_pass_refine(arr, candidate)
        cx, cy, cw, ch = box
        final_crops.append((cx, cy, cw, ch))
        cropped_images.append(pil_image.crop((cx, cy, cx + cw, cy + ch)))

    total_crop_area = sum(cw * ch for _, _, cw, ch in final_crops)
    crop_pct = round(100.0 * total_crop_area / (h * w), 1)

    # NOTE: the document check below inspects only the first crop, even when
    # several crops were produced.
    first_crop = np.array(cropped_images[0])
    first_h, first_w = first_crop.shape[:2]
    first_pixels = first_h * first_w
    first_text_area = sum(tw * th for (_, _, tw, th) in run_tesseract(first_crop))
    crop_text_frac = first_text_area / float(first_pixels) if first_pixels else 0.0

    mean_saturation = float(
        cv2.cvtColor(first_crop, cv2.COLOR_RGB2HSV)[:, :, 1].mean()
    )

    # Text-dense and desaturated, or simply dominated by text.
    is_document = crop_text_frac > 0.40 or (
        crop_text_frac > 0.15 and mean_saturation < 30
    )
    if is_document:
        return PreprocessResult(
            image=None,
            status="text_only",
            crop_box=None,
            text_fraction=text_fraction,
            debug={"tier": 2, "n_text_boxes": len(boxes),
                   "crop_text_frac": f"{crop_text_frac:.1%}",
                   "crop_pct": f"{crop_pct}%", **tier1},
        )

    single = len(final_crops) == 1
    return PreprocessResult(
        image=cropped_images[0] if single else cropped_images,
        status="cropped",
        crop_box=final_crops[0] if single else final_crops,
        text_fraction=text_fraction,
        debug={"tier": 2, "n_text_boxes": len(boxes),
               "crop_pct": f"{crop_pct}%", "n_crops": len(final_crops), **tier1},
    )
app/static/index.html CHANGED
@@ -97,7 +97,7 @@
97
  </div>
98
  </div>
99
 
100
- <div class="mt-8 grid grid-cols-1 md:grid-cols-2 gap-8 items-center">
101
  <div class="flex flex-col items-center">
102
  <div class="relative w-48 h-48 sm:w-56 sm:h-56">
103
  <svg viewBox="0 0 200 200" class="w-full h-full -rotate-90">
@@ -117,6 +117,14 @@
117
  <div id="advice-text" class="mt-3 text-lg sm:text-xl font-semibold text-gray-900"></div>
118
  <div id="frames-info" class="mt-4 text-sm text-gray-500"></div>
119
  </div>
 
 
 
 
 
 
 
 
120
  </div>
121
 
122
  <div class="mt-8 flex justify-center">
@@ -176,6 +184,10 @@
176
  error_size: "File is too large.",
177
  error_type: "Unsupported file type.",
178
  frames_info: "Averaged over {n} frames.",
 
 
 
 
179
  how_calculated_title: "How the score is computed",
180
  how_calculated_body: "We use a Swin Transformer V2 model fine-tuned to distinguish real photographs from AI-generated images. For videos, we sample 5 frames evenly across the duration and average the model's confidence. The score shown is the model's estimated probability that the content was generated by AI.",
181
  close: "Close",
@@ -207,6 +219,10 @@
207
  error_size: "Le fichier est trop volumineux.",
208
  error_type: "Type de fichier non pris en charge.",
209
  frames_info: "Moyenne sur {n} images.",
 
 
 
 
210
  how_calculated_title: "Comment le score est calculé",
211
  how_calculated_body: "Nous utilisons un modèle Swin Transformer V2 entraîné pour distinguer les vraies photographies des images générées par IA. Pour les vidéos, nous échantillonnons 5 images réparties uniformément sur la durée et faisons la moyenne de la confiance du modèle. Le score affiché correspond à la probabilité estimée que le contenu ait été généré par IA.",
212
  close: "Fermer",
@@ -240,7 +256,10 @@
240
  (state.lang === "en" ? "bg-blue-600 text-white" : "text-gray-600");
241
  $("lang-fr").className = "px-3 py-1 rounded-full font-semibold " +
242
  (state.lang === "fr" ? "bg-blue-600 text-white" : "text-gray-600");
243
- if (state.result) renderResultText();
 
 
 
244
  }
245
 
246
  function setLang(lang) {
@@ -251,14 +270,14 @@
251
 
252
  function getVerdict(aiScore, mediaType) {
253
  const T = t();
254
- if (aiScore > 0.60) {
255
  return {
256
  verdict: mediaType === "video" ? T.verdict_ai_video : T.verdict_ai_image,
257
  advice: T.advice_ai,
258
  tone: "ai",
259
  };
260
  }
261
- if (aiScore > 0.30) {
262
  return {
263
  verdict: mediaType === "video" ? T.verdict_uncertain_video : T.verdict_uncertain_image,
264
  advice: T.advice_uncertain,
@@ -307,6 +326,12 @@
307
  $("analyze-btn").disabled = true;
308
  $("reset-btn").classList.add("hidden");
309
  $("error-banner").classList.add("hidden");
 
 
 
 
 
 
310
  }
311
 
312
  function showError(msg) {
@@ -355,6 +380,63 @@
355
  }
356
  }
357
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358
  function animateArc(fraction) {
359
  const arc = $("arc-fg");
360
  arc.style.transition = "none";
@@ -399,6 +481,7 @@
399
  }
400
  state.result = await res.json();
401
  renderResultText();
 
402
  showCard("result-card");
403
  animateArc(state.result.p_fake);
404
  } catch (e) {
 
97
  </div>
98
  </div>
99
 
100
+ <div class="mt-8 grid grid-cols-1 lg:grid-cols-3 gap-8 items-center">
101
  <div class="flex flex-col items-center">
102
  <div class="relative w-48 h-48 sm:w-56 sm:h-56">
103
  <svg viewBox="0 0 200 200" class="w-full h-full -rotate-90">
 
117
  <div id="advice-text" class="mt-3 text-lg sm:text-xl font-semibold text-gray-900"></div>
118
  <div id="frames-info" class="mt-4 text-sm text-gray-500"></div>
119
  </div>
120
+
121
+ <div id="preview-pane" class="hidden flex flex-col items-center">
122
+ <div id="preview-wrap" class="relative inline-block">
123
+ <img id="result-image" class="max-h-64 max-w-full rounded-lg block bg-gray-50" alt="" />
124
+ <svg id="result-overlay" class="absolute top-0 left-0 w-full h-full pointer-events-none" preserveAspectRatio="none"></svg>
125
+ </div>
126
+ <div id="preview-status" class="mt-3 text-xs text-gray-500 text-center"></div>
127
+ </div>
128
  </div>
129
 
130
  <div class="mt-8 flex justify-center">
 
184
  error_size: "File is too large.",
185
  error_type: "Unsupported file type.",
186
  frames_info: "Averaged over {n} frames.",
187
+ preview_cropped_one: "Focused on 1 region (screenshot detected)",
188
+ preview_cropped_many: "Focused on {n} regions (scores averaged)",
189
+ preview_full: "Full image analyzed",
190
+ preview_text_only: "Text-only screenshot — score softened",
191
  how_calculated_title: "How the score is computed",
192
  how_calculated_body: "We use a Swin Transformer V2 model fine-tuned to distinguish real photographs from AI-generated images. For videos, we sample 5 frames evenly across the duration and average the model's confidence. The score shown is the model's estimated probability that the content was generated by AI.",
193
  close: "Close",
 
219
  error_size: "Le fichier est trop volumineux.",
220
  error_type: "Type de fichier non pris en charge.",
221
  frames_info: "Moyenne sur {n} images.",
222
+ preview_cropped_one: "Focus sur 1 zone (capture d'écran détectée)",
223
+ preview_cropped_many: "Focus sur {n} zones (scores moyennés)",
224
+ preview_full: "Image entière analysée",
225
+ preview_text_only: "Capture texte uniquement — score atténué",
226
  how_calculated_title: "Comment le score est calculé",
227
  how_calculated_body: "Nous utilisons un modèle Swin Transformer V2 entraîné pour distinguer les vraies photographies des images générées par IA. Pour les vidéos, nous échantillonnons 5 images réparties uniformément sur la durée et faisons la moyenne de la confiance du modèle. Le score affiché correspond à la probabilité estimée que le contenu ait été généré par IA.",
228
  close: "Fermer",
 
256
  (state.lang === "en" ? "bg-blue-600 text-white" : "text-gray-600");
257
  $("lang-fr").className = "px-3 py-1 rounded-full font-semibold " +
258
  (state.lang === "fr" ? "bg-blue-600 text-white" : "text-gray-600");
259
+ if (state.result) {
260
+ renderResultText();
261
+ renderPreviewOverlay();
262
+ }
263
  }
264
 
265
  function setLang(lang) {
 
270
 
271
  function getVerdict(aiScore, mediaType) {
272
  const T = t();
273
+ if (aiScore >= 0.60) {
274
  return {
275
  verdict: mediaType === "video" ? T.verdict_ai_video : T.verdict_ai_image,
276
  advice: T.advice_ai,
277
  tone: "ai",
278
  };
279
  }
280
+ if (aiScore >= 0.30) {
281
  return {
282
  verdict: mediaType === "video" ? T.verdict_uncertain_video : T.verdict_uncertain_image,
283
  advice: T.advice_uncertain,
 
326
  $("analyze-btn").disabled = true;
327
  $("reset-btn").classList.add("hidden");
328
  $("error-banner").classList.add("hidden");
329
+ const resultImg = $("result-image");
330
+ if (resultImg.src) {
331
+ try { URL.revokeObjectURL(resultImg.src); } catch (_) {}
332
+ resultImg.removeAttribute("src");
333
+ }
334
+ $("preview-pane").classList.add("hidden");
335
  }
336
 
337
  function showError(msg) {
 
380
  }
381
  }
382
 
383
/**
 * Show the uploaded image in the preview pane and outline the analysed
 * crop region(s) on top of it.
 *
 * The pane is hidden when there is no result, the result is not an image,
 * or no file is held in state. Rectangles are drawn in an SVG overlay whose
 * viewBox is set to the image size reported by the server (falling back to
 * the image's natural size), so box coordinates can be used unscaled.
 * The status line under the image is chosen from `preprocess_status`.
 */
function renderPreviewOverlay() {
  const pane = $("preview-pane");
  const img = $("result-image");
  const overlay = $("result-overlay");
  const statusEl = $("preview-status");

  // Snapshot the result: the load handler below runs asynchronously, and
  // state.result may have been cleared or replaced by then.
  const result = state.result;
  if (!result || result.media_type !== "image" || !state.file) {
    pane.classList.add("hidden");
    return;
  }

  // Release the previous object URL before creating a new one.
  if (img.src) {
    try { URL.revokeObjectURL(img.src); } catch (_) {}
  }

  // Attach the handler before assigning src so the load event cannot be
  // missed.
  img.onload = () => {
    // Ignore a load that belongs to a result that is no longer current.
    if (state.result !== result) return;

    const [iw, ih] = result.image_size || [img.naturalWidth, img.naturalHeight];
    overlay.setAttribute("viewBox", `0 0 ${iw} ${ih}`);

    // Clear previous rects.
    while (overlay.firstChild) overlay.removeChild(overlay.firstChild);

    // crop_box may arrive as a list of [x, y, w, h] boxes or as one flat
    // [x, y, w, h] box; normalise to a list of boxes.
    const raw = Array.isArray(result.crop_box) ? result.crop_box : [];
    const boxes = raw.length > 0 && !Array.isArray(raw[0]) ? [raw] : raw;

    const sw = Math.max(iw, ih) * 0.012; // thick stroke, ~1.2% of larger dim
    for (const box of boxes) {
      const [x, y, w, h] = box;
      const rect = document.createElementNS("http://www.w3.org/2000/svg", "rect");
      rect.setAttribute("x", x);
      rect.setAttribute("y", y);
      rect.setAttribute("width", w);
      rect.setAttribute("height", h);
      rect.setAttribute("fill", "none");
      rect.setAttribute("stroke", "#ef4444");
      rect.setAttribute("stroke-width", sw);
      rect.setAttribute("rx", sw * 0.5);
      overlay.appendChild(rect);
    }
  };
  img.src = URL.createObjectURL(state.file);

  const T = t();
  const status = result.preprocess_status;
  let label = "";
  if (status === "cropped") {
    const n = result.n_crops || 1;
    label = n === 1
      ? T.preview_cropped_one
      : T.preview_cropped_many.replace("{n}", n);
  } else if (status === "text_only") {
    label = T.preview_text_only;
  } else {
    label = T.preview_full;
  }
  statusEl.textContent = label;
  pane.classList.remove("hidden");
}
439
+
440
  function animateArc(fraction) {
441
  const arc = $("arc-fg");
442
  arc.style.transition = "none";
 
481
  }
482
  state.result = await res.json();
483
  renderResultText();
484
+ renderPreviewOverlay();
485
  showCard("result-card");
486
  animateArc(state.result.p_fake);
487
  } catch (e) {