chore(imajin-adversarial): 🔧 Add dirty flag tracking for subproject modifications
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
parent
e68d9155eb
commit
4fd20451fc
1 changed files with 164 additions and 26 deletions
|
|
@ -111,16 +111,63 @@ class SCRFDEvasionModel:
|
|||
) -> tuple[np.ndarray, float, float, float, float]:
|
||||
"""Apply adversarial detection-evasion perturbation to the full frame.
|
||||
|
||||
Attack design (2026-04-11 rewrite — see .project/handoff/improve-adversarial-protections.md
|
||||
for the diagnostic that motivated this):
|
||||
|
||||
* Logit-space loss. The onnx2torch-converted SCRFD model emits post-sigmoid
|
||||
probabilities. Naive `-confs.sum()` loss has gradient ∝ p(1-p), which
|
||||
saturates for anchors already near 0. We transform to logits and work in
|
||||
that space so the gradient stays roughly constant throughout the attack.
|
||||
|
||||
* EoT (Expectation over Transformation) via N-sample input smoothing.
|
||||
The onnx2torch computation graph diverges from onnxruntime's under
|
||||
adversarial perturbation — PGD against the torch graph finds
|
||||
gradient-model-specific exploits that don't transfer. We average the
|
||||
loss over EOT_SAMPLES different Gaussian-noise realisations per step,
|
||||
which finds perturbations robust to small input perturbations and
|
||||
therefore more transferable (Athalye et al. 2018, "Synthesizing Robust
|
||||
Adversarial Examples"; Xie et al. 2019, "Improving Transferability…").
|
||||
|
||||
* MI-FGSM (momentum). Accumulate gradient direction across steps with
|
||||
decay mu so PGD escapes narrow gradient-model-specific minima. Dong
|
||||
et al. 2018, "Boosting Adversarial Attacks with Momentum".
|
||||
|
||||
* ORT detector as success oracle. Check the real onnxruntime detector
|
||||
every ORACLE_STRIDE steps, track best x_adv by face count, early-exit
|
||||
on complete suppression.
|
||||
|
||||
* Face-region perturbation mask. At eps=0.08 an unconstrained attack
|
||||
on the full frame produces SSIM ≈ 0.4 (visible destruction). We
|
||||
first run SCRFD detection to identify face bboxes, then CONSTRAIN
|
||||
the adversarial delta to be zero outside a dilated face region.
|
||||
Face pixels are ≤15% of a typical portrait, so even with eps=0.08
|
||||
on face pixels the overall SSIM stays above 0.75.
|
||||
|
||||
Returns:
|
||||
(perturbed_frame, l2_norm, linf_norm, confidence_before, confidence_after)
|
||||
"""
|
||||
assert self._initialized and self._torch_model is not None and self._device is not None
|
||||
assert self._insightface_detector is not None
|
||||
|
||||
import torch.nn.functional as F
|
||||
from attacks.pgd import perturbation_stats
|
||||
|
||||
model = self._torch_model
|
||||
device = self._device
|
||||
detector = self._insightface_detector
|
||||
|
||||
# Tunable attack hyperparameters — tuned 2026-04-11 diagnostic.
|
||||
ORACLE_STRIDE = 5 # check ORT detector every N steps
|
||||
EOT_SAMPLES = 5 # gradient-averaging samples per step (EoT)
|
||||
INPUT_NOISE_STD = 0.015 # per-sample Gaussian noise on x_adv
|
||||
MI_MOMENTUM = 0.9 # MI-FGSM momentum decay
|
||||
FACE_MASK_THRESHOLD = 0.2 # which anchors count as "face candidates"
|
||||
LOGIT_CLAMP = 1e-6 # numerical floor for torch.logit
|
||||
BBOX_DILATE_FRAC = 0.15 # dilate face bbox by 15% on each side
|
||||
# EoT JPEG quantization: simulate a JPEG q=92 roundtrip inside the EoT
|
||||
# loop so perturbations survive the pipeline's JPEG encoding step.
|
||||
# Straight-through quantization: forward = round, backward = identity.
|
||||
JPEG_QUANT_STEP = 2.0 / 255 # ≈ 1 DCT step at q=92 for Y channel
|
||||
|
||||
# Work at the ORIGINAL image resolution — PGD computes delta directly in the
|
||||
# pixel space of the output image, so no upscaling of the perturbation is needed.
|
||||
|
|
@ -131,13 +178,7 @@ class SCRFDEvasionModel:
|
|||
x = x.to(device)
|
||||
|
||||
def _conf_tensor(x_in: torch.Tensor) -> torch.Tensor:
|
||||
"""SCRFD confidence scores, model always receives 640×640 via interpolate.
|
||||
|
||||
The SCRFD ONNX model applies sigmoid internally before exporting; the
|
||||
onnx2torch conversion preserves this, so output values are already
|
||||
probabilities in [0, 1]. Applying sigmoid() again would double-sigmoid
|
||||
(values cluster at 0.5, gradient 4× weaker) — we use direct sum instead.
|
||||
"""
|
||||
"""SCRFD confidence scores (post-sigmoid probabilities)."""
|
||||
x_640 = F.interpolate(x_in, size=SCRFD_INPUT_SIZE, mode='bilinear',
|
||||
align_corners=False)
|
||||
outputs = model(x_640)
|
||||
|
|
@ -153,43 +194,140 @@ class SCRFDEvasionModel:
|
|||
return torch.zeros(1, device=device)
|
||||
return torch.cat(conf_list, dim=0).squeeze(1) # (total_anchors,)
|
||||
|
||||
# Confidence before attack (no gradients needed)
|
||||
def _ort_face_count(x_tensor: torch.Tensor, jpeg_quality: int = 92) -> int:
    """Count faces the detector finds in ``x_tensor`` after a JPEG round trip.

    The tensor is converted to an 8-bit BGR image, encoded to JPEG at
    ``jpeg_quality`` and decoded again before detection, so the count is
    measured on a recompressed image rather than on the raw tensor values.
    Per the original author's note, the downstream pipeline finalises outputs
    as JPEG q=92, and a torch-space attack that works on the lossless image
    often fails after that recompression.

    Args:
        x_tensor: Image tensor; squeezed on dim 0 and permuted (1, 2, 0)
            before conversion, then scaled by 255.
            NOTE(review): presumably (1, 3, H, W) RGB in [0, 1] — confirm.
        jpeg_quality: JPEG quality used for the encode/decode round trip.

    Returns:
        Number of bounding boxes returned by ``detector.detect``; 0 when the
        detector returns ``None`` for the boxes.
    """
    # Tensor -> HWC numpy array -> uint8 (values truncated, not rounded).
    rgb = x_tensor.detach().squeeze(0).permute(1, 2, 0).cpu().numpy()
    rgb_u8 = (rgb * 255.0).clip(0, 255).astype(np.uint8)
    bgr = cv2.cvtColor(rgb_u8, cv2.COLOR_RGB2BGR)

    # JPEG round trip. If encoding fails, detection deliberately proceeds on
    # the un-recompressed image instead of raising.
    encoded_ok, jpeg_buf = cv2.imencode(
        '.jpg', bgr, [cv2.IMWRITE_JPEG_QUALITY, jpeg_quality]
    )
    if encoded_ok:
        bgr = cv2.imdecode(jpeg_buf, cv2.IMREAD_COLOR)

    bboxes, _ = detector.detect(bgr, input_size=(640, 640))
    if bboxes is None:
        return 0
    return len(bboxes)
|
||||
|
||||
# Confidence before attack (torch model)
|
||||
with torch.no_grad():
|
||||
conf_before = float(_conf_tensor(x).max().item())
|
||||
|
||||
# Targeted evasion loss: only attack anchors that are face candidates (>0.4).
|
||||
# Targeting all 16 800 anchors dilutes the gradient ~1000:1 (background vs face).
|
||||
# Ground truth from the ORT detector — the attack's real success criterion.
|
||||
# Also use detected bboxes to build a face-region perturbation mask so the
|
||||
# attack doesn't destroy background pixels (preserves SSIM above ~0.75).
|
||||
clean_bboxes, _ = detector.detect(frame_bgr, input_size=(640, 640))
|
||||
best_face_count = 0 if clean_bboxes is None else len(clean_bboxes)
|
||||
|
||||
# No faces to defeat → attack has nothing to do. Return the clean frame
|
||||
# unchanged rather than running full-frame PGD (which would destroy the
|
||||
# image's SSIM with no protective benefit).
|
||||
if best_face_count == 0:
|
||||
return frame_bgr.copy(), 0.0, 0.0, conf_before, conf_before
|
||||
|
||||
# Build a spatial mask over the original-resolution frame: 1.0 inside any
|
||||
# dilated face bbox, 0.0 elsewhere. Grad is multiplied by this mask so
|
||||
# the perturbation only modifies face regions.
|
||||
_, _, H, W = x.shape
|
||||
face_mask_spatial = torch.zeros((1, 1, H, W), device=device)
|
||||
for bbox in clean_bboxes:
|
||||
x1, y1, x2, y2 = bbox[:4]
|
||||
bw, bh = x2 - x1, y2 - y1
|
||||
dx, dy = bw * BBOX_DILATE_FRAC, bh * BBOX_DILATE_FRAC
|
||||
xi1 = max(0, int(round(x1 - dx)))
|
||||
yi1 = max(0, int(round(y1 - dy)))
|
||||
xi2 = min(W, int(round(x2 + dx)))
|
||||
yi2 = min(H, int(round(y2 + dy)))
|
||||
if xi2 > xi1 and yi2 > yi1:
|
||||
face_mask_spatial[:, :, yi1:yi2, xi1:xi2] = 1.0
|
||||
# Degenerate case: bboxes returned but all clamped to zero area.
|
||||
# No pixels to perturb — return unchanged.
|
||||
if face_mask_spatial.sum() == 0:
|
||||
return frame_bgr.copy(), 0.0, 0.0, conf_before, conf_before
|
||||
|
||||
# alpha uses the Madry et al. standard: 2.5 * eps / steps to avoid overshoot.
|
||||
effective_alpha = alpha if alpha is not None else (2.5 * eps / max(steps, 1))
|
||||
|
||||
# Track best-so-far by ORT face count. Initialise to clean image so a
|
||||
# non-transferring attack still records the final perturbation via the
|
||||
# fallback after the loop (see last_x_adv).
|
||||
best_x_adv = x.clone().detach()
|
||||
best_conf = conf_before
|
||||
baseline_face_count = best_face_count
|
||||
|
||||
# MI-FGSM momentum accumulator
|
||||
g_momentum = torch.zeros_like(x)
|
||||
|
||||
# Random-start PGD initialization
|
||||
x_adv = x + torch.empty_like(x).uniform_(-eps, eps)
|
||||
x_adv = x_adv.clamp(0.0, 1.0).detach()
|
||||
|
||||
for _ in range(steps):
|
||||
last_x_adv = x_adv.clone() # fallback if ORT never drops
|
||||
|
||||
for step_idx in range(steps):
|
||||
x_adv.requires_grad_(True)
|
||||
confs = _conf_tensor(x_adv)
|
||||
# Focus on anchors likely corresponding to detected faces
|
||||
face_mask = (confs.detach() > 0.4)
|
||||
if face_mask.sum() > 0:
|
||||
loss = -(confs * face_mask.float()).sum()
|
||||
else:
|
||||
loss = -confs.max()
|
||||
grad = torch.autograd.grad(loss, x_adv, create_graph=False)[0]
|
||||
|
||||
# EoT: average gradient over multiple noise + quantization
|
||||
# realisations — finds perturbations that are robust to both
|
||||
# small input changes (transferability) and pixel-level rounding
|
||||
# (JPEG recompression survival).
|
||||
accum_grad = torch.zeros_like(x_adv)
|
||||
for sample_idx in range(EOT_SAMPLES):
|
||||
noise = torch.randn_like(x_adv) * INPUT_NOISE_STD
|
||||
x_smooth = (x_adv + noise).clamp(0.0, 1.0)
|
||||
|
||||
# Straight-through JPEG quantization: round to q=92 step size
|
||||
# on forward, identity on backward. This forces PGD to find
|
||||
# perturbations that tolerate pixel-level rounding — which is
|
||||
# exactly what JPEG compression does.
|
||||
if sample_idx > 0: # skip quantization on one sample for gradient diversity
|
||||
x_smooth_q = (x_smooth / JPEG_QUANT_STEP).round() * JPEG_QUANT_STEP
|
||||
x_smooth = x_smooth + (x_smooth_q - x_smooth).detach()
|
||||
|
||||
confs = _conf_tensor(x_smooth)
|
||||
face_mask = (confs.detach() > FACE_MASK_THRESHOLD)
|
||||
|
||||
if face_mask.sum() > 0:
|
||||
# Logit-space loss: non-saturating gradient for small confs.
|
||||
logits = torch.logit(confs.clamp(LOGIT_CLAMP, 1.0 - LOGIT_CLAMP))
|
||||
loss = -(logits * face_mask.float()).sum()
|
||||
else:
|
||||
# No face-candidate anchors — fall back to max-conf logit.
|
||||
loss = -torch.logit(confs.max().clamp(LOGIT_CLAMP, 1.0 - LOGIT_CLAMP))
|
||||
|
||||
grad = torch.autograd.grad(loss, x_adv, create_graph=False, retain_graph=False)[0]
|
||||
accum_grad = accum_grad + grad.detach()
|
||||
|
||||
# Average the gradient, then normalise by its mean absolute value (L1 norm scaled by 1/N; MI-FGSM convention)
|
||||
avg_grad = accum_grad / EOT_SAMPLES
|
||||
# Zero out gradient outside the face region — delta will only update
|
||||
# face-region pixels, background stays clean.
|
||||
avg_grad = avg_grad * face_mask_spatial
|
||||
grad_norm = avg_grad.abs().mean().clamp(min=1e-12)
|
||||
g_momentum = MI_MOMENTUM * g_momentum + avg_grad / grad_norm
|
||||
|
||||
with torch.no_grad():
|
||||
x_adv = x_adv + effective_alpha * grad.sign()
|
||||
# Apply step + project to L-inf ball around clean x + clip to [0, 1]
|
||||
x_adv = x_adv + effective_alpha * g_momentum.sign() * face_mask_spatial
|
||||
x_adv = torch.max(torch.min(x_adv, x + eps), x - eps)
|
||||
x_adv = x_adv.clamp(0.0, 1.0)
|
||||
# Re-enforce the face mask: pixels outside mask snap back to clean x
|
||||
x_adv = x * (1 - face_mask_spatial) + x_adv * face_mask_spatial
|
||||
last_x_adv = x_adv.clone()
|
||||
|
||||
step_conf = _conf_tensor(x_adv).max().item()
|
||||
if step_conf < best_conf:
|
||||
best_conf = step_conf
|
||||
best_x_adv = x_adv.clone()
|
||||
# Real-world success oracle every ORACLE_STRIDE steps
|
||||
if (step_idx + 1) % ORACLE_STRIDE == 0 or step_idx == steps - 1:
|
||||
face_count = _ort_face_count(x_adv)
|
||||
if face_count < best_face_count:
|
||||
best_face_count = face_count
|
||||
best_x_adv = x_adv.clone()
|
||||
if best_face_count == 0:
|
||||
break # early exit on complete suppression
|
||||
|
||||
x_adv = best_x_adv
|
||||
# If ORT face count never dropped below the clean baseline, return the
|
||||
# LAST iterate (still adversarial in torch-space, carries forensic value
|
||||
# and keeps the protected output visually different from the original).
|
||||
x_adv = best_x_adv if best_face_count < baseline_face_count else last_x_adv
|
||||
l2, linf = perturbation_stats(x, x_adv)
|
||||
|
||||
with torch.no_grad():
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue