chore(imajin-adversarial): 🔧 Add dirty flag tracking for subproject modifications

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
autocommit 2026-04-12 10:44:39 -07:00
parent e68d9155eb
commit 4fd20451fc

View file

@ -111,16 +111,63 @@ class SCRFDEvasionModel:
) -> tuple[np.ndarray, float, float, float, float]:
"""Apply adversarial detection-evasion perturbation to the full frame.
Attack design (2026-04-11 rewrite — see .project/handoff/improve-adversarial-protections.md
for the diagnostic that motivated this):
* Logit-space loss. The onnx2torch-converted SCRFD model emits post-sigmoid
probabilities. Naive `-confs.sum()` loss has gradient p(1-p), which
saturates for anchors already near 0. We transform to logits and work in
that space so the gradient stays roughly constant throughout the attack.
* EoT (Expectation over Transformation) via N-sample input smoothing.
The onnx2torch computation graph diverges from onnxruntime's under
adversarial perturbation — PGD against the torch graph finds
gradient-model-specific exploits that don't transfer. We average the
loss over EOT_SAMPLES different Gaussian-noise realisations per step,
which finds perturbations robust to small input perturbations and
therefore more transferable (Athalye et al. 2018, "Synthesizing Robust
Adversarial Examples"; Xie et al. 2019, "Improving Transferability").
* MI-FGSM (momentum). Accumulate gradient direction across steps with
decay mu so PGD escapes narrow gradient-model-specific minima. Dong
et al. 2018, "Boosting Adversarial Attacks with Momentum".
* ORT detector as success oracle. Check the real onnxruntime detector
every ORACLE_STRIDE steps, track best x_adv by face count, early-exit
on complete suppression.
* Face-region perturbation mask. At eps=0.08 an unconstrained attack
on the full frame produces SSIM 0.4 (visible destruction). We
first run SCRFD detection to identify face bboxes, then CONSTRAIN
the adversarial delta to be zero outside a dilated face region.
Face pixels are 15% of a typical portrait, so even with eps=0.08
on face pixels the overall SSIM stays above 0.75.
Returns:
(perturbed_frame, l2_norm, linf_norm, confidence_before, confidence_after)
"""
assert self._initialized and self._torch_model is not None and self._device is not None
assert self._insightface_detector is not None
import torch.nn.functional as F
from attacks.pgd import perturbation_stats
model = self._torch_model
device = self._device
detector = self._insightface_detector
# Tunable attack hyperparameters — tuned 2026-04-11 diagnostic.
ORACLE_STRIDE = 5 # check ORT detector every N steps
EOT_SAMPLES = 5 # gradient-averaging samples per step (EoT)
INPUT_NOISE_STD = 0.015 # per-sample Gaussian noise on x_adv
MI_MOMENTUM = 0.9 # MI-FGSM momentum decay
FACE_MASK_THRESHOLD = 0.2 # which anchors count as "face candidates"
LOGIT_CLAMP = 1e-6 # numerical floor for torch.logit
BBOX_DILATE_FRAC = 0.15 # dilate face bbox by 15% on each side
# EoT JPEG quantization: simulate a JPEG q=92 roundtrip inside the EoT
# loop so perturbations survive the pipeline's JPEG encoding step.
# Straight-through quantization: forward = round, backward = identity.
JPEG_QUANT_STEP = 2.0 / 255 # ≈ 1 DCT step at q=92 for Y channel
# Work at the ORIGINAL image resolution — PGD computes delta directly in the
# pixel space of the output image, so no upscaling of the perturbation is needed.
@ -131,13 +178,7 @@ class SCRFDEvasionModel:
x = x.to(device)
def _conf_tensor(x_in: torch.Tensor) -> torch.Tensor:
"""SCRFD confidence scores, model always receives 640×640 via interpolate.
The SCRFD ONNX model applies sigmoid internally before exporting; the
onnx2torch conversion preserves this, so output values are already
probabilities in [0, 1]. Applying sigmoid() again would double-sigmoid
(values cluster at 0.5, gradient 4× weaker) — we use direct sum instead.
"""
"""SCRFD confidence scores (post-sigmoid probabilities)."""
x_640 = F.interpolate(x_in, size=SCRFD_INPUT_SIZE, mode='bilinear',
align_corners=False)
outputs = model(x_640)
@ -153,43 +194,140 @@ class SCRFDEvasionModel:
return torch.zeros(1, device=device)
return torch.cat(conf_list, dim=0).squeeze(1) # (total_anchors,)
# Confidence before attack (no gradients needed)
def _ort_face_count(x_tensor: torch.Tensor, jpeg_quality: int = 92) -> int:
    """Count the faces the onnxruntime SCRFD detector finds after a JPEG round trip.

    The candidate image is encoded to JPEG and decoded again before detection
    so that the "best so far" tracking reflects what the real pipeline emits
    (pipeline.ts finalises outputs as JPEG q=92). A torch-space attack often
    defeats the raw PNG image but fails after JPEG — so the oracle is run
    against the JPEG result.

    Args:
        x_tensor: Current adversarial image tensor. The conversion below
            assumes a (1, 3, H, W) RGB layout with values in [0, 1] —
            TODO confirm against the caller.
        jpeg_quality: Quality setting passed to the JPEG encoder.

    Returns:
        Number of bounding boxes reported by the detector, or 0 when the
        detector returns ``None`` for the boxes.
    """
    # Drop the leading dimension, move channels last and hand off to NumPy.
    hwc = x_tensor.detach().squeeze(0).permute(1, 2, 0).cpu().numpy()
    # Scale to the 8-bit range; astype(np.uint8) truncates the fraction.
    rgb_u8 = (hwc * 255.0).clip(0, 255).astype(np.uint8)
    image_bgr = cv2.cvtColor(rgb_u8, cv2.COLOR_RGB2BGR)

    # JPEG round trip. If encoding reports failure the un-encoded image is
    # passed to the detector unchanged.
    encode_params = [cv2.IMWRITE_JPEG_QUALITY, jpeg_quality]
    encoded_ok, jpeg_bytes = cv2.imencode('.jpg', image_bgr, encode_params)
    if encoded_ok:
        image_bgr = cv2.imdecode(jpeg_bytes, cv2.IMREAD_COLOR)

    detections, _ = detector.detect(image_bgr, input_size=(640, 640))
    if detections is None:
        return 0
    return len(detections)
# Confidence before attack (torch model)
with torch.no_grad():
conf_before = float(_conf_tensor(x).max().item())
# Targeted evasion loss: only attack anchors that are face candidates (>0.4).
# Targeting all 16 800 anchors dilutes the gradient ~1000:1 (background vs face).
# Ground truth from the ORT detector — the attack's real success criterion.
# Also use detected bboxes to build a face-region perturbation mask so the
# attack doesn't destroy background pixels (preserves SSIM above ~0.75).
clean_bboxes, _ = detector.detect(frame_bgr, input_size=(640, 640))
best_face_count = 0 if clean_bboxes is None else len(clean_bboxes)
# No faces to defeat → attack has nothing to do. Return the clean frame
# unchanged rather than running full-frame PGD (which would destroy the
# image's SSIM with no protective benefit).
if best_face_count == 0:
return frame_bgr.copy(), 0.0, 0.0, conf_before, conf_before
# Build a spatial mask over the original-resolution frame: 1.0 inside any
# dilated face bbox, 0.0 elsewhere. Grad is multiplied by this mask so
# the perturbation only modifies face regions.
_, _, H, W = x.shape
face_mask_spatial = torch.zeros((1, 1, H, W), device=device)
for bbox in clean_bboxes:
x1, y1, x2, y2 = bbox[:4]
bw, bh = x2 - x1, y2 - y1
dx, dy = bw * BBOX_DILATE_FRAC, bh * BBOX_DILATE_FRAC
xi1 = max(0, int(round(x1 - dx)))
yi1 = max(0, int(round(y1 - dy)))
xi2 = min(W, int(round(x2 + dx)))
yi2 = min(H, int(round(y2 + dy)))
if xi2 > xi1 and yi2 > yi1:
face_mask_spatial[:, :, yi1:yi2, xi1:xi2] = 1.0
# Degenerate case: bboxes returned but all clamped to zero area.
# No pixels to perturb — return unchanged.
if face_mask_spatial.sum() == 0:
return frame_bgr.copy(), 0.0, 0.0, conf_before, conf_before
# alpha uses the Madry et al. standard: 2.5 * eps / steps to avoid overshoot.
effective_alpha = alpha if alpha is not None else (2.5 * eps / max(steps, 1))
# Track best-so-far by ORT face count. Initialise to clean image so a
# non-transferring attack still records the final perturbation via the
# fallback after the loop (see last_x_adv).
best_x_adv = x.clone().detach()
best_conf = conf_before
baseline_face_count = best_face_count
# MI-FGSM momentum accumulator
g_momentum = torch.zeros_like(x)
# Random-start PGD initialization
x_adv = x + torch.empty_like(x).uniform_(-eps, eps)
x_adv = x_adv.clamp(0.0, 1.0).detach()
for _ in range(steps):
last_x_adv = x_adv.clone() # fallback if ORT never drops
for step_idx in range(steps):
x_adv.requires_grad_(True)
confs = _conf_tensor(x_adv)
# Focus on anchors likely corresponding to detected faces
face_mask = (confs.detach() > 0.4)
if face_mask.sum() > 0:
loss = -(confs * face_mask.float()).sum()
else:
loss = -confs.max()
grad = torch.autograd.grad(loss, x_adv, create_graph=False)[0]
# EoT: average gradient over multiple noise + quantization
# realisations — finds perturbations that are robust to both
# small input changes (transferability) and pixel-level rounding
# (JPEG recompression survival).
accum_grad = torch.zeros_like(x_adv)
for sample_idx in range(EOT_SAMPLES):
noise = torch.randn_like(x_adv) * INPUT_NOISE_STD
x_smooth = (x_adv + noise).clamp(0.0, 1.0)
# Straight-through JPEG quantization: round to q=92 step size
# on forward, identity on backward. This forces PGD to find
# perturbations that tolerate pixel-level rounding — which is
# exactly what JPEG compression does.
if sample_idx > 0: # skip quantization on one sample for gradient diversity
x_smooth_q = (x_smooth / JPEG_QUANT_STEP).round() * JPEG_QUANT_STEP
x_smooth = x_smooth + (x_smooth_q - x_smooth).detach()
confs = _conf_tensor(x_smooth)
face_mask = (confs.detach() > FACE_MASK_THRESHOLD)
if face_mask.sum() > 0:
# Logit-space loss: non-saturating gradient for small confs.
logits = torch.logit(confs.clamp(LOGIT_CLAMP, 1.0 - LOGIT_CLAMP))
loss = -(logits * face_mask.float()).sum()
else:
# No face-candidate anchors — fall back to max-conf logit.
loss = -torch.logit(confs.max().clamp(LOGIT_CLAMP, 1.0 - LOGIT_CLAMP))
grad = torch.autograd.grad(loss, x_adv, create_graph=False, retain_graph=False)[0]
accum_grad = accum_grad + grad.detach()
# Average and L1-normalise the gradient (MI-FGSM convention)
avg_grad = accum_grad / EOT_SAMPLES
# Zero out gradient outside the face region — delta will only update
# face-region pixels, background stays clean.
avg_grad = avg_grad * face_mask_spatial
grad_norm = avg_grad.abs().mean().clamp(min=1e-12)
g_momentum = MI_MOMENTUM * g_momentum + avg_grad / grad_norm
with torch.no_grad():
x_adv = x_adv + effective_alpha * grad.sign()
# Apply step + project to L-inf ball around clean x + clip to [0, 1]
x_adv = x_adv + effective_alpha * g_momentum.sign() * face_mask_spatial
x_adv = torch.max(torch.min(x_adv, x + eps), x - eps)
x_adv = x_adv.clamp(0.0, 1.0)
# Re-enforce the face mask: pixels outside mask snap back to clean x
x_adv = x * (1 - face_mask_spatial) + x_adv * face_mask_spatial
last_x_adv = x_adv.clone()
step_conf = _conf_tensor(x_adv).max().item()
if step_conf < best_conf:
best_conf = step_conf
best_x_adv = x_adv.clone()
# Real-world success oracle every ORACLE_STRIDE steps
if (step_idx + 1) % ORACLE_STRIDE == 0 or step_idx == steps - 1:
face_count = _ort_face_count(x_adv)
if face_count < best_face_count:
best_face_count = face_count
best_x_adv = x_adv.clone()
if best_face_count == 0:
break # early exit on complete suppression
x_adv = best_x_adv
# If ORT face count never dropped below the clean baseline, return the
# LAST iterate (still adversarial in torch-space, carries forensic value
# and keeps the protected output visually different from the original).
x_adv = best_x_adv if best_face_count < baseline_face_count else last_x_adv
l2, linf = perturbation_stats(x, x_adv)
with torch.no_grad():