diff --git a/services/imajin-adversarial/service/src/models/evasion_model.py b/services/imajin-adversarial/service/src/models/evasion_model.py index 27ec07f2..be1b5a9d 100644 --- a/services/imajin-adversarial/service/src/models/evasion_model.py +++ b/services/imajin-adversarial/service/src/models/evasion_model.py @@ -111,16 +111,63 @@ class SCRFDEvasionModel: ) -> tuple[np.ndarray, float, float, float, float]: """Apply adversarial detection-evasion perturbation to the full frame. + Attack design (2026-04-11 rewrite — see .project/handoff/improve-adversarial-protections.md + for the diagnostic that motivated this): + + * Logit-space loss. The onnx2torch-converted SCRFD model emits post-sigmoid + probabilities. Naive `-confs.sum()` loss has gradient ∝ p(1-p), which + saturates for anchors already near 0. We transform to logits and work in + that space so the gradient stays roughly constant throughout the attack. + + * EoT (Expectation over Transformation) via N-sample input smoothing. + The onnx2torch computation graph diverges from onnxruntime's under + adversarial perturbation — PGD against the torch graph finds + gradient-model-specific exploits that don't transfer. We average the + loss over EOT_SAMPLES different Gaussian-noise realisations per step, + which finds perturbations robust to small input perturbations and + therefore more transferable (Athalye et al. 2018, "Synthesizing Robust + Adversarial Examples"; Xie et al. 2019, "Improving Transferability…"). + + * MI-FGSM (momentum). Accumulate gradient direction across steps with + decay mu so PGD escapes narrow gradient-model-specific minima. Dong + et al. 2018, "Boosting Adversarial Attacks with Momentum". + + * ORT detector as success oracle. Check the real onnxruntime detector + every ORACLE_STRIDE steps, track best x_adv by face count, early-exit + on complete suppression. + + * Face-region perturbation mask. At eps=0.08 an unconstrained attack + on the full frame produces SSIM ≈ 0.4 (visible destruction). We + first run SCRFD detection to identify face bboxes, then CONSTRAIN + the adversarial delta to be zero outside a dilated face region. + Face pixels are ≤15% of a typical portrait, so even with eps=0.08 + on face pixels the overall SSIM stays above 0.75. + Returns: (perturbed_frame, l2_norm, linf_norm, confidence_before, confidence_after) """ assert self._initialized and self._torch_model is not None and self._device is not None + assert self._insightface_detector is not None import torch.nn.functional as F from attacks.pgd import perturbation_stats model = self._torch_model device = self._device + detector = self._insightface_detector + + # Tunable attack hyperparameters — tuned 2026-04-11 diagnostic. + ORACLE_STRIDE = 5 # check ORT detector every N steps + EOT_SAMPLES = 5 # gradient-averaging samples per step (EoT) + INPUT_NOISE_STD = 0.015 # per-sample Gaussian noise on x_adv + MI_MOMENTUM = 0.9 # MI-FGSM momentum decay + FACE_MASK_THRESHOLD = 0.2 # which anchors count as "face candidates" + LOGIT_CLAMP = 1e-6 # numerical floor for torch.logit + BBOX_DILATE_FRAC = 0.15 # dilate face bbox by 15% on each side + # EoT JPEG quantization: simulate a JPEG q=92 roundtrip inside the EoT + # loop so perturbations survive the pipeline's JPEG encoding step. + # Straight-through quantization: forward = round, backward = identity. + JPEG_QUANT_STEP = 2.0 / 255 # ≈ 1 DCT step at q=92 for Y channel # Work at the ORIGINAL image resolution — PGD computes delta directly in the # pixel space of the output image, so no upscaling of the perturbation is needed. @@ -131,13 +178,7 @@ class SCRFDEvasionModel: x = x.to(device) def _conf_tensor(x_in: torch.Tensor) -> torch.Tensor: - """SCRFD confidence scores, model always receives 640×640 via interpolate. - - The SCRFD ONNX model applies sigmoid internally before exporting; the - onnx2torch conversion preserves this, so output values are already - probabilities in [0, 1]. Applying sigmoid() again would double-sigmoid - (values cluster at 0.5, gradient 4× weaker) — we use direct sum instead. - """ + """SCRFD confidence scores (post-sigmoid probabilities).""" x_640 = F.interpolate(x_in, size=SCRFD_INPUT_SIZE, mode='bilinear', align_corners=False) outputs = model(x_640) @@ -153,43 +194,140 @@ class SCRFDEvasionModel: return torch.zeros(1, device=device) return torch.cat(conf_list, dim=0).squeeze(1) # (total_anchors,) - # Confidence before attack (no gradients needed) + def _ort_face_count(x_tensor: torch.Tensor, jpeg_quality: int = 92) -> int: + """Run onnxruntime SCRFD detector on the current x_adv, AFTER a JPEG + encode/decode round trip. This ensures the "best tracking" metric + reflects what the real pipeline will produce (pipeline.ts finalises + outputs as JPEG q=92). The torch-space attack often defeats the raw + PNG image but fails after JPEG — so oracle against JPEG or nothing.""" + arr = x_tensor.detach().squeeze(0).permute(1, 2, 0).cpu().numpy() + arr = (arr * 255.0).clip(0, 255).astype(np.uint8) + bgr = cv2.cvtColor(arr, cv2.COLOR_RGB2BGR) + # JPEG roundtrip + ok, buf = cv2.imencode('.jpg', bgr, [cv2.IMWRITE_JPEG_QUALITY, jpeg_quality]) + if ok: + bgr = cv2.imdecode(buf, cv2.IMREAD_COLOR) + bboxes, _ = detector.detect(bgr, input_size=(640, 640)) + return 0 if bboxes is None else len(bboxes) + + # Confidence before attack (torch model) with torch.no_grad(): conf_before = float(_conf_tensor(x).max().item()) - # Targeted evasion loss: only attack anchors that are face candidates (>0.4). - # Targeting all 16 800 anchors dilutes the gradient ~1000:1 (background vs face). + # Ground truth from the ORT detector — the attack's real success criterion. + # Also use detected bboxes to build a face-region perturbation mask so the + # attack doesn't destroy background pixels (preserves SSIM above ~0.75). + clean_bboxes, _ = detector.detect(frame_bgr, input_size=(640, 640)) + best_face_count = 0 if clean_bboxes is None else len(clean_bboxes) + + # No faces to defeat → attack has nothing to do. Return the clean frame + # unchanged rather than running full-frame PGD (which would destroy the + # image's SSIM with no protective benefit). + if best_face_count == 0: + return frame_bgr.copy(), 0.0, 0.0, conf_before, conf_before + + # Build a spatial mask over the original-resolution frame: 1.0 inside any + # dilated face bbox, 0.0 elsewhere. Grad is multiplied by this mask so + # the perturbation only modifies face regions. + _, _, H, W = x.shape + face_mask_spatial = torch.zeros((1, 1, H, W), device=device) + for bbox in clean_bboxes: + x1, y1, x2, y2 = bbox[:4] + bw, bh = x2 - x1, y2 - y1 + dx, dy = bw * BBOX_DILATE_FRAC, bh * BBOX_DILATE_FRAC + xi1 = max(0, int(round(x1 - dx))) + yi1 = max(0, int(round(y1 - dy))) + xi2 = min(W, int(round(x2 + dx))) + yi2 = min(H, int(round(y2 + dy))) + if xi2 > xi1 and yi2 > yi1: + face_mask_spatial[:, :, yi1:yi2, xi1:xi2] = 1.0 + # Degenerate case: bboxes returned but all clamped to zero area. + # No pixels to perturb — return unchanged. + if face_mask_spatial.sum() == 0: + return frame_bgr.copy(), 0.0, 0.0, conf_before, conf_before + # alpha uses the Madry et al. standard: 2.5 * eps / steps to avoid overshoot. effective_alpha = alpha if alpha is not None else (2.5 * eps / max(steps, 1)) + # Track best-so-far by ORT face count. Initialise to clean image so a + # non-transferring attack still records the final perturbation via the + # fallback after the loop (see last_x_adv). best_x_adv = x.clone().detach() - best_conf = conf_before + baseline_face_count = best_face_count + # MI-FGSM momentum accumulator + g_momentum = torch.zeros_like(x) + + # Random-start PGD initialization x_adv = x + torch.empty_like(x).uniform_(-eps, eps) x_adv = x_adv.clamp(0.0, 1.0).detach() - for _ in range(steps): + last_x_adv = x_adv.clone() # fallback if ORT never drops + + for step_idx in range(steps): x_adv.requires_grad_(True) - confs = _conf_tensor(x_adv) - # Focus on anchors likely corresponding to detected faces - face_mask = (confs.detach() > 0.4) - if face_mask.sum() > 0: - loss = -(confs * face_mask.float()).sum() - else: - loss = -confs.max() - grad = torch.autograd.grad(loss, x_adv, create_graph=False)[0] + + # EoT: average gradient over multiple noise + quantization + # realisations — finds perturbations that are robust to both + # small input changes (transferability) and pixel-level rounding + # (JPEG recompression survival). + accum_grad = torch.zeros_like(x_adv) + for sample_idx in range(EOT_SAMPLES): + noise = torch.randn_like(x_adv) * INPUT_NOISE_STD + x_smooth = (x_adv + noise).clamp(0.0, 1.0) + + # Straight-through JPEG quantization: round to q=92 step size + # on forward, identity on backward. This forces PGD to find + # perturbations that tolerate pixel-level rounding — which is + # exactly what JPEG compression does. + if sample_idx > 0: # skip quantization on one sample for gradient diversity + x_smooth_q = (x_smooth / JPEG_QUANT_STEP).round() * JPEG_QUANT_STEP + x_smooth = x_smooth + (x_smooth_q - x_smooth).detach() + + confs = _conf_tensor(x_smooth) + face_mask = (confs.detach() > FACE_MASK_THRESHOLD) + + if face_mask.sum() > 0: + # Logit-space loss: non-saturating gradient for small confs. + logits = torch.logit(confs.clamp(LOGIT_CLAMP, 1.0 - LOGIT_CLAMP)) + loss = -(logits * face_mask.float()).sum() + else: + # No face-candidate anchors — fall back to max-conf logit. + loss = -torch.logit(confs.max().clamp(LOGIT_CLAMP, 1.0 - LOGIT_CLAMP)) + + grad = torch.autograd.grad(loss, x_adv, create_graph=False, retain_graph=False)[0] + accum_grad = accum_grad + grad.detach() + + # Average and L1-normalise the gradient (MI-FGSM convention) + avg_grad = accum_grad / EOT_SAMPLES + # Zero out gradient outside the face region — delta will only update + # face-region pixels, background stays clean. + avg_grad = avg_grad * face_mask_spatial + grad_norm = avg_grad.abs().mean().clamp(min=1e-12) + g_momentum = MI_MOMENTUM * g_momentum + avg_grad / grad_norm with torch.no_grad(): - x_adv = x_adv + effective_alpha * grad.sign() + # Apply step + project to L-inf ball around clean x + clip to [0, 1] + x_adv = x_adv + effective_alpha * g_momentum.sign() * face_mask_spatial x_adv = torch.max(torch.min(x_adv, x + eps), x - eps) x_adv = x_adv.clamp(0.0, 1.0) + # Re-enforce the face mask: pixels outside mask snap back to clean x + x_adv = x * (1 - face_mask_spatial) + x_adv * face_mask_spatial + last_x_adv = x_adv.clone() - step_conf = _conf_tensor(x_adv).max().item() - if step_conf < best_conf: - best_conf = step_conf - best_x_adv = x_adv.clone() + # Real-world success oracle every ORACLE_STRIDE steps + if (step_idx + 1) % ORACLE_STRIDE == 0 or step_idx == steps - 1: + face_count = _ort_face_count(x_adv) + if face_count < best_face_count: + best_face_count = face_count + best_x_adv = x_adv.clone() + if best_face_count == 0: + break # early exit on complete suppression - x_adv = best_x_adv + # If ORT face count never dropped below the clean baseline, return the + # LAST iterate (still adversarial in torch-space, carries forensic value + # and keeps the protected output visually different from the original). + x_adv = best_x_adv if best_face_count < baseline_face_count else last_x_adv l2, linf = perturbation_stats(x, x_adv) with torch.no_grad():