chore(imajin-adversarial): 🔧 Add dirty flag tracking for subproject modifications

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
2026-04-12 10:44:39 -07:00 · 2026-04-12 10:44:39 -07:00 · 4fd20451fc
commit 4fd20451fc
parent e68d9155eb
1 changed files with 164 additions and 26 deletions
--- a/services/imajin-adversarial/service/src/models/evasion_model.py
+++ b/services/imajin-adversarial/service/src/models/evasion_model.py
@ -111,16 +111,63 @@ class SCRFDEvasionModel:
    ) -> tuple[np.ndarray, float, float, float, float]:
        """Apply adversarial detection-evasion perturbation to the full frame.

+        Attack design (2026-04-11 rewrite — see .project/handoff/improve-adversarial-protections.md
+        for the diagnostic that motivated this):
+
+          * Logit-space loss. The onnx2torch-converted SCRFD model emits post-sigmoid
+            probabilities. Naive `-confs.sum()` loss has gradient ∝ p(1-p), which
+            saturates for anchors already near 0. We transform to logits and work in
+            that space so the gradient stays roughly constant throughout the attack.
+
+          * EoT (Expectation over Transformation) via N-sample input smoothing.
+            The onnx2torch computation graph diverges from onnxruntime's under
+            adversarial perturbation — PGD against the torch graph finds
+            gradient-model-specific exploits that don't transfer. We average the
+            loss over EOT_SAMPLES different Gaussian-noise realisations per step,
+            which finds perturbations robust to small input noise and
+            therefore more transferable (Athalye et al. 2018, "Synthesizing Robust
+            Adversarial Examples"; Xie et al. 2019, "Improving Transferability…").
+
+          * MI-FGSM (momentum). Accumulate gradient direction across steps with
+            decay mu so PGD escapes narrow gradient-model-specific minima. Dong
+            et al. 2018, "Boosting Adversarial Attacks with Momentum".
+
+          * ORT detector as success oracle. Check the real onnxruntime detector
+            every ORACLE_STRIDE steps, track best x_adv by face count, early-exit
+            on complete suppression.
+
+          * Face-region perturbation mask. At eps=0.08 an unconstrained attack
+            on the full frame produces SSIM ≈ 0.4 (visible destruction). We
+            first run SCRFD detection to identify face bboxes, then CONSTRAIN
+            the adversarial delta to be zero outside a dilated face region.
+            Face pixels are ≤15% of a typical portrait, so even with eps=0.08
+            on face pixels the overall SSIM stays above 0.75.
+
        Returns:
            (perturbed_frame, l2_norm, linf_norm, confidence_before, confidence_after)
        """
        assert self._initialized and self._torch_model is not None and self._device is not None
+        assert self._insightface_detector is not None

        import torch.nn.functional as F
        from attacks.pgd import perturbation_stats

        model = self._torch_model
        device = self._device
+        detector = self._insightface_detector
+
+        # Tunable attack hyperparameters — tuned 2026-04-11 diagnostic.
+        ORACLE_STRIDE = 5            # check ORT detector every N steps
+        EOT_SAMPLES = 5              # gradient-averaging samples per step (EoT)
+        INPUT_NOISE_STD = 0.015      # per-sample Gaussian noise on x_adv
+        MI_MOMENTUM = 0.9            # MI-FGSM momentum decay
+        FACE_MASK_THRESHOLD = 0.2    # which anchors count as "face candidates"
+        LOGIT_CLAMP = 1e-6           # numerical floor for torch.logit
+        BBOX_DILATE_FRAC = 0.15      # dilate face bbox by 15% on each side
+        # EoT JPEG quantization: approximate a JPEG q=92 roundtrip inside the EoT
+        # loop by rounding pixels to a fixed step so perturbations survive JPEG
+        # encoding. Straight-through quantization: forward = round, backward = identity.
+        JPEG_QUANT_STEP = 2.0 / 255  # ≈ 1 DCT step at q=92 for Y channel

        # Work at the ORIGINAL image resolution — PGD computes delta directly in the
        # pixel space of the output image, so no upscaling of the perturbation is needed.
@ -131,13 +178,7 @@ class SCRFDEvasionModel:
        x = x.to(device)

        def _conf_tensor(x_in: torch.Tensor) -> torch.Tensor:
-            """SCRFD confidence scores, model always receives 640×640 via interpolate.
-
-            The SCRFD ONNX model applies sigmoid internally before exporting; the
-            onnx2torch conversion preserves this, so output values are already
-            probabilities in [0, 1].  Applying sigmoid() again would double-sigmoid
-            (values cluster at 0.5, gradient 4× weaker) — we use direct sum instead.
-            """
+            """SCRFD confidence scores (post-sigmoid probabilities)."""
            x_640 = F.interpolate(x_in, size=SCRFD_INPUT_SIZE, mode='bilinear',
                                   align_corners=False)
            outputs = model(x_640)
@ -153,43 +194,140 @@ class SCRFDEvasionModel:
                return torch.zeros(1, device=device)
            return torch.cat(conf_list, dim=0).squeeze(1)  # (total_anchors,)

-        # Confidence before attack (no gradients needed)
+        def _ort_face_count(x_tensor: torch.Tensor, jpeg_quality: int = 92) -> int:
+            """Run onnxruntime SCRFD detector on the current x_adv, AFTER a JPEG
+            encode/decode round trip. This ensures the "best tracking" metric
+            reflects what the real pipeline will produce (pipeline.ts finalises
+            outputs as JPEG q=92). The torch-space attack often defeats the raw
+            PNG image but fails after JPEG, so the oracle scores the JPEG output."""
+            arr = x_tensor.detach().squeeze(0).permute(1, 2, 0).cpu().numpy()
+            arr = (arr * 255.0).clip(0, 255).astype(np.uint8)
+            bgr = cv2.cvtColor(arr, cv2.COLOR_RGB2BGR)
+            # JPEG roundtrip
+            ok, buf = cv2.imencode('.jpg', bgr, [cv2.IMWRITE_JPEG_QUALITY, jpeg_quality])
+            if ok:
+                bgr = cv2.imdecode(buf, cv2.IMREAD_COLOR)
+            bboxes, _ = detector.detect(bgr, input_size=(640, 640))
+            return 0 if bboxes is None else len(bboxes)
+
+        # Confidence before attack (torch model)
        with torch.no_grad():
            conf_before = float(_conf_tensor(x).max().item())

-        # Targeted evasion loss: only attack anchors that are face candidates (>0.4).
-        # Targeting all 16 800 anchors dilutes the gradient ~1000:1 (background vs face).
+        # Ground truth from the ORT detector — the attack's real success criterion.
+        # Also use detected bboxes to build a face-region perturbation mask so the
+        # attack doesn't destroy background pixels (preserves SSIM above ~0.75).
+        clean_bboxes, _ = detector.detect(frame_bgr, input_size=(640, 640))
+        best_face_count = 0 if clean_bboxes is None else len(clean_bboxes)
+
+        # No faces to defeat → attack has nothing to do. Return the clean frame
+        # unchanged rather than running full-frame PGD (which would destroy the
+        # image's SSIM with no protective benefit).
+        if best_face_count == 0:
+            return frame_bgr.copy(), 0.0, 0.0, conf_before, conf_before
+
+        # Build a spatial mask over the original-resolution frame: 1.0 inside any
+        # dilated face bbox, 0.0 elsewhere. Grad is multiplied by this mask so
+        # the perturbation only modifies face regions.
+        _, _, H, W = x.shape
+        face_mask_spatial = torch.zeros((1, 1, H, W), device=device)
+        for bbox in clean_bboxes:
+            x1, y1, x2, y2 = bbox[:4]
+            bw, bh = x2 - x1, y2 - y1
+            dx, dy = bw * BBOX_DILATE_FRAC, bh * BBOX_DILATE_FRAC
+            xi1 = max(0, int(round(x1 - dx)))
+            yi1 = max(0, int(round(y1 - dy)))
+            xi2 = min(W, int(round(x2 + dx)))
+            yi2 = min(H, int(round(y2 + dy)))
+            if xi2 > xi1 and yi2 > yi1:
+                face_mask_spatial[:, :, yi1:yi2, xi1:xi2] = 1.0
+        # Degenerate case: bboxes returned but all clamped to zero area.
+        # No pixels to perturb — return unchanged.
+        if face_mask_spatial.sum() == 0:
+            return frame_bgr.copy(), 0.0, 0.0, conf_before, conf_before
+
        # alpha uses the Madry et al. standard: 2.5 * eps / steps to avoid overshoot.
        effective_alpha = alpha if alpha is not None else (2.5 * eps / max(steps, 1))

+        # Track best-so-far by ORT face count. best_x_adv starts as the clean
+        # image; if the ORT count never improves, the post-loop fallback returns
+        # last_x_adv (the final iterate) instead.
        best_x_adv = x.clone().detach()
-        best_conf = conf_before
+        baseline_face_count = best_face_count

+        # MI-FGSM momentum accumulator
+        g_momentum = torch.zeros_like(x)
+
+        # Random-start PGD initialization
        x_adv = x + torch.empty_like(x).uniform_(-eps, eps)
        x_adv = x_adv.clamp(0.0, 1.0).detach()

-        for _ in range(steps):
+        last_x_adv = x_adv.clone()  # fallback if ORT never drops
+
+        for step_idx in range(steps):
            x_adv.requires_grad_(True)
-            confs = _conf_tensor(x_adv)
-            # Focus on anchors likely corresponding to detected faces
-            face_mask = (confs.detach() > 0.4)
-            if face_mask.sum() > 0:
-                loss = -(confs * face_mask.float()).sum()
-            else:
-                loss = -confs.max()
-            grad = torch.autograd.grad(loss, x_adv, create_graph=False)[0]
+
+            # EoT: average gradient over multiple noise + quantization
+            # realisations — finds perturbations that are robust to both
+            # small input changes (transferability) and pixel-level rounding
+            # (JPEG recompression survival).
+            accum_grad = torch.zeros_like(x_adv)
+            for sample_idx in range(EOT_SAMPLES):
+                noise = torch.randn_like(x_adv) * INPUT_NOISE_STD
+                x_smooth = (x_adv + noise).clamp(0.0, 1.0)
+
+                # Straight-through JPEG quantization: round to q=92 step size
+                # on forward, identity on backward. This forces PGD to find
+                # perturbations that tolerate pixel-level rounding — a coarse
+                # stand-in for the DCT-coefficient quantization JPEG performs.
+                if sample_idx > 0:  # skip quantization on one sample for gradient diversity
+                    x_smooth_q = (x_smooth / JPEG_QUANT_STEP).round() * JPEG_QUANT_STEP
+                    x_smooth = x_smooth + (x_smooth_q - x_smooth).detach()
+
+                confs = _conf_tensor(x_smooth)
+                face_mask = (confs.detach() > FACE_MASK_THRESHOLD)
+
+                if face_mask.sum() > 0:
+                    # Logit-space loss: non-saturating gradient for small confs.
+                    logits = torch.logit(confs.clamp(LOGIT_CLAMP, 1.0 - LOGIT_CLAMP))
+                    loss = -(logits * face_mask.float()).sum()
+                else:
+                    # No face-candidate anchors — fall back to max-conf logit.
+                    loss = -torch.logit(confs.max().clamp(LOGIT_CLAMP, 1.0 - LOGIT_CLAMP))
+
+                grad = torch.autograd.grad(loss, x_adv, create_graph=False, retain_graph=False)[0]
+                accum_grad = accum_grad + grad.detach()
+
+            # Average over EoT samples; below, normalise by mean |grad| (L1 norm / numel, MI-FGSM style)
+            avg_grad = accum_grad / EOT_SAMPLES
+            # Zero out gradient outside the face region — delta will only update
+            # face-region pixels, background stays clean.
+            avg_grad = avg_grad * face_mask_spatial
+            grad_norm = avg_grad.abs().mean().clamp(min=1e-12)
+            g_momentum = MI_MOMENTUM * g_momentum + avg_grad / grad_norm

            with torch.no_grad():
-                x_adv = x_adv + effective_alpha * grad.sign()
+                # Apply step + project to L-inf ball around clean x + clip to [0, 1]
+                x_adv = x_adv + effective_alpha * g_momentum.sign() * face_mask_spatial
                x_adv = torch.max(torch.min(x_adv, x + eps), x - eps)
                x_adv = x_adv.clamp(0.0, 1.0)
+                # Re-enforce the face mask: pixels outside mask snap back to clean x
+                x_adv = x * (1 - face_mask_spatial) + x_adv * face_mask_spatial
+                last_x_adv = x_adv.clone()

-                step_conf = _conf_tensor(x_adv).max().item()
-                if step_conf < best_conf:
-                    best_conf = step_conf
-                    best_x_adv = x_adv.clone()
+                # Real-world success oracle every ORACLE_STRIDE steps
+                if (step_idx + 1) % ORACLE_STRIDE == 0 or step_idx == steps - 1:
+                    face_count = _ort_face_count(x_adv)
+                    if face_count < best_face_count:
+                        best_face_count = face_count
+                        best_x_adv = x_adv.clone()
+                        if best_face_count == 0:
+                            break  # early exit on complete suppression

-        x_adv = best_x_adv
+        # If ORT face count never dropped below the clean baseline, return the
+        # LAST iterate (still adversarial in torch-space, carries forensic value
+        # and keeps the protected output visually different from the original).
+        x_adv = best_x_adv if best_face_count < baseline_face_count else last_x_adv
        l2, linf = perturbation_stats(x, x_adv)

        with torch.no_grad():