tv-anarchy/recommender/recommend_local.py

#!/usr/bin/env python3
"""Local, TMDB-free recommendation engine — content-based on IMDb metadata.

Uses *show meta information* (IMDb genres + ratings) only — no API key.

Pipeline:
1. Build weighted seeds from the black registry + plum watchlog.
2. Resolve each seed to IMDb genres; build a weighted *genre fingerprint*
   of the user's taste (watch-engagement counts more than mere ownership).
3. Score the IMDb candidate universe (rating/votes-gated) by genre similarity
   to the fingerprint, boosted by rating and a recency nudge.
4. Drop owned titles. Emit a ranked discovery list (markdown + json).

Run:
    uv run python recommend_local.py --registry registry.md
"""

from __future__ import annotations

import argparse
import json
import math
import re
from collections import Counter, defaultdict
from dataclasses import dataclass
from pathlib import Path

from media_rec.imdb_data import ImdbIndex, build as build_imdb
from media_rec.sources import build_seeds, owned_keys

# Directory next to this file; it is handed to ``build_imdb`` when the IMDb
# index is constructed.
DATA_DIR = Path(__file__).parent / "data"

# Genres that are structural noise rather than taste signal. They are filtered
# out of both seed and candidate genre lists before any weighting or matching.
WEAK_GENRES = {"News", "Talk-Show", "Reality-TV", "Game-Show", "Short", "Adult"}


def _norm(title: str) -> str:
    """Return a comparison key for *title*.

    The key is lowercase and contains only alphanumeric characters separated
    by single spaces. Punctuation is dropped, and every run of whitespace
    (including tabs and non-breaking spaces) collapses to one space, so
    spellings that differ only in separators -- ``"A: B"`` vs ``"A - B"`` --
    produce the same key.
    """
    # Keep alphanumerics; turn any whitespace into a plain space; drop the rest.
    kept = "".join(
        c if c.isalnum() else " "
        for c in title.lower()
        if c.isalnum() or c.isspace()
    )
    # split()/join collapses runs of spaces and trims both ends in one step.
    return " ".join(kept.split())


@dataclass
class Rec:
    """One scored recommendation, built in ``run`` and serialized by ``write_reports``.

    ``run`` constructs instances positionally, so the field order below is
    part of the interface.
    """

    title: str  # display title as yielded by the IMDb candidate iterator
    year: int | None  # release year; reports render a missing year as "?"
    type: str  # title type as yielded by the candidate iterator
    genres: str  # raw comma-separated genre string (split on "," when scored)
    rating: float  # IMDb rating
    votes: int  # IMDb vote count
    score: float  # final ranking score (similarity times the shaping multiplier)
    matched: list[str]  # candidate genres that are present in the taste fingerprint


def _accumulate(seeds, imdb: ImdbIndex, watched_only: bool) -> tuple[dict[str, float], int]:
    """Sum seed weights per genre and count how many seeds contributed.

    Each seed is looked up in ``imdb`` by title and year. A seed contributes
    only if the lookup succeeds and at least one genre outside ``WEAK_GENRES``
    remains. Its weight is divided evenly across those genres so multi-genre
    shows don't dominate. With ``watched_only`` set, seeds whose
    ``watch_count`` is not positive are ignored before any lookup.

    Returns:
        ``(genre -> accumulated weight, number of contributing seeds)``.
    """
    totals: dict[str, float] = defaultdict(float)
    n_resolved = 0
    for seed in seeds:
        if watched_only and seed.watch_count <= 0:
            continue
        entry = imdb.lookup(seed.title, seed.year)
        if not entry:
            continue
        _, _, genre_csv = entry  # (rating, votes, genres); only genres matter here
        signal = [tag for tag in genre_csv.split(",") if tag and tag not in WEAK_GENRES]
        if signal:
            n_resolved += 1
            share = seed.weight / len(signal)
            for tag in signal:
                totals[tag] += share
    return totals, n_resolved


def _l2(vec: dict[str, float]) -> dict[str, float]:
    """Return *vec* scaled to unit Euclidean length.

    An empty or all-zero vector has no direction, so it yields ``{}``.
    """
    magnitude = math.sqrt(sum(x * x for x in vec.values()))
    if not magnitude:
        return {}
    return {key: x / magnitude for key, x in vec.items()}


def build_fingerprint(seeds, imdb: ImdbIndex, watched_alpha: float) -> dict[str, float]:
    """Blend an active-taste fingerprint (watchlog) with the library fingerprint.

    The library is eclectic and anime-heavy; weighting it alone buries the
    user's *active* taste. We L2-normalize each fingerprint separately, then
    blend ``watched_alpha`` of the watched-only taste with the remainder of
    the full-library taste.

    Args:
        seeds: Seed objects accepted by ``_accumulate``.
        imdb: Index used to resolve each seed to its genres.
        watched_alpha: Share of the watched-only fingerprint in the blend;
            the library fingerprint gets ``1 - watched_alpha``.

    Returns:
        The L2-normalized blended genre weights, or ``{}`` when nothing
        carries weight. Only genres with a non-zero blended weight are keys.

    Progress and the top genre weights are printed to stdout.
    """
    lib_raw, resolved = _accumulate(seeds, imdb, watched_only=False)
    watched_raw, _ = _accumulate(seeds, imdb, watched_only=True)
    lib_fp = _l2(lib_raw)
    watched_fp = _l2(watched_raw)

    blended: dict[str, float] = defaultdict(float)
    for g, w in lib_fp.items():
        blended[g] += (1.0 - watched_alpha) * w
    for g, w in watched_fp.items():
        blended[g] += watched_alpha * w
    # Drop genres whose blended weight is exactly zero (e.g. library-only
    # genres at watched_alpha=1.0). Callers test membership with
    # ``g in fingerprint``, so a zero-weight key would count as a genre match
    # while contributing nothing to the similarity.
    fingerprint = _l2({g: w for g, w in blended.items() if w})

    print(f"  resolved {resolved}/{len(seeds)} seeds to IMDb genres", flush=True)
    if watched_fp:
        wt = sorted(watched_fp.items(), key=lambda x: -x[1])[:6]
        print("  watched taste: " + ", ".join(f"{g} {w:.2f}" for g, w in wt), flush=True)
    top = sorted(fingerprint.items(), key=lambda x: -x[1])[:8]
    print(f"  blended fingerprint (α={watched_alpha}): "
          + ", ".join(f"{g} {w:.2f}" for g, w in top), flush=True)
    return fingerprint


def run(registry: Path, watchlog: Path, *, min_rating: float, min_votes: int,
        top_n: int, current_year: int, watched_alpha: float) -> list[Rec]:
    """Score the IMDb candidate universe against the user's taste fingerprint.

    Args:
        registry: Registry file passed to ``build_seeds`` and ``owned_keys``.
        watchlog: Watch log passed to ``build_seeds``.
        min_rating: Rating gate for candidates; also the baseline for the
            quality bonus.
        min_votes: Vote-count gate for candidates; also the baseline for the
            popularity bonus.
        top_n: Number of top-scoring recommendations to return.
        current_year: Reference year for the recency bonus (titles from the
            last six years qualify).
        watched_alpha: Blend factor forwarded to ``build_fingerprint``.

    Returns:
        The ``top_n`` highest-scoring ``Rec`` objects, best first.

    Raises:
        SystemExit: If no genre fingerprint could be built.
    """
    print("Building IMDb index ...", flush=True)
    imdb = ImdbIndex(build_imdb(DATA_DIR))

    print("Building seeds + taste fingerprint ...", flush=True)
    seeds = build_seeds(registry, watchlog)
    # Normalize owned keys with the SAME function used on candidates so
    # punctuation/case differences (e.g. "Frieren: Beyond Journey's End") match.
    owned = {_norm(t) for t in owned_keys(registry)}
    owned |= {_norm(s.title) for s in seeds}

    fingerprint = build_fingerprint(seeds, imdb, watched_alpha)
    if not fingerprint:
        raise SystemExit("Could not build a genre fingerprint — no seeds resolved.")

    # Loop invariants. log10 is undefined at 0 and a votes gate of 0 is a
    # legal setting, so clamp to 1 (log10(1) == 0) instead of raising
    # "math domain error".
    log_vote_floor = math.log10(max(min_votes, 1))
    recent_cutoff = current_year - 6

    print(f"Scoring candidate universe (rating>={min_rating}, votes>={min_votes}) ...", flush=True)
    recs: list[Rec] = []
    scanned = 0
    for display, year, ttype, genres, rating, votes in imdb.iter_candidates(min_rating, min_votes):
        scanned += 1
        if _norm(display) in owned:
            continue
        gs = [g for g in genres.split(",") if g and g not in WEAK_GENRES]
        if not gs:
            continue
        # Cosine-style similarity of candidate genre set to the fingerprint.
        matched = [g for g in gs if g in fingerprint]
        if len(matched) < 2:  # require real genre overlap, not a single tag
            continue
        # The candidate is a 0/1 genre vector, so its L2 norm is sqrt(#genres).
        cand_norm = math.sqrt(len(gs))
        sim = sum(fingerprint[g] for g in matched) / cand_norm
        # Quality + (gentle) popularity + recency shaping. Popularity is a small
        # noise-floor nudge above the votes gate, NOT a blockbuster magnet, so
        # genre-fit and rating drive ranking instead of raw fame.
        quality = (rating - min_rating) * 0.18
        popularity = max(0.0, math.log10(max(votes, 1)) - log_vote_floor) * 0.06
        recency = 0.15 if (year and year >= recent_cutoff) else 0.0
        score = sim * (1.0 + quality + popularity + recency)
        recs.append(Rec(display, year, ttype, genres, rating, votes, score, matched))

    print(f"  scanned {scanned} quality candidates", flush=True)
    recs.sort(key=lambda r: r.score, reverse=True)
    return recs[:top_n]


def write_reports(recs: list[Rec], out_dir: Path) -> None:
    """Write the ranked recommendations as JSON and as a markdown table.

    Creates ``out_dir`` (and any missing parents) and writes
    ``recommendations_local.json`` and ``recommendations_local.md`` into it.
    Both files are written as UTF-8 regardless of the platform locale, so
    non-ASCII titles never fail to encode.

    Args:
        recs: Recommendations in the order they should be listed.
        out_dir: Destination directory.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    payload = [
        {
            "title": r.title, "year": r.year, "type": r.type,
            "genres": r.genres, "imdb_rating": r.rating, "imdb_votes": r.votes,
            "score": round(r.score, 3), "matched_genres": r.matched,
        }
        for r in recs
    ]
    (out_dir / "recommendations_local.json").write_text(
        json.dumps(payload, indent=2), encoding="utf-8"
    )

    lines = ["# Recommended Downloads (local, IMDb-metadata)", "",
             f"Top {len(recs)} by genre-fingerprint match + IMDb quality. No TMDB used.", "",
             "| # | Title | Yr | Type | IMDb | Votes | Genres |",
             "|---|---|---|---|---|---|---|"]
    for i, r in enumerate(recs, 1):
        # A literal "|" in a title would be parsed as a column separator and
        # shift every following cell; escape it so the row stays intact.
        title = r.title.replace("|", "\\|")
        lines.append(
            f"| {i} | {title} | {r.year or '?'} | {r.type} | {r.rating:.1f} | "
            f"{r.votes:,} | {r.genres} |"
        )
    (out_dir / "recommendations_local.md").write_text("\n".join(lines), encoding="utf-8")
    print(f"\nReports: {out_dir}/recommendations_local.md and .json")


def main() -> None:
    """CLI entry point: parse arguments, rank candidates, write reports, print the top 20.

    Exits with a usage error when ``--watched-alpha`` lies outside ``[0, 1]``.
    """
    # Local import: only the --year default needs it.
    from datetime import date

    ap = argparse.ArgumentParser()
    ap.add_argument("--registry", type=Path, required=True)
    ap.add_argument("--watchlog", type=Path,
                    default=Path.home() / ".local/state/tv-anarchy/watched.jsonl")
    ap.add_argument("--min-imdb-rating", type=float, default=7.5)
    ap.add_argument("--min-imdb-votes", type=int, default=5000)
    ap.add_argument("--top", type=int, default=40)
    # Default to the real current year so the recency window does not go
    # stale; pass --year explicitly to pin it for reproducible runs.
    ap.add_argument("--year", type=int, default=date.today().year,
                    help="reference year for the recency nudge (default: current year)")
    ap.add_argument("--watched-alpha", type=float, default=0.65,
                    help="0=library-only taste, 1=watchlog-only taste")
    ap.add_argument("--out", type=Path, default=Path(__file__).parent / "out")
    args = ap.parse_args()

    # Outside [0, 1] one side of the blend gets a negative weight, which
    # produces a meaningless fingerprint instead of an error. NaN is also
    # rejected by this comparison.
    if not 0.0 <= args.watched_alpha <= 1.0:
        ap.error("--watched-alpha must be between 0 and 1")

    recs = run(args.registry, args.watchlog, min_rating=args.min_imdb_rating,
               min_votes=args.min_imdb_votes, top_n=args.top, current_year=args.year,
               watched_alpha=args.watched_alpha)
    write_reports(recs, args.out)
    print("\nTop 20:")
    for i, r in enumerate(recs[:20], 1):
        print(f"  {i:>2}. {r.title} ({r.year or '?'}) [{r.type}] — IMDb {r.rating:.1f} "
              f"({r.votes:,}) — {r.genres}")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()