# tv-anarchy/recommender/recommend.py

#!/usr/bin/env python3
"""Media recommendation engine.

Pipeline:
1. Build weighted *seeds* from the black library registry + plum watchlog.
2. Resolve each seed to a TMDB id.
3. Gather TMDB recommendations/similar per seed → candidate pool.
4. Score candidates by seed-overlap (weighted) and gate on IMDb rating/votes.
5. Drop already-owned titles.
6. Emit a ranked download list (markdown + json).

Run:
    TMDB_API_KEY=xxxx uv run python recommend.py \
        --registry /path/to/.registry.md \
        --watchlog ~/.local/state/tv-anarchy/watched.jsonl

The registry is pulled from black:/bigdisk/_/media/.registry.md (scp it first
or pass a local copy).
"""

from __future__ import annotations

import argparse
import json
import os
from collections import defaultdict
from dataclasses import dataclass, field
from pathlib import Path

from media_rec.imdb_data import ImdbIndex, build as build_imdb
from media_rec.sources import build_seeds, owned_keys
from media_rec.tmdb import Candidate, Tmdb, TmdbError

DATA_DIR = Path(__file__).parent / "data"
ENV_FILE = Path(__file__).parent / ".env"


def _load_env() -> None:
    """Load KEY=VALUE lines from a local .env (no python-dotenv dependency).

    Reads ``ENV_FILE`` if it exists and copies each assignment into
    ``os.environ``. Variables that are already set in the environment are
    left untouched, so the real environment always wins over the file.

    Parsing rules:
    * blank lines, ``#`` comment lines and lines without ``=`` are skipped;
    * an optional leading ``export `` (shell style) is ignored;
    * whitespace around key and value is trimmed, then surrounding double
      and single quotes are stripped from the value;
    * lines with an empty key (e.g. ``=value``) are skipped, because
      ``os.environ`` rejects empty variable names.
    """
    if not ENV_FILE.exists():
        return
    # Decode explicitly instead of relying on the locale default; "utf-8-sig"
    # also drops a leading BOM that would otherwise be glued to the first key.
    for raw in ENV_FILE.read_text(encoding="utf-8-sig").splitlines():
        line = raw.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        if line.startswith("export "):
            line = line[len("export "):].lstrip()
        key, _, val = line.partition("=")
        key = key.strip()
        if not key:
            continue
        os.environ.setdefault(key, val.strip().strip('"').strip("'"))


@dataclass
class Scored:
    """A recommendation candidate together with the data used to rank it."""

    cand: Candidate  # the TMDB candidate this record describes
    score: float  # final ranking score; results are sorted by it, highest first
    overlap: int  # how many seeds recommended it
    seed_titles: list[str] = field(default_factory=list)  # contributing seed titles, strongest first
    imdb_rating: float | None = None  # None when no IMDb match was found
    imdb_votes: int | None = None  # None when no IMDb match was found
    genres: str = ""  # genres from the IMDb lookup; empty when there was no match


def _norm(title: str) -> str:
    """Return a comparison key for *title*.

    The title is lower-cased, every character that is neither alphanumeric
    nor a plain space is dropped, and the result is trimmed at both ends.
    Interior runs of spaces are left as they are.
    """
    kept = [ch for ch in title.lower() if ch == " " or ch.isalnum()]
    return "".join(kept).strip()


def run(
    registry: Path,
    watchlog: Path,
    *,
    min_imdb_rating: float,
    min_imdb_votes: int,
    max_seeds: int,
    top_n: int,
) -> list[Scored]:
    """Run the recommendation pipeline and return the ranked results.

    Steps:
    1. Build seeds from *registry* and *watchlog* and keep the first
       *max_seeds* of them.
    2. Resolve each seed on TMDB and collect its recommendations into a
       candidate pool, remembering which seeds contributed to each candidate
       (weighted by seed weight and position in the recommendation list).
    3. Skip candidates whose normalized title is already owned.
    4. Score by summed contributions, boosted by the number of contributing
       seeds, then gate on IMDb rating/votes (or on TMDB votes when the
       title has no IMDb match).

    Args:
        registry: Path to the library registry file.
        watchlog: Path to the watch log.
        min_imdb_rating: Candidates with an IMDb rating below this are dropped.
        min_imdb_votes: Candidates with fewer IMDb votes than this are dropped.
        max_seeds: Maximum number of seeds to use.
        top_n: Maximum number of results to return.

    Returns:
        Up to *top_n* ``Scored`` records, highest score first.

    Note:
        ``TmdbError`` is not handled here (presumably raised by the TMDB
        client on API failures); ``main`` catches it.
    """
    api_key = os.environ.get("TMDB_API_KEY", "")
    tmdb = Tmdb(api_key, DATA_DIR / "tmdb_cache.json")

    print("Building IMDb index (first run downloads ~250 MB) ...", flush=True)
    imdb = ImdbIndex(build_imdb(DATA_DIR))

    print("Building seeds from library + watchlog ...", flush=True)
    seeds = build_seeds(registry, watchlog)[:max_seeds]
    owned = owned_keys(registry)
    print(f"  {len(seeds)} seeds (top: {', '.join(s.title for s in seeds[:5])})", flush=True)

    # Resolve seeds → TMDB, gather recommendations.
    pool: dict[int, Candidate] = {}
    contributions: dict[int, list[tuple[str, float]]] = defaultdict(list)
    for i, seed in enumerate(seeds, 1):
        resolved = tmdb.resolve(seed.title, seed.year)
        if resolved is not None:
            recs = tmdb.recommendations(resolved.tmdb_id, resolved.media_type)
            for rank, cand in enumerate(recs):
                pool.setdefault(cand.tmdb_id, cand)
                # Position decay: earlier recs are stronger signals.
                decay = 1.0 / (1.0 + rank * 0.08)
                contributions[cand.tmdb_id].append((seed.title, seed.weight * decay))
        # Report progress and persist the cache every 10 seeds, including
        # when this particular seed could not be resolved.
        if i % 10 == 0:
            print(f"  resolved {i}/{len(seeds)} seeds, pool={len(pool)}", flush=True)
            tmdb.flush()
    tmdb.flush()

    # Score + gate.
    scored: list[Scored] = []
    for cid, cand in pool.items():
        if _norm(cand.title) in owned:
            continue
        contribs = contributions[cid]
        overlap = len(contribs)
        base = sum(w for _, w in contribs)
        # Overlap is the dominant signal — recommended-by-many beats one strong seed.
        score = base * (1.0 + 0.5 * (overlap - 1))

        # Keep the lookup result in its own name: rebinding ``imdb`` here
        # would replace the index itself and break every later lookup.
        hit = imdb_index_lookup(imdb, cand)
        rating = votes = None
        genres = ""
        if hit:
            rating, votes, genres = hit
            if rating < min_imdb_rating or votes < min_imdb_votes:
                continue
            # Quality multiplier: nudge by how far above the bar it sits.
            score *= 1.0 + (rating - min_imdb_rating) * 0.15
        else:
            # No IMDb match — fall back to TMDB votes to avoid obscure noise.
            if cand.vote_count < 200 or cand.vote_average < 6.5:
                continue
            score *= 0.85  # penalize unverified

        # Keep at most four contributing seed titles, strongest first.
        strongest_first = sorted(contribs, key=lambda x: -x[1])
        scored.append(
            Scored(
                cand=cand,
                score=score,
                overlap=overlap,
                seed_titles=[t for t, _ in strongest_first][:4],
                imdb_rating=rating,
                imdb_votes=votes,
                genres=genres,
            )
        )

    scored.sort(key=lambda s: s.score, reverse=True)
    return scored[:top_n]


def imdb_index_lookup(imdb: ImdbIndex, cand: Candidate):
    """Look up *cand* in the IMDb index by its title and year.

    Returns whatever ``imdb.lookup`` returns; ``run`` treats a falsy result
    as "no IMDb match" and otherwise unpacks it as (rating, votes, genres).
    """
    lookup = imdb.lookup
    return lookup(cand.title, cand.year)


def write_reports(scored: list[Scored], out_dir: Path) -> None:
    """Write the ranked results as ``recommendations.json`` and ``.md``.

    Args:
        scored: Ranked results, best first; written in the given order.
        out_dir: Target directory; created (with parents) if missing.

    Both files are written as UTF-8 regardless of the platform locale, since
    the markdown contains non-ASCII text (the "—" placeholder and arbitrary
    titles). In the markdown table a missing IMDb rating/vote count is
    rendered as "—"; a value of 0 is a real value and is printed as such.
    """

    def cell(text: object) -> str:
        # A literal "|" would end the table cell early; escape it.
        return str(text).replace("|", "\\|")

    out_dir.mkdir(parents=True, exist_ok=True)

    payload = [
        {
            "title": s.cand.title,
            "year": s.cand.year,
            "type": s.cand.media_type,
            "tmdb_id": s.cand.tmdb_id,
            "score": round(s.score, 2),
            "overlap": s.overlap,
            "imdb_rating": s.imdb_rating,
            "imdb_votes": s.imdb_votes,
            "genres": s.genres,
            "because_of": s.seed_titles,
        }
        for s in scored
    ]
    (out_dir / "recommendations.json").write_text(
        json.dumps(payload, indent=2), encoding="utf-8"
    )

    lines = ["# Recommended Downloads", "", f"Top {len(scored)} by seed-overlap + IMDb quality.", ""]
    lines.append("| # | Title | Yr | Type | IMDb | Votes | Overlap | Because you have/watch |")
    lines.append("|---|---|---|---|---|---|---|---|")
    for i, s in enumerate(scored, 1):
        # Compare against None so a genuine 0 is not shown as "missing".
        rating = f"{s.imdb_rating:.1f}" if s.imdb_rating is not None else "—"
        votes = f"{s.imdb_votes:,}" if s.imdb_votes is not None else "—"
        because = ", ".join(cell(t) for t in s.seed_titles[:3])
        lines.append(
            f"| {i} | {cell(s.cand.title)} | {s.cand.year or '?'} | {s.cand.media_type} | "
            f"{rating} | {votes} | {s.overlap} | {because} |"
        )
    # End the file with a newline so it is a well-formed text file.
    (out_dir / "recommendations.md").write_text("\n".join(lines) + "\n", encoding="utf-8")
    print(f"\nReports written to {out_dir}/recommendations.md and .json")


def main() -> None:
    """Command-line entry point.

    Parses the arguments, loads the optional local ``.env``, runs the
    pipeline, writes the reports to ``--out`` and prints the top 15 results.

    Exits with an argparse usage error when the registry path does not
    exist, and with a ``SystemExit`` message when the pipeline raises
    ``TmdbError``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--registry", type=Path, required=True)
    ap.add_argument(
        "--watchlog",
        type=Path,
        default=Path.home() / ".local/state/tv-anarchy/watched.jsonl",
    )
    ap.add_argument("--min-imdb-rating", type=float, default=7.0)
    ap.add_argument("--min-imdb-votes", type=int, default=2000)
    ap.add_argument("--max-seeds", type=int, default=120)
    ap.add_argument("--top", type=int, default=40)
    ap.add_argument("--out", type=Path, default=Path(__file__).parent / "out")
    args = ap.parse_args()

    # Fail fast: run() builds the IMDb index (a large first-run download)
    # before it ever reads the registry.
    if not args.registry.exists():
        ap.error(f"registry not found: {args.registry}")

    _load_env()

    try:
        scored = run(
            args.registry,
            args.watchlog,
            min_imdb_rating=args.min_imdb_rating,
            min_imdb_votes=args.min_imdb_votes,
            max_seeds=args.max_seeds,
            top_n=args.top,
        )
    except TmdbError as exc:
        raise SystemExit(f"TMDB error: {exc}\nSet a valid TMDB_API_KEY and retry.") from exc

    write_reports(scored, args.out)
    print("\nTop 15:")
    for i, s in enumerate(scored[:15], 1):
        # None means "no IMDb match"; compare explicitly rather than by truthiness.
        rating = f"IMDb {s.imdb_rating:.1f}" if s.imdb_rating is not None else "IMDb —"
        print(
            f"  {i:>2}. {s.cand.title} ({s.cand.year or '?'}) [{s.cand.media_type}] "
            f"— {rating}, overlap {s.overlap} — via {', '.join(s.seed_titles[:2])}"
        )


if __name__ == "__main__":
    main()