tv-anarchy/recommender/recommend.py
Natalie 4a2ceb9781 feat(offline): inline star-to-keep and trash-to-cull on cache rows
Surface the existing pin (keep-from-cull) and per-file delete actions as
visible inline buttons on each offline cache row instead of context-menu-only:
a star toggles protection from auto-cull (and restore-if-missing), a trash
culls that file early. Aligns wording/icons to the star metaphor.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-30 00:12:41 -04:00

225 lines
7.7 KiB
Python

#!/usr/bin/env python3
"""Media recommendation engine.
Pipeline:
1. Build weighted *seeds* from the black library registry + plum watchlog.
2. Resolve each seed to a TMDB id.
3. Gather TMDB recommendations/similar per seed → candidate pool.
4. Score candidates by seed-overlap (weighted) and gate on IMDb rating/votes.
5. Drop already-owned titles.
6. Emit a ranked download list (markdown + json).
Run:
TMDB_API_KEY=xxxx uv run python recommend.py \
--registry /path/to/.registry.md \
--watchlog ~/.local/state/tv-anarchy/watched.jsonl
The registry is pulled from black:/bigdisk/_/media/.registry.md (scp it first
or pass a local copy).
"""
from __future__ import annotations
import argparse
import json
import os
from collections import defaultdict
from dataclasses import dataclass, field
from pathlib import Path
from media_rec.imdb_data import ImdbIndex, build as build_imdb
from media_rec.sources import build_seeds, owned_keys
from media_rec.tmdb import Candidate, Tmdb, TmdbError
# Directory next to this script. run() keeps the TMDB cache file
# ("tmdb_cache.json") here and hands the same directory to build_imdb().
DATA_DIR = Path(__file__).parent / "data"
# Optional KEY=VALUE file next to this script; read by _load_env().
ENV_FILE = Path(__file__).parent / ".env"
def _load_env() -> None:
    """Populate ``os.environ`` from the local ``.env`` file, if there is one.

    Each usable line has the form ``KEY=VALUE``. Blank lines, ``#`` comments
    and lines without ``=`` are skipped. Surrounding whitespace and quote
    characters are stripped from the value. Variables that are already set in
    the environment are left untouched (no python-dotenv dependency).
    """
    if ENV_FILE.exists():
        for raw in ENV_FILE.read_text().splitlines():
            entry = raw.strip()
            blank_or_comment = not entry or entry.startswith("#")
            if blank_or_comment or "=" not in entry:
                continue
            name, _sep, value = entry.partition("=")
            value = value.strip().strip('"').strip("'")
            # setdefault: the real environment wins over the file.
            os.environ.setdefault(name.strip(), value)
@dataclass
class Scored:
    """One ranked recommendation together with the evidence behind it."""

    # The recommended title itself.
    cand: Candidate
    # Final ranking score; higher sorts first.
    score: float
    # Number of seeds that recommended this candidate.
    overlap: int
    # Titles of contributing seeds, kept for the "because of" columns.
    seed_titles: list[str] = field(default_factory=list)
    # IMDb figures; None when no IMDb match was found.
    imdb_rating: float | None = None
    imdb_votes: int | None = None
    # Genre text taken from the IMDb match; empty when there is none.
    genres: str = ""
def _norm(title: str) -> str:
    """Return *title* lower-cased with everything but letters, digits and spaces removed.

    Leading and trailing whitespace is stripped from the result; interior
    spaces are kept as they are.
    """
    kept = [ch for ch in title.lower() if ch == " " or ch.isalnum()]
    return "".join(kept).strip()
def run(
    registry: Path,
    watchlog: Path,
    *,
    min_imdb_rating: float,
    min_imdb_votes: int,
    max_seeds: int,
    top_n: int,
) -> list[Scored]:
    """Run the recommendation pipeline and return the top-ranked candidates.

    Seeds are built from the registry and watchlog, resolved on TMDB, and the
    TMDB recommendations of every resolved seed are pooled. Each pooled
    candidate that is not already owned is scored by weighted seed overlap and
    gated on IMDb rating/votes (or on TMDB votes when there is no IMDb match).

    Args:
        registry: Library registry file, passed to ``build_seeds`` and
            ``owned_keys``.
        watchlog: Watch-log file, passed to ``build_seeds``.
        min_imdb_rating: Candidates with an IMDb match rated below this are
            dropped.
        min_imdb_votes: Candidates with an IMDb match and fewer votes than
            this are dropped.
        max_seeds: Only the first ``max_seeds`` seeds are used.
        top_n: Maximum number of results returned.

    Returns:
        At most ``top_n`` ``Scored`` entries, highest score first.

    Note:
        ``TmdbError`` from the TMDB client is not handled here; ``main()``
        catches it.
    """
    api_key = os.environ.get("TMDB_API_KEY", "")
    tmdb = Tmdb(api_key, DATA_DIR / "tmdb_cache.json")
    print("Building IMDb index (first run downloads ~250 MB) ...", flush=True)
    imdb = ImdbIndex(build_imdb(DATA_DIR))
    print("Building seeds from library + watchlog ...", flush=True)
    seeds = build_seeds(registry, watchlog)[:max_seeds]
    owned = owned_keys(registry)
    print(f" {len(seeds)} seeds (top: {', '.join(s.title for s in seeds[:5])})", flush=True)

    # Resolve seeds → TMDB, gather recommendations.
    pool: dict[int, Candidate] = {}
    contributions: dict[int, list[tuple[str, float]]] = defaultdict(list)
    for i, seed in enumerate(seeds, 1):
        resolved = tmdb.resolve(seed.title, seed.year)
        if resolved is None:
            continue
        recs = tmdb.recommendations(resolved.tmdb_id, resolved.media_type)
        for rank, cand in enumerate(recs):
            pool.setdefault(cand.tmdb_id, cand)
            # Position decay: earlier recs are stronger signals.
            decay = 1.0 / (1.0 + rank * 0.08)
            contributions[cand.tmdb_id].append((seed.title, seed.weight * decay))
        if i % 10 == 0:
            print(f" resolved {i}/{len(seeds)} seeds, pool={len(pool)}", flush=True)
            # Periodic flush so progress is not lost if a later seed fails.
            tmdb.flush()
    tmdb.flush()

    # Score + gate.
    scored: list[Scored] = []
    for cid, cand in pool.items():
        if _norm(cand.title) in owned:
            continue
        contribs = contributions[cid]
        overlap = len(contribs)
        base = sum(w for _, w in contribs)
        # Overlap is the dominant signal — recommended-by-many beats one strong seed.
        score = base * (1.0 + 0.5 * (overlap - 1))

        # Keep the lookup result in its own name. Rebinding `imdb` here would
        # replace the index with a tuple/None and break the lookup for every
        # subsequent candidate.
        match = imdb_index_lookup(imdb, cand)
        rating = votes = None
        genres = ""
        if match:
            # The unpacking requires a (rating, votes, genres) triple.
            rating, votes, genres = match
            if rating < min_imdb_rating or votes < min_imdb_votes:
                continue
            # Quality multiplier: nudge by how far above the bar it sits.
            score *= 1.0 + (rating - min_imdb_rating) * 0.15
        else:
            # No IMDb match — fall back to TMDB votes to avoid obscure noise.
            if cand.vote_count < 200 or cand.vote_average < 6.5:
                continue
            score *= 0.85  # penalize unverified

        scored.append(
            Scored(
                cand=cand,
                score=score,
                overlap=overlap,
                # Strongest contributing seeds first, capped at four.
                seed_titles=[t for t, _ in sorted(contribs, key=lambda x: -x[1])][:4],
                imdb_rating=rating,
                imdb_votes=votes,
                genres=genres,
            )
        )
    scored.sort(key=lambda s: s.score, reverse=True)
    return scored[:top_n]
def imdb_index_lookup(imdb: ImdbIndex, cand: Candidate):
    """Look *cand* up in the IMDb index by title and year.

    Returns whatever ``imdb.lookup`` returns for that (title, year) pair.
    """
    lookup = imdb.lookup
    return lookup(cand.title, cand.year)
def write_reports(scored: list[Scored], out_dir: Path) -> None:
    """Write ``recommendations.json`` and ``recommendations.md`` into *out_dir*.

    Args:
        scored: Ranked recommendations, best first; written in this order.
        out_dir: Target directory; created (with parents) when missing.

    Both files are written as UTF-8 so that non-ASCII titles do not depend on
    the platform's locale encoding.
    """

    def cell(text: object) -> str:
        # A literal "|" would terminate the table cell early; "\|" keeps it
        # inside the cell.
        return str(text).replace("|", "\\|")

    out_dir.mkdir(parents=True, exist_ok=True)

    payload = [
        {
            "title": s.cand.title,
            "year": s.cand.year,
            "type": s.cand.media_type,
            "tmdb_id": s.cand.tmdb_id,
            "score": round(s.score, 2),
            "overlap": s.overlap,
            "imdb_rating": s.imdb_rating,
            "imdb_votes": s.imdb_votes,
            "genres": s.genres,
            "because_of": s.seed_titles,
        }
        for s in scored
    ]
    (out_dir / "recommendations.json").write_text(
        json.dumps(payload, indent=2), encoding="utf-8"
    )

    lines = [
        "# Recommended Downloads",
        "",
        f"Top {len(scored)} by seed-overlap + IMDb quality.",
        "",
        "| # | Title | Yr | Type | IMDb | Votes | Overlap | Because you have/watch |",
        "|---|---|---|---|---|---|---|---|",
    ]
    for i, s in enumerate(scored, 1):
        # `is not None` rather than truthiness: a real 0 must still be shown.
        rating = f"{s.imdb_rating:.1f}" if s.imdb_rating is not None else ""
        votes = f"{s.imdb_votes:,}" if s.imdb_votes is not None else ""
        because = ", ".join(cell(t) for t in s.seed_titles[:3])
        lines.append(
            f"| {i} | {cell(s.cand.title)} | {s.cand.year or '?'} | {s.cand.media_type} | "
            f"{rating} | {votes} | {s.overlap} | {because} |"
        )
    (out_dir / "recommendations.md").write_text("\n".join(lines), encoding="utf-8")
    print(f"\nReports written to {out_dir}/recommendations.md and .json")
def main() -> None:
    """Command-line entry point.

    Parses the flags, loads the local ``.env``, runs the pipeline, writes the
    report files and prints the first 15 results. A ``TmdbError`` raised by
    the pipeline ends the program with an explanatory message.
    """
    here = Path(__file__).parent
    parser = argparse.ArgumentParser()
    parser.add_argument("--registry", type=Path, required=True)
    parser.add_argument(
        "--watchlog",
        type=Path,
        default=Path.home() / ".local/state/tv-anarchy/watched.jsonl",
    )
    parser.add_argument("--min-imdb-rating", type=float, default=7.0)
    parser.add_argument("--min-imdb-votes", type=int, default=2000)
    parser.add_argument("--max-seeds", type=int, default=120)
    parser.add_argument("--top", type=int, default=40)
    parser.add_argument("--out", type=Path, default=here / "out")
    opts = parser.parse_args()

    # Load .env before run() reads TMDB_API_KEY from the environment.
    _load_env()
    limits = {
        "min_imdb_rating": opts.min_imdb_rating,
        "min_imdb_votes": opts.min_imdb_votes,
        "max_seeds": opts.max_seeds,
        "top_n": opts.top,
    }
    try:
        ranked = run(opts.registry, opts.watchlog, **limits)
    except TmdbError as exc:
        raise SystemExit(f"TMDB error: {exc}\nSet a valid TMDB_API_KEY and retry.")
    write_reports(ranked, opts.out)

    print("\nTop 15:")
    for position, entry in enumerate(ranked[:15], 1):
        if entry.imdb_rating:
            rating_label = f"IMDb {entry.imdb_rating:.1f}"
        else:
            rating_label = "IMDb —"
        via = ", ".join(entry.seed_titles[:2])
        print(
            f" {position:>2}. {entry.cand.title} ({entry.cand.year or '?'}) [{entry.cand.media_type}] "
            f"{rating_label}, overlap {entry.overlap} — via {via}"
        )
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()