Surface the existing pin (keep-from-cull) and per-file delete actions as visible inline buttons on each offline cache row instead of context-menu-only: a star toggles protection from auto-cull (and restore-if-missing), a trash culls that file early. Aligns wording/icons to the star metaphor. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
225 lines
7.7 KiB
Python
225 lines
7.7 KiB
Python
#!/usr/bin/env python3
|
|
"""Media recommendation engine.
|
|
|
|
Pipeline:
|
|
1. Build weighted *seeds* from the black library registry + plum watchlog.
|
|
2. Resolve each seed to a TMDB id.
|
|
3. Gather TMDB recommendations/similar per seed → candidate pool.
|
|
4. Score candidates by seed-overlap (weighted) and gate on IMDb rating/votes.
|
|
5. Drop already-owned titles.
|
|
6. Emit a ranked download list (markdown + json).
|
|
|
|
Run:
|
|
TMDB_API_KEY=xxxx uv run python recommend.py \
|
|
--registry /path/to/.registry.md \
|
|
--watchlog ~/.local/state/tv-anarchy/watched.jsonl
|
|
|
|
The registry lives at black:/bigdisk/_/media/.registry.md; scp it to this
machine first and pass the local copy via --registry.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
from collections import defaultdict
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
|
|
from media_rec.imdb_data import ImdbIndex, build as build_imdb
|
|
from media_rec.sources import build_seeds, owned_keys
|
|
from media_rec.tmdb import Candidate, Tmdb, TmdbError
|
|
|
|
# Directory beside this module; run() stores the TMDB cache file here and
# hands the same directory to the IMDb index builder.
DATA_DIR = Path(__file__).parent / "data"
# Optional KEY=VALUE file beside this module, read by _load_env().
ENV_FILE = Path(__file__).parent / ".env"
|
|
|
|
|
|
def _load_env(env_file: Path | None = None) -> None:
    """Load KEY=VALUE lines from a local .env (no python-dotenv dependency).

    Blank lines, lines starting with ``#``, lines without ``=`` and lines
    with an empty key are skipped. Values are stripped of surrounding
    whitespace and of leading/trailing double and single quotes. Variables
    already present in ``os.environ`` are left untouched.

    Args:
        env_file: File to read. Defaults to the module-level ``ENV_FILE``.
            A missing file is silently ignored.
    """
    path = ENV_FILE if env_file is None else env_file
    if not path.exists():
        return
    # Decode explicitly so the result does not depend on the locale encoding.
    for raw in path.read_text(encoding="utf-8").splitlines():
        line = raw.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, _, val = line.partition("=")
        key = key.strip()
        if not key:
            # "=value" names no variable; an empty name is rejected by the OS.
            continue
        # setdefault: the real environment wins over the .env file.
        os.environ.setdefault(key, val.strip().strip('"').strip("'"))
|
|
|
|
|
|
@dataclass
class Scored:
    """A candidate title together with its ranking score and supporting evidence."""

    cand: Candidate  # the recommended title as returned by the TMDB client
    score: float  # final ranking score; run() sorts highest first
    overlap: int  # how many seeds recommended it
    seed_titles: list[str] = field(default_factory=list)  # contributing seed titles, strongest first (run() keeps at most 4)
    imdb_rating: float | None = None  # IMDb rating; None when the IMDb lookup found nothing
    imdb_votes: int | None = None  # IMDb vote count; None when the IMDb lookup found nothing
    genres: str = ""  # genres value from the IMDb lookup; empty when unmatched
|
|
|
|
|
|
def _norm(title: str) -> str:
    """Return *title* lowercased, keeping only alphanumerics and spaces.

    Every other character (punctuation, tabs, symbols) is removed rather
    than replaced, and leading/trailing spaces are trimmed. Interior runs
    of spaces are left as they are.
    """
    kept = [ch for ch in title.lower() if ch == " " or ch.isalnum()]
    return "".join(kept).strip()
|
|
|
|
|
|
def run(
    registry: Path,
    watchlog: Path,
    *,
    min_imdb_rating: float,
    min_imdb_votes: int,
    max_seeds: int,
    top_n: int,
) -> list[Scored]:
    """Build the candidate pool, score it, and return the best ``top_n`` entries.

    Seeds come from ``build_seeds(registry, watchlog)``; only the first
    ``max_seeds`` are used. Each seed is resolved through TMDB and its
    recommendations are pooled. A candidate's base score is the sum of the
    contributing seed weights (decayed by the candidate's position in each
    recommendation list), boosted by how many seeds recommended it.
    Candidates whose normalized title is in ``owned_keys(registry)`` are
    dropped. Candidates with an IMDb match must meet both IMDb thresholds;
    candidates without one must pass a fixed TMDB vote gate and are
    penalized.

    Args:
        registry: Path to the library registry file.
        watchlog: Path to the watch log.
        min_imdb_rating: Minimum IMDb rating for IMDb-matched candidates.
        min_imdb_votes: Minimum IMDb vote count for IMDb-matched candidates.
        max_seeds: Maximum number of seeds to use.
        top_n: Maximum number of results to return.

    Returns:
        Up to ``top_n`` ``Scored`` entries, highest score first.

    Raises:
        TmdbError: presumably raised by the TMDB client on API failure;
            main() catches it around this call.
    """
    api_key = os.environ.get("TMDB_API_KEY", "")
    tmdb = Tmdb(api_key, DATA_DIR / "tmdb_cache.json")

    print("Building IMDb index (first run downloads ~250 MB) ...", flush=True)
    imdb = ImdbIndex(build_imdb(DATA_DIR))

    print("Building seeds from library + watchlog ...", flush=True)
    seeds = build_seeds(registry, watchlog)[:max_seeds]
    owned = owned_keys(registry)
    print(f" {len(seeds)} seeds (top: {', '.join(s.title for s in seeds[:5])})", flush=True)

    # Resolve seeds → TMDB, gather recommendations.
    pool: dict[int, Candidate] = {}
    contributions: dict[int, list[tuple[str, float]]] = defaultdict(list)
    for i, seed in enumerate(seeds, 1):
        resolved = tmdb.resolve(seed.title, seed.year)
        if resolved is None:
            continue
        recs = tmdb.recommendations(resolved.tmdb_id, resolved.media_type)
        for rank, cand in enumerate(recs):
            # First seed to surface a candidate supplies its record.
            pool.setdefault(cand.tmdb_id, cand)
            # Position decay: earlier recs are stronger signals.
            decay = 1.0 / (1.0 + rank * 0.08)
            contributions[cand.tmdb_id].append((seed.title, seed.weight * decay))
        if i % 10 == 0:
            print(f" resolved {i}/{len(seeds)} seeds, pool={len(pool)}", flush=True)
            # Periodic flush so progress is not lost if a later seed fails.
            tmdb.flush()
    tmdb.flush()

    # Score + gate.
    scored: list[Scored] = []
    for cid, cand in pool.items():
        if _norm(cand.title) in owned:
            continue
        contribs = contributions[cid]
        overlap = len(contribs)
        base = sum(w for _, w in contribs)
        # Overlap is the dominant signal — recommended-by-many beats one strong seed.
        score = base * (1.0 + 0.5 * (overlap - 1))

        # Keep the lookup result under its own name: rebinding `imdb` here
        # would replace the index with a result (or None) and make the
        # lookup for the next candidate fail.
        match = imdb_index_lookup(imdb, cand)
        rating = votes = None
        genres = ""
        if match:
            rating, votes, genres = match
            if rating < min_imdb_rating or votes < min_imdb_votes:
                continue
            # Quality multiplier: nudge by how far above the bar it sits.
            score *= 1.0 + (rating - min_imdb_rating) * 0.15
        else:
            # No IMDb match — fall back to TMDB votes to avoid obscure noise.
            if cand.vote_count < 200 or cand.vote_average < 6.5:
                continue
            score *= 0.85  # penalize unverified

        scored.append(
            Scored(
                cand=cand,
                score=score,
                overlap=overlap,
                # Strongest contributing seeds first, capped at four.
                seed_titles=[t for t, _ in sorted(contribs, key=lambda x: -x[1])][:4],
                imdb_rating=rating,
                imdb_votes=votes,
                genres=genres,
            )
        )

    scored.sort(key=lambda s: s.score, reverse=True)
    return scored[:top_n]
|
|
|
|
|
|
def imdb_index_lookup(imdb: ImdbIndex, cand: Candidate):
    """Look *cand* up in the IMDb index by its title and year.

    The result of ``imdb.lookup`` is returned unchanged; run() treats a
    truthy result as a ``(rating, votes, genres)`` triple and a falsy one
    as "no match".
    """
    title, year = cand.title, cand.year
    return imdb.lookup(title, year)
|
|
|
|
|
|
def write_reports(scored: list[Scored], out_dir: Path) -> None:
    """Write ``recommendations.json`` and ``recommendations.md`` into *out_dir*.

    The directory (and any missing parents) is created. The JSON file holds
    one object per entry; the Markdown file holds a ranked table. Both files
    are written as UTF-8 regardless of the locale.

    Args:
        scored: Entries to report, already in ranked order.
        out_dir: Destination directory.
    """

    def cell(text: object) -> str:
        # A raw "|" inside a cell would split the Markdown table row.
        return str(text).replace("|", "\\|")

    out_dir.mkdir(parents=True, exist_ok=True)
    payload = [
        {
            "title": s.cand.title,
            "year": s.cand.year,
            "type": s.cand.media_type,
            "tmdb_id": s.cand.tmdb_id,
            "score": round(s.score, 2),
            "overlap": s.overlap,
            "imdb_rating": s.imdb_rating,
            "imdb_votes": s.imdb_votes,
            "genres": s.genres,
            "because_of": s.seed_titles,
        }
        for s in scored
    ]
    (out_dir / "recommendations.json").write_text(
        json.dumps(payload, indent=2),
        encoding="utf-8",
    )

    lines = ["# Recommended Downloads", "", f"Top {len(scored)} by seed-overlap + IMDb quality.", ""]
    lines.append("| # | Title | Yr | Type | IMDb | Votes | Overlap | Because you have/watch |")
    lines.append("|---|---|---|---|---|---|---|---|")
    for i, s in enumerate(scored, 1):
        # Compare against None so a genuine 0 is shown instead of the
        # "missing" dash.
        rating = f"{s.imdb_rating:.1f}" if s.imdb_rating is not None else "—"
        votes = f"{s.imdb_votes:,}" if s.imdb_votes is not None else "—"
        because = ", ".join(cell(t) for t in s.seed_titles[:3])
        lines.append(
            f"| {i} | {cell(s.cand.title)} | {s.cand.year or '?'} | {s.cand.media_type} | "
            f"{rating} | {votes} | {s.overlap} | {because} |"
        )
    # Explicit UTF-8: the table contains "—" and arbitrary titles.
    (out_dir / "recommendations.md").write_text("\n".join(lines), encoding="utf-8")
    print(f"\nReports written to {out_dir}/recommendations.md and .json")
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: parse options, rank, write reports, print a summary.

    Exits via ``SystemExit`` with an explanatory message when the TMDB
    client reports an error.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--registry", type=Path, required=True)
    default_watchlog = Path.home() / ".local/state/tv-anarchy/watched.jsonl"
    parser.add_argument("--watchlog", type=Path, default=default_watchlog)
    # Plain numeric tuning knobs, registered in the same order as before.
    for flag, kind, fallback in (
        ("--min-imdb-rating", float, 7.0),
        ("--min-imdb-votes", int, 2000),
        ("--max-seeds", int, 120),
        ("--top", int, 40),
    ):
        parser.add_argument(flag, type=kind, default=fallback)
    parser.add_argument("--out", type=Path, default=Path(__file__).parent / "out")
    opts = parser.parse_args()

    # Load .env after parsing so `--help` and argument errors exit first.
    _load_env()

    try:
        ranked = run(
            opts.registry,
            opts.watchlog,
            min_imdb_rating=opts.min_imdb_rating,
            min_imdb_votes=opts.min_imdb_votes,
            max_seeds=opts.max_seeds,
            top_n=opts.top,
        )
    except TmdbError as exc:
        raise SystemExit(f"TMDB error: {exc}\nSet a valid TMDB_API_KEY and retry.")

    write_reports(ranked, opts.out)

    print("\nTop 15:")
    for position, entry in enumerate(ranked[:15], 1):
        if entry.imdb_rating:
            rating_label = f"IMDb {entry.imdb_rating:.1f}"
        else:
            rating_label = "IMDb —"
        via = ", ".join(entry.seed_titles[:2])
        head = f" {position:>2}. {entry.cand.title} ({entry.cand.year or '?'}) [{entry.cand.media_type}] "
        tail = f"— {rating_label}, overlap {entry.overlap} — via {via}"
        print(head + tail)
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|