#!/usr/bin/env python3
"""Local, TMDB-free recommendation engine — content-based on IMDb metadata.

Uses *show meta information* (IMDb genres + ratings) only — no API key.

Pipeline:
  1. Build weighted seeds from the black registry + plum watchlog.
  2. Resolve each seed to IMDb genres; build a weighted *genre fingerprint*
     of the user's taste (watch-engagement counts more than mere ownership).
  3. Score the IMDb candidate universe (rating/votes-gated) by genre similarity
     to the fingerprint, boosted by rating and a recency nudge.
  4. Drop owned titles. Emit a ranked discovery list (markdown + json).

Run:
  uv run python recommend_local.py --registry registry.md
"""

from __future__ import annotations

import argparse
import datetime
import json
import math
import re
from collections import Counter, defaultdict
from dataclasses import dataclass
from pathlib import Path

from media_rec.imdb_data import ImdbIndex, build as build_imdb
from media_rec.sources import build_seeds, owned_keys

# Directory handed to the IMDb index builder; it sits next to this module.
DATA_DIR = Path(__file__).parent.joinpath("data")

# Genre tags that describe a format rather than a taste. They are filtered
# out both when building the fingerprint and when matching candidates.
WEAK_GENRES = {
    "Adult",
    "Game-Show",
    "News",
    "Reality-TV",
    "Short",
    "Talk-Show",
}
def _norm(title: str) -> str:
|
||
return "".join(c for c in title.lower() if c.isalnum() or c == " ").strip()
|
||
|
||
|
||
@dataclass
class Rec:
    """One scored recommendation row, as assembled by ``run``."""

    title: str  # display title of the candidate
    year: int | None  # release year; may be missing
    type: str  # title type as yielded by the candidate iterator
    genres: str  # comma-separated genre string, unfiltered
    rating: float  # IMDb rating
    votes: int  # IMDb vote count
    score: float  # final ranking score (higher ranks first)
    matched: list[str]  # candidate genres that also occur in the fingerprint
def _accumulate(seeds, imdb: ImdbIndex, watched_only: bool) -> tuple[dict[str, float], int]:
    """Sum seed weights per IMDb genre.

    Args:
        seeds: Iterable of seed objects exposing ``title``, ``year``,
            ``weight`` and ``watch_count``.
        imdb: Index whose ``lookup(title, year)`` returns a falsy value on a
            miss, or a ``(rating, votes, genres)`` triple on a hit.
        watched_only: When true, seeds with ``watch_count <= 0`` are skipped.

    Returns:
        ``(genre_weights, resolved)``: the accumulated weight per genre and
        the number of seeds that contributed at least one usable genre.
    """
    totals: dict[str, float] = defaultdict(float)
    n_resolved = 0
    for seed in seeds:
        if watched_only and seed.watch_count <= 0:
            continue
        found = imdb.lookup(seed.title, seed.year)
        if not found:
            continue
        _, _, genre_csv = found
        usable = [name for name in genre_csv.split(",") if name and name not in WEAK_GENRES]
        if usable:
            n_resolved += 1
            # Each seed spreads its weight evenly over its genres so that
            # multi-genre titles do not dominate the totals.
            share = seed.weight / len(usable)
            for name in usable:
                totals[name] += share
    return totals, n_resolved
def _l2(vec: dict[str, float]) -> dict[str, float]:
|
||
norm = math.sqrt(sum(v * v for v in vec.values()))
|
||
return {k: v / norm for k, v in vec.items()} if norm else {}
|
||
|
||
|
||
def build_fingerprint(seeds, imdb: ImdbIndex, watched_alpha: float) -> dict[str, float]:
    """Blend an active-taste fingerprint (watchlog) with the library fingerprint.

    The library is eclectic and anime-heavy; weighting it alone buries the
    user's *active* taste. We L2-normalize each fingerprint separately, then
    blend ``watched_alpha`` of the watched-only taste with the remainder of
    the full-library taste.

    Args:
        seeds: Sized collection of seed objects; it is iterated twice and
            passed to ``len``.
        imdb: IMDb index used to resolve each seed to its genres.
        watched_alpha: Blend factor; 0 keeps only the library taste, 1 keeps
            only the watched taste.

    Returns:
        A unit-length ``genre -> weight`` mapping, or an empty dict when no
        genre ends up with a non-zero weight. Genres whose blended weight is
        exactly zero are omitted.
    """
    lib_raw, resolved = _accumulate(seeds, imdb, watched_only=False)
    watched_raw, _ = _accumulate(seeds, imdb, watched_only=True)
    lib_fp = _l2(lib_raw)
    watched_fp = _l2(watched_raw)

    blended: dict[str, float] = defaultdict(float)
    for g, w in lib_fp.items():
        blended[g] += (1.0 - watched_alpha) * w
    for g, w in watched_fp.items():
        blended[g] += watched_alpha * w
    # With watched_alpha == 1.0 (or 0.0) one side contributes 0.0 per genre.
    # Leaving those keys in would make membership tests on the fingerprint
    # treat a genre as "matched" although it carries no weight, so drop them.
    fingerprint = _l2({g: w for g, w in blended.items() if w})

    print(f"  resolved {resolved}/{len(seeds)} seeds to IMDb genres", flush=True)
    if watched_fp:
        wt = sorted(watched_fp.items(), key=lambda x: -x[1])[:6]
        print("  watched taste:  " + ", ".join(f"{g} {w:.2f}" for g, w in wt), flush=True)
    top = sorted(fingerprint.items(), key=lambda x: -x[1])[:8]
    print(f"  blended fingerprint (α={watched_alpha}): "
          + ", ".join(f"{g} {w:.2f}" for g, w in top), flush=True)
    return fingerprint
def run(registry: Path, watchlog: Path, *, min_rating: float, min_votes: int,
        top_n: int, current_year: int, watched_alpha: float) -> list[Rec]:
    """Score the IMDb candidate universe against the user's genre fingerprint.

    Args:
        registry: Path to the registry file; used for seeds and owned titles.
        watchlog: Path to the watch log; used for seeds.
        min_rating: Rating gate passed to the candidate iterator; also the
            baseline for the quality boost.
        min_votes: Vote gate passed to the candidate iterator; also the
            baseline for the popularity nudge. Values below 1 are treated as
            1 for the logarithm only.
        top_n: Number of recommendations to return.
        current_year: Reference year; titles from the last six years get a
            recency boost.
        watched_alpha: Blend factor forwarded to ``build_fingerprint``.

    Returns:
        The ``top_n`` highest-scoring recommendations, best first.

    Raises:
        SystemExit: If no genre fingerprint could be built.
    """
    print("Building IMDb index ...", flush=True)
    imdb = ImdbIndex(build_imdb(DATA_DIR))

    print("Building seeds + taste fingerprint ...", flush=True)
    seeds = build_seeds(registry, watchlog)
    # Normalize owned keys with the SAME function used on candidates so
    # punctuation/case differences (e.g. "Frieren: Beyond Journey's End") match.
    owned = {_norm(t) for t in owned_keys(registry)}
    owned |= {_norm(s.title) for s in seeds}

    fingerprint = build_fingerprint(seeds, imdb, watched_alpha)
    if not fingerprint:
        raise SystemExit("Could not build a genre fingerprint — no seeds resolved.")

    print(f"Scoring candidate universe (rating>={min_rating}, votes>={min_votes}) ...", flush=True)
    # log10 raises ValueError for values <= 0, so clamp to 1 (log10(1) == 0).
    # The baseline is loop-invariant and computed once.
    votes_floor_log = math.log10(max(min_votes, 1))
    recent_cutoff = current_year - 6
    recs: list[Rec] = []
    scanned = 0
    for display, year, ttype, genres, rating, votes in imdb.iter_candidates(min_rating, min_votes):
        scanned += 1
        if _norm(display) in owned:
            continue
        gs = [g for g in genres.split(",") if g and g not in WEAK_GENRES]
        if not gs:
            continue
        # Cosine-style similarity of candidate genre set to the fingerprint.
        # Only genres carrying a non-zero weight count as overlap.
        matched = [g for g in gs if fingerprint.get(g)]
        if len(matched) < 2:  # require real genre overlap, not a single tag
            continue
        cand_norm = math.sqrt(len(gs))
        sim = sum(fingerprint[g] for g in matched) / cand_norm
        # Quality + (gentle) popularity + recency shaping. Popularity is a small
        # noise-floor nudge above the votes gate, NOT a blockbuster magnet, so
        # genre-fit and rating drive ranking instead of raw fame.
        quality = (rating - min_rating) * 0.18
        popularity = max(0.0, math.log10(max(votes, 1)) - votes_floor_log) * 0.06
        recency = 0.15 if (year and year >= recent_cutoff) else 0.0
        score = sim * (1.0 + quality + popularity + recency)
        recs.append(Rec(display, year, ttype, genres, rating, votes, score, matched))

    print(f"  scanned {scanned} quality candidates", flush=True)
    recs.sort(key=lambda r: r.score, reverse=True)
    return recs[:top_n]
def write_reports(recs: list[Rec], out_dir: Path) -> None:
    """Write the recommendations as JSON and as a markdown table.

    Creates ``out_dir`` if needed and writes ``recommendations_local.json``
    and ``recommendations_local.md`` into it, both encoded as UTF-8.

    Args:
        recs: Recommendations in rank order.
        out_dir: Destination directory; missing parents are created.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    payload = [
        {
            "title": r.title, "year": r.year, "type": r.type,
            "genres": r.genres, "imdb_rating": r.rating, "imdb_votes": r.votes,
            "score": round(r.score, 3), "matched_genres": r.matched,
        }
        for r in recs
    ]
    # Explicit UTF-8: the locale default can fail on non-ASCII titles.
    (out_dir / "recommendations_local.json").write_text(
        json.dumps(payload, indent=2), encoding="utf-8"
    )
    lines = ["# Recommended Downloads (local, IMDb-metadata)", "",
             f"Top {len(recs)} by genre-fingerprint match + IMDb quality. No TMDB used.", "",
             "| # | Title | Yr | Type | IMDb | Votes | Genres |",
             "|---|---|---|---|---|---|---|"]
    for i, r in enumerate(recs, 1):
        # A literal "|" in a title would otherwise split the table cell.
        safe_title = r.title.replace("|", "\\|")
        lines.append(
            f"| {i} | {safe_title} | {r.year or '?'} | {r.type} | {r.rating:.1f} | "
            f"{r.votes:,} | {r.genres} |"
        )
    (out_dir / "recommendations_local.md").write_text("\n".join(lines), encoding="utf-8")
    print(f"\nReports: {out_dir}/recommendations_local.md and .json")
def main() -> None:
    """Command-line entry point.

    Parses the arguments, computes the recommendations, writes the report
    files and prints the first 20 results.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--registry", type=Path, required=True)
    ap.add_argument("--watchlog", type=Path,
                    default=Path.home() / ".local/state/plum-control-mcp/watched.jsonl")
    ap.add_argument("--min-imdb-rating", type=float, default=7.5)
    ap.add_argument("--min-imdb-votes", type=int, default=5000)
    ap.add_argument("--top", type=int, default=40)
    # Default to the current calendar year so the recency boost does not go
    # stale the way a hard-coded year would; --year still overrides it.
    ap.add_argument("--year", type=int, default=datetime.date.today().year,
                    help="reference year for the recency nudge (default: current year)")
    ap.add_argument("--watched-alpha", type=float, default=0.65,
                    help="0=library-only taste, 1=watchlog-only taste")
    ap.add_argument("--out", type=Path, default=Path(__file__).parent / "out")
    args = ap.parse_args()

    recs = run(args.registry, args.watchlog, min_rating=args.min_imdb_rating,
               min_votes=args.min_imdb_votes, top_n=args.top, current_year=args.year,
               watched_alpha=args.watched_alpha)
    write_reports(recs, args.out)
    print("\nTop 20:")
    for i, r in enumerate(recs[:20], 1):
        print(f"  {i:>2}. {r.title} ({r.year or '?'}) [{r.type}] — IMDb {r.rating:.1f} "
              f"({r.votes:,}) — {r.genres}")


if __name__ == "__main__":
    main()