tv-anarchy/recommender/recommend_local.py
Natalie 4a2ceb9781 feat(offline): inline star-to-keep and trash-to-cull on cache rows
Surface the existing pin (keep-from-cull) and per-file delete actions as
visible inline buttons on each offline cache row instead of context-menu-only:
a star toggles protection from auto-cull (and restore-if-missing), a trash
culls that file early. Aligns wording/icons to the star metaphor.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-30 00:12:41 -04:00

207 lines
8.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""Local, TMDB-free recommendation engine — content-based on IMDb metadata.
Uses *show meta information* (IMDb genres + ratings) only — no API key.
Pipeline:
1. Build weighted seeds from the black registry + plum watchlog.
2. Resolve each seed to IMDb genres; build a weighted *genre fingerprint*
of the user's taste (watch-engagement counts more than mere ownership).
3. Score the IMDb candidate universe (rating/votes-gated) by genre similarity
to the fingerprint, boosted by rating and a recency nudge.
4. Drop owned titles. Emit a ranked discovery list (markdown + json).
Run:
uv run python recommend_local.py --registry registry.md
"""
from __future__ import annotations
import argparse
import json
import math
import re
from collections import Counter, defaultdict
from dataclasses import dataclass
from pathlib import Path
from media_rec.imdb_data import ImdbIndex, build as build_imdb
from media_rec.sources import build_seeds, owned_keys
# Data directory that sits next to this module; it is what gets handed to the
# IMDb index builder.
DATA_DIR = Path(__file__).parent.joinpath("data")

# Genre tags treated as structural noise rather than taste signal. They are
# filtered out of genre lists before any weighting or similarity scoring.
WEAK_GENRES = {
    "News",
    "Talk-Show",
    "Reality-TV",
    "Game-Show",
    "Short",
    "Adult",
}
def _norm(title: str) -> str:
return "".join(c for c in title.lower() if c.isalnum() or c == " ").strip()
@dataclass
class Rec:
    """A single scored recommendation, as produced by the ranking pass."""

    title: str  # display title of the candidate
    year: int | None  # release year; may be missing (reports print '?')
    type: str  # title type string, passed through from the candidate source
    genres: str  # raw comma-separated genre string
    rating: float  # IMDb rating
    votes: int  # IMDb vote count
    score: float  # final ranking score (higher ranks first)
    matched: list[str]  # candidate genres that are present in the fingerprint
def _accumulate(seeds, imdb: ImdbIndex, watched_only: bool) -> tuple[dict[str, float], int]:
    """Sum seed weights per genre.

    Each seed is looked up in *imdb* by title and year. Seeds that do not
    resolve, or whose genres are all in ``WEAK_GENRES``, contribute nothing.
    A resolved seed's weight is divided evenly across its remaining genres.

    Args:
        seeds: Iterable of seed objects exposing ``title``, ``year``,
            ``weight`` and ``watch_count``.
        imdb: Index whose ``lookup`` returns a falsy value on a miss, or a
            3-tuple whose last element is a comma-separated genre string.
        watched_only: When true, seeds with ``watch_count <= 0`` are skipped.

    Returns:
        ``(genre -> accumulated weight, number of seeds that contributed)``.
    """
    totals: dict[str, float] = defaultdict(float)
    n_resolved = 0
    for seed in seeds:
        if watched_only and seed.watch_count <= 0:
            continue
        found = imdb.lookup(seed.title, seed.year)
        if not found:
            continue
        _, _, genre_csv = found
        tags = [tag for tag in genre_csv.split(",") if tag and tag not in WEAK_GENRES]
        if tags:
            n_resolved += 1
            # Equal split so a many-genre title does not outweigh a focused one.
            share = seed.weight / len(tags)
            for tag in tags:
                totals[tag] += share
    return totals, n_resolved
def _l2(vec: dict[str, float]) -> dict[str, float]:
norm = math.sqrt(sum(v * v for v in vec.values()))
return {k: v / norm for k, v in vec.items()} if norm else {}
def build_fingerprint(seeds, imdb: ImdbIndex, watched_alpha: float) -> dict[str, float]:
    """Build the blended, unit-length genre fingerprint of the user's taste.

    Two genre vectors are accumulated from *seeds*: one over every seed (the
    library) and one over watched seeds only. Each is L2-normalized on its
    own, then they are mixed as ``watched_alpha`` of the watched vector plus
    ``1 - watched_alpha`` of the library vector, and the mix is normalized
    again. Normalizing separately keeps a large library from drowning out
    the smaller watched signal.

    Progress lines are printed to stdout as a side effect.

    Returns:
        ``genre -> weight``; empty when no seed resolved to usable genres.
    """
    library_totals, n_resolved = _accumulate(seeds, imdb, watched_only=False)
    watched_totals, _ = _accumulate(seeds, imdb, watched_only=True)
    library_unit = _l2(library_totals)
    watched_unit = _l2(watched_totals)

    def _describe(weights: dict[str, float], limit: int) -> str:
        # Heaviest genres first, rendered as "Genre 0.12, Genre 0.08, ...".
        ranked = sorted(weights.items(), key=lambda pair: -pair[1])[:limit]
        return ", ".join(f"{g} {w:.2f}" for g, w in ranked)

    # Library first, then watched, so key insertion order matches the
    # order in which genres were first seen in the library vector.
    mix: dict[str, float] = defaultdict(float)
    for unit in (library_unit, watched_unit):
        watched = unit is watched_unit
        for genre, weight in unit.items():
            if watched:
                mix[genre] += watched_alpha * weight
            else:
                mix[genre] += (1.0 - watched_alpha) * weight
    fingerprint = _l2(mix)

    print(f" resolved {n_resolved}/{len(seeds)} seeds to IMDb genres", flush=True)
    if watched_unit:
        print(" watched taste: " + _describe(watched_unit, 6), flush=True)
    print(f" blended fingerprint (α={watched_alpha}): "
          + _describe(fingerprint, 8), flush=True)
    return fingerprint
def run(registry: Path, watchlog: Path, *, min_rating: float, min_votes: int,
        top_n: int, current_year: int, watched_alpha: float) -> list[Rec]:
    """Rank un-owned IMDb candidates against the user's genre fingerprint.

    Args:
        registry: Path to the library registry used for seeds and owned titles.
        watchlog: Path to the watch log used to weight seeds.
        min_rating: Rating gate passed to the candidate iterator; also the
            baseline for the quality boost.
        min_votes: Vote-count gate passed to the candidate iterator; also the
            baseline for the popularity boost. A gate of 0 is accepted.
        top_n: Maximum number of recommendations returned.
        current_year: Reference year; titles from the last six years get a
            small recency boost.
        watched_alpha: Blend factor forwarded to ``build_fingerprint``.

    Returns:
        Up to ``top_n`` recommendations, highest score first.

    Raises:
        SystemExit: If no seed resolved, so no fingerprint could be built.
    """
    print("Building IMDb index ...", flush=True)
    imdb = ImdbIndex(build_imdb(DATA_DIR))
    print("Building seeds + taste fingerprint ...", flush=True)
    seeds = build_seeds(registry, watchlog)
    # Normalize owned keys with the SAME function used on candidates so
    # punctuation/case differences (e.g. "Frieren: Beyond Journey's End") match.
    owned = {_norm(t) for t in owned_keys(registry)}
    owned |= {_norm(s.title) for s in seeds}
    fingerprint = build_fingerprint(seeds, imdb, watched_alpha)
    if not fingerprint:
        raise SystemExit("Could not build a genre fingerprint — no seeds resolved.")
    print(f"Scoring candidate universe (rating>={min_rating}, votes>={min_votes}) ...", flush=True)

    # log10 is undefined for values <= 0, so a votes gate of 0 (or a candidate
    # with 0 votes) used to raise "math domain error". Clamping to 1 maps
    # those cases to log10(1) == 0.0 and leaves every gate >= 1 unchanged.
    # The gate's log is loop-invariant, so compute it once.
    votes_gate_log = math.log10(max(min_votes, 1))

    recs: list[Rec] = []
    scanned = 0
    for display, year, ttype, genres, rating, votes in imdb.iter_candidates(min_rating, min_votes):
        scanned += 1
        if _norm(display) in owned:
            continue
        gs = [g for g in genres.split(",") if g and g not in WEAK_GENRES]
        if not gs:
            continue
        # Cosine-style similarity of candidate genre set to the fingerprint:
        # the candidate is a 0/1 genre vector of length sqrt(len(gs)) and the
        # fingerprint is already unit-length.
        matched = [g for g in gs if g in fingerprint]
        if len(matched) < 2:  # require real genre overlap, not a single tag
            continue
        cand_norm = math.sqrt(len(gs))
        sim = sum(fingerprint[g] for g in matched) / cand_norm
        # Quality + (gentle) popularity + recency shaping. Popularity is a small
        # noise-floor nudge above the votes gate, NOT a blockbuster magnet, so
        # genre-fit and rating drive ranking instead of raw fame.
        quality = (rating - min_rating) * 0.18
        popularity = max(0.0, math.log10(max(votes, 1)) - votes_gate_log) * 0.06
        recency = 0.15 if (year and year >= current_year - 6) else 0.0
        score = sim * (1.0 + quality + popularity + recency)
        recs.append(Rec(display, year, ttype, genres, rating, votes, score, matched))
    print(f" scanned {scanned} quality candidates", flush=True)
    # list.sort is stable, so equal scores keep candidate-iteration order.
    recs.sort(key=lambda r: r.score, reverse=True)
    return recs[:top_n]
def write_reports(recs: list[Rec], out_dir: Path) -> None:
    """Write the ranked list as JSON and as a Markdown table.

    Creates ``out_dir`` (including parents) if needed, then writes
    ``recommendations_local.json`` and ``recommendations_local.md`` into it
    and prints where the reports landed.

    Both files are written as UTF-8 regardless of the platform locale, so
    non-ASCII titles cannot fail or be garbled on non-UTF-8 systems.

    Args:
        recs: Recommendations in the order they should be listed.
        out_dir: Destination directory.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    payload = [
        {
            "title": r.title, "year": r.year, "type": r.type,
            "genres": r.genres, "imdb_rating": r.rating, "imdb_votes": r.votes,
            "score": round(r.score, 3), "matched_genres": r.matched,
        }
        for r in recs
    ]
    (out_dir / "recommendations_local.json").write_text(
        json.dumps(payload, indent=2), encoding="utf-8"
    )

    lines = ["# Recommended Downloads (local, IMDb-metadata)", "",
             f"Top {len(recs)} by genre-fingerprint match + IMDb quality. No TMDB used.", "",
             "| # | Title | Yr | Type | IMDb | Votes | Genres |",
             "|---|---|---|---|---|---|---|"]
    for i, r in enumerate(recs, 1):
        # A raw "|" in a title would be read as a column separator and shift
        # every following cell, so escape it for the table.
        title = r.title.replace("|", "\\|")
        lines.append(
            f"| {i} | {title} | {r.year or '?'} | {r.type} | {r.rating:.1f} | "
            f"{r.votes:,} | {r.genres} |"
        )
    (out_dir / "recommendations_local.md").write_text("\n".join(lines), encoding="utf-8")
    print(f"\nReports: {out_dir}/recommendations_local.md and .json")
def main() -> None:
    """Command-line entry point: parse options, rank, write reports, print a preview.

    ``--year`` is the reference year for the recency boost. It defaults to
    the current calendar year so the boost does not go stale; pass it
    explicitly to reproduce a ranking from an earlier run.
    """
    # Function-scope import keeps this block self-contained.
    from datetime import date

    ap = argparse.ArgumentParser()
    ap.add_argument("--registry", type=Path, required=True)
    ap.add_argument("--watchlog", type=Path,
                    default=Path.home() / ".local/state/tv-anarchy/watched.jsonl")
    ap.add_argument("--min-imdb-rating", type=float, default=7.5)
    ap.add_argument("--min-imdb-votes", type=int, default=5000)
    ap.add_argument("--top", type=int, default=40)
    ap.add_argument("--year", type=int, default=date.today().year,
                    help="reference year for the recency boost (default: current year)")
    ap.add_argument("--watched-alpha", type=float, default=0.65,
                    help="0=library-only taste, 1=watchlog-only taste")
    ap.add_argument("--out", type=Path, default=Path(__file__).parent / "out")
    args = ap.parse_args()

    recs = run(args.registry, args.watchlog, min_rating=args.min_imdb_rating,
               min_votes=args.min_imdb_votes, top_n=args.top, current_year=args.year,
               watched_alpha=args.watched_alpha)
    write_reports(recs, args.out)

    # Console preview of the head of the list; the full list is in the reports.
    print("\nTop 20:")
    for i, r in enumerate(recs[:20], 1):
        print(f" {i:>2}. {r.title} ({r.year or '?'}) [{r.type}] — IMDb {r.rating:.1f} "
              f"({r.votes:,}) — {r.genres}")


if __name__ == "__main__":
    main()