tv-anarchy/Sources/TVAnarchyCore/Library/ShowGrouping.swift

136 lines
7.3 KiB
Swift
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import Foundation
/// Final-reasoning seam for combining split/duplicate library entries of one show
/// into a single work (the Dandadan problem). Provider-agnostic: the cheap pass
/// (`ShowGrouping.candidateClusters`) blocks entries into candidate clusters with a
/// trivial algorithm; this reasoner makes the *final* same-work call only on the
/// ambiguous multi-entry clusters. It may be backed by a local LLM (the MLX
/// `TitleRefiner`'s bigger sibling), by a TMDB/enrich lookup, or by both.
/// `ShowGrouping.grouper` is unassigned by default, so the cheap pass stands alone
/// until a backend is plugged in.
public protocol ShowGrouper: Sendable {
/// Decides which entries in a candidate cluster are truly the same work, and the
/// canonical title to give them.
///
/// - Parameter cluster: Entries that share a candidate cluster.
/// - Returns: The same-work decision, or `nil` for "no opinion" (the caller keeps
///   the cheap result).
func resolve(cluster: [CachedShow]) -> ShowGroupDecision?
}
/// Outcome of a same-work decision for one candidate cluster.
public struct ShowGroupDecision: Sendable, Equatable {
    /// Title to use for the combined work.
    public let canonicalTitle: String

    /// `rootDir`s of the entries that are the SAME work. Other entries in the
    /// cluster stay separate, e.g. an anime and its live-action remake that merely
    /// share a title.
    public let sameWork: [String]

    /// Creates a decision from a canonical title and the same-work `rootDir`s.
    public init(canonicalTitle: String, sameWork: [String]) {
        self.canonicalTitle = canonicalTitle
        self.sameWork = sameWork
    }
}
/// The shipped default reasoner: **deterministic, zero MB, no model**.
///
/// The same-work signal is structural, not semantic: within a candidate cluster
/// (already the same canonical title), entries are the same work when their release
/// years are close (duplicate releases and split seasons of one show); a
/// year-distant entry is a different work (a remake) and stays separate. Intended
/// to handle the common cases (Daria, Dandadan dupes, Bridgerton splits) without
/// shipping an 839 MB LLM; the generative `LocalLLMGrouper` is an optional download
/// for the rare same-year ambiguous tail.
public struct DeterministicGrouper: ShowGrouper {
    /// Largest allowed distance, in years, between an entry's release year and the
    /// earliest release year in the cluster for the entry to count as the same work.
    public var maxYearGap: Int

    /// Creates a grouper with the given year tolerance (4 years by default).
    public init(maxYearGap: Int = 4) {
        self.maxYearGap = maxYearGap
    }

    /// Resolves a candidate cluster by release-year proximity.
    ///
    /// - Parameter cluster: Entries that share a candidate cluster.
    /// - Returns: A decision naming the entries within `maxYearGap` of the earliest
    ///   known year (entries without a year always join), or `nil` when the cluster
    ///   has fewer than two entries or fewer than two entries qualify.
    public func resolve(cluster: [CachedShow]) -> ShowGroupDecision? {
        guard cluster.count > 1 else { return nil }

        // Anchor on the earliest release year present; entries with no year join the anchor.
        let anchorYear = cluster.compactMap(\.year).min()
        let sameWork = cluster.filter { show in
            guard let year = show.year, let anchor = anchorYear else { return true }
            return year - anchor <= maxYearGap
        }
        guard sameWork.count >= 2 else { return nil } // nothing to combine, leave as-is

        // Canonical title: the cleanest member name, i.e. the shortest one after
        // release-noise stripping ("DANDADAN" beats "DAN.Da.Dan.S01.1080p").
        // Names that normalize to nothing are skipped so an empty string can never
        // win the "shortest" contest; if every name normalizes to empty, fall back
        // to the first member's raw name.
        let title = sameWork
            .map { LibraryScanner.normalizeShowName($0.name) }
            .filter { !$0.isEmpty }
            .min { $0.count < $1.count } ?? sameWork[0].name

        return ShowGroupDecision(canonicalTitle: title, sameWork: sameWork.map(\.rootDir))
    }
}
/// Namespace for the two-stage show grouping pipeline: a cheap blocking pass that
/// builds candidate clusters, and a `combine` step that merges the entries a
/// reasoner judges to be the same work.
public enum ShowGrouping {
    /// Optional final reasoner for the messy tail (LLM/TMDB). `nil` (off) by default.
    public static var grouper: (any ShowGrouper)?

    /// Identity of an episode within a show, used to de-duplicate while merging.
    /// A structured key (rather than arithmetic such as `season * 1000 + episode`)
    /// cannot collide, whatever the episode number.
    private struct EpisodeKey: Hashable {
        let season: Int
        let episode: Int
    }

    /// CHEAP stage: "match all similar with an easy algorithm."
    ///
    /// Blocks shows into candidate clusters by `canonicalKey`, so spacing and
    /// punctuation variants land together ("DAN DA DAN" / "DANDADAN" / "Dan.Da.Dan"
    /// all end up in one cluster). Each multi-entry cluster is only a *candidate*;
    /// the reasoner (or metadata) makes the final same-work call.
    ///
    /// - Parameter shows: Library entries to block.
    /// - Returns: Clusters in order of first appearance of their key; entries keep
    ///   their input order within a cluster.
    public static func candidateClusters(_ shows: [CachedShow]) -> [[CachedShow]] {
        var slotForKey: [String: Int] = [:]
        var clusters: [[CachedShow]] = []
        for show in shows {
            let key = canonicalKey(show.name)
            if let slot = slotForKey[key] {
                clusters[slot].append(show)
            } else {
                // First time this key is seen: open a new cluster at the end so
                // first-appearance order is preserved.
                slotForKey[key] = clusters.count
                clusters.append([show])
            }
        }
        return clusters
    }

    /// The multi-entry clusters: the only ones worth the reasoner's time (a lone
    /// entry is unambiguously its own show). This is the cheap-to-LLM hand-off
    /// boundary.
    public static func ambiguousClusters(_ shows: [CachedShow]) -> [[CachedShow]] {
        candidateClusters(shows).filter { $0.count > 1 }
    }

    /// Canonical blocking key: lowercased, alphanumeric Unicode scalars only.
    ///
    /// Collapses spacing and punctuation so title variants share a key.
    /// Deliberately loose: it OVER-groups (cheap recall), and the reasoner trims
    /// false positives (precision).
    public static func canonicalKey(_ name: String) -> String {
        String(name.lowercased().unicodeScalars.filter(CharacterSet.alphanumerics.contains))
    }

    /// Applies grouping: clusters the shows, asks `decide` to resolve each
    /// multi-entry cluster into a same-work set, and merges those entries into one
    /// combined show while the rest stay separate.
    ///
    /// Deterministic guard: entries whose release year is more than `maxYearGap`
    /// after the earliest year in the same-work set are never merged, even if the
    /// reasoner says so (a small model can over-merge a remake; see grouper.py).
    ///
    /// - Parameters:
    ///   - shows: Library entries to group.
    ///   - maxYearGap: Maximum year distance from the earliest same-work entry.
    ///   - decide: The decision seam (LLM/cache/test fake). A `nil` answer leaves
    ///     the cluster untouched.
    /// - Returns: The shows with same-work entries merged. Every input entry is
    ///   represented exactly once, either as itself or inside a merged show.
    public static func combine(_ shows: [CachedShow], maxYearGap: Int = 4,
                               decide: ([CachedShow]) -> ShowGroupDecision?) -> [CachedShow] {
        var out: [CachedShow] = []
        for cluster in candidateClusters(shows) {
            guard cluster.count > 1, let decision = decide(cluster) else {
                out.append(contentsOf: cluster)
                continue
            }

            let sameSet = Set(decision.sameWork)
            var same = cluster.filter { sameSet.contains($0.rootDir) }
            let rest = cluster.filter { !sameSet.contains($0.rootDir) }

            // Year-gap guard: split off outliers the reasoner wrongly absorbed.
            // Entries without a year are treated as sitting on the base year.
            if let base = same.compactMap(\.year).min() {
                let isOutlier: (CachedShow) -> Bool = { ($0.year ?? base) - base > maxYearGap }
                out.append(contentsOf: same.filter(isOutlier))
                same.removeAll(where: isOutlier)
            }

            if same.count >= 2 {
                out.append(merge(same, title: decision.canonicalTitle))
            } else {
                // Zero or one survivor: emit it as-is. When the decision named no
                // entry of this cluster, `same` is empty and every entry is already
                // in `rest`, so nothing extra may be emitted here (emitting
                // `cluster[0]` would duplicate it).
                out.append(contentsOf: same)
            }
            out.append(contentsOf: rest)
        }
        return out
    }

    /// Combines same-work entries into one show.
    ///
    /// Episodes are unioned and de-duplicated by (season, episode); when several
    /// entries carry the same episode, the one from the entry with the smallest
    /// `rootDir` wins. Episodes are then ordered by `CachedShow.seasonRank` and
    /// episode number (the rank is expected to place specials, season 0, last).
    /// The merged show takes the name, `rootDir` and category from the first input
    /// entry (name only when `title` is empty), the earliest year, the first
    /// non-nil poster/overview, and the newest `addedAt`.
    ///
    /// - Parameters:
    ///   - shows: Entries to merge; must not be empty.
    ///   - title: Canonical title for the merged show; when empty, the first
    ///     entry's name is used.
    /// - Returns: The combined show, always of kind `.series`.
    static func merge(_ shows: [CachedShow], title: String) -> CachedShow {
        precondition(!shows.isEmpty, "merge requires at least one show")

        var seen = Set<EpisodeKey>()
        var episodes: [CachedEpisode] = []
        for show in shows.sorted(by: { $0.rootDir < $1.rootDir }) {
            for episode in show.episodes
            where seen.insert(EpisodeKey(season: episode.season, episode: episode.episode)).inserted {
                episodes.append(episode)
            }
        }
        episodes.sort {
            (CachedShow.seasonRank($0.season), $0.episode) < (CachedShow.seasonRank($1.season), $1.episode)
        }

        let first = shows[0]
        return CachedShow(name: title.isEmpty ? first.name : title, rootDir: first.rootDir,
                          category: first.category, kind: .series,
                          posterPath: shows.compactMap(\.posterPath).first,
                          overview: shows.compactMap(\.overview).first,
                          episodes: episodes, year: shows.compactMap(\.year).min(),
                          addedAt: shows.compactMap(\.addedAt).max())
    }
}