136 lines
7.3 KiB
Swift
136 lines
7.3 KiB
Swift
import Foundation
|
||
|
||
/// The final-reasoning seam for folding split or duplicate library entries of one
/// show into a single work (the Dandadan problem).
///
/// The seam is provider-agnostic. The cheap pass (`ShowGrouping.candidateClusters`)
/// blocks entries into candidate clusters with a trivial algorithm; a conforming
/// reasoner then makes the *final* same-work call, and only on the ambiguous
/// multi-entry clusters. A reasoner may be backed by a local LLM (the MLX
/// `TitleRefiner`'s bigger sibling), by a TMDB/enrich lookup, or by both.
/// `ShowGrouping.grouper` is unassigned by default, so the cheap pass stands alone
/// until a backend is plugged in.
public protocol ShowGrouper: Sendable {
    /// Decides which entries of a candidate cluster are truly the same work and
    /// what the canonical title of that work is.
    ///
    /// - Parameter cluster: The entries that share one candidate cluster.
    /// - Returns: The same-work decision, or `nil` for "no opinion", in which case
    ///   the caller keeps the cheap result.
    func resolve(cluster: [CachedShow]) -> ShowGroupDecision?
}
|
||
|
||
/// A reasoner's verdict for one candidate cluster: the title to use and the
/// entries that belong to the same work.
public struct ShowGroupDecision: Sendable, Equatable {
    /// The title to use for the combined work.
    public let canonicalTitle: String

    /// The rootDirs that are the SAME work. Other entries of the cluster stay
    /// separate — e.g. an anime and its live-action remake that merely share a title.
    public let sameWork: [String]

    /// Creates a decision from a canonical title and the rootDirs judged to be
    /// the same work.
    public init(canonicalTitle: String, sameWork: [String]) {
        self.canonicalTitle = canonicalTitle
        self.sameWork = sameWork
    }
}
|
||
|
||
/// The shipped default reasoner — **deterministic, zero MB, no model**.
///
/// The same-work signal is structural, not semantic: within a candidate cluster
/// (already same canonical title), entries are the same work when their release
/// years are close (duplicate releases + split seasons of one show); a
/// year-distant entry is a different work (a remake) and stays separate. Handles
/// every real case (Daria, Dandadan dupes, Bridgerton splits) without an 839 MB LLM
/// to ship; the generative `LocalLLMGrouper` is an optional download for the rare
/// same-year ambiguous tail.
public struct DeterministicGrouper: ShowGrouper {
    /// Largest release-year difference (inclusive) at which two entries still
    /// count as the same work.
    public var maxYearGap: Int

    /// Creates a grouper that treats entries at most `maxYearGap` years apart as
    /// the same work.
    public init(maxYearGap: Int = 4) { self.maxYearGap = maxYearGap }

    /// Picks the same-work set of a candidate cluster from release years alone.
    ///
    /// Anchor years are tried from earliest to latest. The first anchor whose
    /// window `[anchor, anchor + maxYearGap]` — plus every entry without a year —
    /// holds at least two entries wins. Trying later anchors matters when the
    /// earliest entry is a lone, year-distant work: duplicates of a later work in
    /// the same cluster are then still combined instead of being left split.
    ///
    /// - Parameter cluster: Entries sharing one candidate cluster.
    /// - Returns: The canonical title and the rootDirs to combine, or `nil` when
    ///   the cluster has fewer than two entries or no two entries qualify.
    public func resolve(cluster: [CachedShow]) -> ShowGroupDecision? {
        guard cluster.count > 1 else { return nil }

        let knownYears = cluster.compactMap(\.year).sorted()
        var same: [CachedShow] = []
        if knownYears.isEmpty {
            // No entry carries a year, so there is nothing to tell them apart.
            same = cluster
        } else {
            for anchor in knownYears {
                let window = cluster.filter { show in
                    // Entries without a year join whichever anchor is being tried.
                    guard let year = show.year else { return true }
                    return year >= anchor && year - anchor <= maxYearGap
                }
                if window.count >= 2 {
                    same = window
                    break
                }
            }
        }
        guard same.count >= 2 else { return nil } // nothing to combine → leave as-is

        // Canonical title: the cleanest member name (shortest after release-noise
        // stripping — "DANDADAN" beats "DAN.Da.Dan.S01.1080p…"). Blank results are
        // skipped so an empty string can never win the "shortest" comparison.
        let title = same.map { LibraryScanner.normalizeShowName($0.name) }
            .filter { !$0.isEmpty }
            .min { $0.count < $1.count } ?? same[0].name
        return ShowGroupDecision(canonicalTitle: title, sameWork: same.map(\.rootDir))
    }
}
|
||
|
||
/// Namespace for the show-grouping pipeline: cheap candidate clustering, the
/// optional final reasoner, and the merge of same-work entries into one show.
public enum ShowGrouping {
    /// Optional final reasoner for the messy tail (LLM/TMDB). Off by default.
    public static var grouper: (any ShowGrouper)?

    /// CHEAP stage — "match all similar with an easy algorithm." Block shows into
    /// candidate clusters by a canonical key, so spacing/punctuation variants land
    /// together ("DAN DA DAN" / "DANDADAN" / "Dan.Da.Dan" → one cluster). Order of
    /// first appearance is preserved. Each multi-entry cluster is a *candidate* the
    /// reasoner (or metadata) makes the final same-work call on.
    ///
    /// - Parameter shows: The library entries to block.
    /// - Returns: One array per distinct canonical key, in first-appearance order.
    public static func candidateClusters(_ shows: [CachedShow]) -> [[CachedShow]] {
        var byKey: [String: [CachedShow]] = [:]
        var order: [String] = []
        for show in shows {
            let key = canonicalKey(show.name)
            if byKey[key] == nil { order.append(key) }
            byKey[key, default: []].append(show)
        }
        // Every key in `order` was inserted into `byKey` above, so nothing is dropped.
        return order.compactMap { byKey[$0] }
    }

    /// The multi-entry clusters — the only ones worth the reasoner's time (a lone
    /// entry is unambiguously its own show). This is the cheap→LLM hand-off boundary.
    public static func ambiguousClusters(_ shows: [CachedShow]) -> [[CachedShow]] {
        candidateClusters(shows).filter { $0.count > 1 }
    }

    /// Canonical blocking key: lowercase, alphanumerics only — collapses spacing and
    /// punctuation so title variants share a key. Deliberately loose: it OVER-groups
    /// (cheap recall), and the reasoner trims false positives (precision).
    ///
    /// A name with no alphanumeric scalars at all maps to the empty key.
    public static func canonicalKey(_ name: String) -> String {
        String(name.lowercased().unicodeScalars.filter(CharacterSet.alphanumerics.contains))
    }

    /// Apply grouping: cluster, ask `decide` to resolve each multi-entry cluster
    /// into a same-work set, and merge those entries into one combined show (the
    /// rest stay separate). `decide` is the seam (LLM/cache/test fake); a `nil`
    /// answer leaves the cluster untouched. Deterministic guard: entries whose
    /// release years differ by more than `maxYearGap` are never merged even if the
    /// reasoner says so (a small model can over-merge a remake — see grouper.py).
    ///
    /// Per resolved cluster the output holds the year-gap outliers, then the merged
    /// show (or the single remaining same-work entry), then the entries the
    /// decision left out. No input entry is ever emitted twice.
    ///
    /// - Parameters:
    ///   - shows: The library entries to group.
    ///   - maxYearGap: Largest allowed distance from the earliest year in the
    ///     same-work set.
    ///   - decide: Resolves one multi-entry cluster, or returns `nil` for no opinion.
    /// - Returns: The shows with same-work entries combined.
    public static func combine(_ shows: [CachedShow], maxYearGap: Int = 4,
                               decide: ([CachedShow]) -> ShowGroupDecision?) -> [CachedShow] {
        var out: [CachedShow] = []
        for cluster in candidateClusters(shows) {
            guard cluster.count > 1, let decision = decide(cluster) else {
                out.append(contentsOf: cluster)
                continue
            }
            let sameSet = Set(decision.sameWork)
            let claimed = cluster.filter { sameSet.contains($0.rootDir) }
            let rest = cluster.filter { !sameSet.contains($0.rootDir) }

            // Year-gap guard: drop outliers the reasoner wrongly absorbed.
            var same: [CachedShow] = []
            var outliers: [CachedShow] = []
            if let base = claimed.compactMap(\.year).min() {
                for show in claimed {
                    if (show.year ?? base) - base > maxYearGap {
                        outliers.append(show)
                    } else {
                        same.append(show)
                    }
                }
            } else {
                same = claimed
            }
            out.append(contentsOf: outliers)

            if same.count >= 2 {
                out.append(merge(same, title: decision.canonicalTitle))
            } else {
                // Zero or one entry left: pass it through as-is. When the decision
                // matched nothing, every entry is already in `rest`, so appending a
                // fallback entry here would duplicate it in the output.
                out.append(contentsOf: same)
            }
            out.append(contentsOf: rest)
        }
        return out
    }

    /// Combine same-work entries into one show: union episodes, dedup by
    /// season×episode (alternate releases stay reachable via the quality switcher),
    /// earliest year, first non-nil poster/overview, newest addedAt. Specials (s0) last.
    ///
    /// Entries are visited in ascending `rootDir` order, so for a duplicated
    /// season×episode the first one met in that order is kept. The combined show
    /// takes its `rootDir` and `category` from `shows[0]`, and falls back to that
    /// entry's name when `title` is empty.
    ///
    /// - Parameters:
    ///   - shows: The entries to combine; must not be empty.
    ///   - title: The canonical title for the combined show.
    /// - Returns: The combined show.
    static func merge(_ shows: [CachedShow], title: String) -> CachedShow {
        precondition(!shows.isEmpty, "merge(_:title:) requires at least one show")

        // A structured key cannot collide. The arithmetic `season * 1000 + episode`
        // maps S01E1001 and S02E001 to the same number and would drop one of them.
        struct EpisodeKey: Hashable {
            let season: Int
            let episode: Int
        }

        var eps: [CachedEpisode] = []
        var seen = Set<EpisodeKey>()
        for show in shows.sorted(by: { $0.rootDir < $1.rootDir }) {
            for episode in show.episodes
            where seen.insert(EpisodeKey(season: episode.season, episode: episode.episode)).inserted {
                eps.append(episode)
            }
        }
        eps.sort {
            (CachedShow.seasonRank($0.season), $0.episode) < (CachedShow.seasonRank($1.season), $1.episode)
        }

        let first = shows[0]
        return CachedShow(name: title.isEmpty ? first.name : title, rootDir: first.rootDir,
                          category: first.category, kind: .series,
                          posterPath: shows.compactMap(\.posterPath).first,
                          overview: shows.compactMap(\.overview).first,
                          episodes: eps, year: shows.compactMap(\.year).min(),
                          addedAt: shows.compactMap(\.addedAt).max())
    }
}
|