#!/bin/sh
# disk-reclaim [path] [--min SIZE] [--all] [--no-summary]
#
# Scan <path> (default $HOME) for generated/cache directories worth deleting.
# Read-only — never deletes. Reports dirs that regenerate from source (build
# outputs, dependency caches, IDE/framework state) sorted by size desc.
#
# Flags:
#   --min SIZE     only show entries >= SIZE (e.g. 100M, 1G; default 100M);
#                  suffix is K, M, G or T — a bare number is taken as bytes
#   --all          alias for --min 0
#   --no-summary   skip the totals-per-category section
#
# Patterns it looks for (project-scoped, found via find):
#   JS/TS:    node_modules, .next, .nuxt, .turbo, .vite, .parcel-cache,
#             .svelte-kit, .astro, .cache, dist, build, out
#   Python:   __pycache__, .pytest_cache, .mypy_cache, .ruff_cache, .tox, .venv
#   Rust:     target
#   Other:    _build, Pods, DerivedData, .gradle, .android
#
# Plus top-level cache roots checked once each:
#   ~/Library/Caches, ~/Library/Developer/Xcode/DerivedData
#   ~/.cache, ~/.npm, ~/.pnpm-store, ~/.yarn/cache
#   ~/.cargo/registry, ~/.cargo/git
#
# Caveats:
#   - .venv requires a rebuild from pyproject/requirements after deletion
#   - target (Rust) requires a recompile that can take minutes
#   - node_modules needs npm/pnpm install
#   - vendor/ is intentionally NOT scanned — often committed (Go) or required (PHP)

# Strict mode: stop on an unhandled command failure (-e) and on any use of an
# unset variable (-u).
set -e -u

# Defaults; each may be overridden by the command-line loop further down.
show_summary=1      # 1 = print the totals-by-category section
min_human=100M      # size threshold as the user would type it
root="$HOME"        # directory tree to scan

# die MSG... — write "disk-reclaim: MSG" to stderr and exit with status 1.
# printf is used instead of echo so that backslashes in MSG (for example from
# a user-supplied path) are printed literally; some /bin/sh echo builtins
# would expand them as escape sequences.
die() {
    printf 'disk-reclaim: %s\n' "$*" >&2
    exit 1
}

# usage — print this script's own header comment as help text, then exit 2.
# The help text is lines 2 through the first empty line of "$0", with the
# leading "# " (or a bare "#") removed from each line.
usage() {
    # A sed failure must not change the exit status, hence the "|| :".
    sed -n '2,/^$/{
        s/^# \{0,1\}//
        p
    }' "$0" || :
    exit 2
}

# to_kb SIZE — print SIZE converted to whole kilobytes.
#   SIZE is a non-negative decimal integer with an optional K, M, G or T
#   suffix (either case, powers of 1024). Without a suffix it is a byte count
#   and is rounded down to whole KB.
#   Anything else (empty, a bare suffix, "1.5G", "abcK", a negative number)
#   is reported through die.
# Intended to be called as $(to_kb ...); the _tk_* scratch variables are
# prefixed because POSIX sh has no function-local variables.
to_kb() {
    _tk_num=$1
    _tk_mul=1
    _tk_div=1
    case "$1" in
        *[Kk]) _tk_num=${1%?} ;;
        *[Mm]) _tk_num=${1%?}; _tk_mul=1024 ;;
        *[Gg]) _tk_num=${1%?}; _tk_mul=$(( 1024 * 1024 )) ;;
        *[Tt]) _tk_num=${1%?}; _tk_mul=$(( 1024 * 1024 * 1024 )) ;;
        *)     _tk_div=1024 ;;
    esac

    # Validate the numeric part on every path, not only for suffix-less input.
    case "$_tk_num" in
        ''|*[!0-9]*) die "bad size: $1 (use K/M/G/T suffix or plain bytes)" ;;
    esac

    # Strip leading zeros: inside $(( )) a leading 0 means octal, so "010"
    # would become 8 and "08" would be a syntax error.
    while :; do
        case "$_tk_num" in
            0?*) _tk_num=${_tk_num#0} ;;
            *)   break ;;
        esac
    done

    echo $(( _tk_num * _tk_mul / _tk_div ))
}

# human KB — print a kilobyte count in compact form, with no trailing newline:
#   below 1024 KB      -> "<n>K"
#   below 1048576 KB   -> "<n>M"   (rounded to a whole number)
#   otherwise          -> "<n.n>G" (one decimal place)
human() {
    awk -v kb="$1" 'BEGIN {
        mib = 1024
        gib = 1024 * 1024
        if (kb < mib)
            printf "%dK", kb
        else if (kb < gib)
            printf "%.0fM", kb / mib
        else
            printf "%.1fG", kb / gib
    }'
}

# Command-line parsing. Each pass takes one argument off the front; --min
# additionally consumes the following argument as its value. Any argument
# that does not start with "-" (and is not the word "help") becomes the scan
# root; a later one replaces an earlier one.
while [ $# -gt 0 ]; do
    arg=$1
    shift
    case "$arg" in
        -h|--help|help)
            usage
            ;;
        --min)
            [ $# -ge 1 ] || die "--min needs a value"
            min_human=$1
            shift
            ;;
        --min=*)
            min_human=${arg#--min=}
            ;;
        --all)
            min_human=0
            ;;
        --no-summary)
            show_summary=0
            ;;
        -*)
            die "unknown flag: $arg"
            ;;
        *)
            root=$arg
            ;;
    esac
done

# Validate the inputs before scanning: the root must be a directory and the
# threshold must parse (to_kb reports a bad size itself).
[ -d "$root" ] || die "not a directory: $root"
min_kb=$(to_kb "$min_human")

# Resolve the root to a physical absolute path. CDPATH is cleared and cd's
# stdout discarded because, when CDPATH is set, cd may print the directory it
# selected, and that text would be captured ahead of pwd's output. A cd
# failure (e.g. no search permission) is reported instead of exiting silently.
scan_root=$(CDPATH='' cd -- "$root" >/dev/null && pwd -P) \
    || die "cannot enter directory: $root"

# Directory basenames treated as regenerable output (space-separated).
patterns="node_modules .next .nuxt .turbo .vite .parcel-cache .svelte-kit .astro .cache dist build out __pycache__ .pytest_cache .mypy_cache .ruff_cache .tox .venv target _build Pods DerivedData .gradle .android"

# Turn the list into find's " -name A -o -name B ..." OR-chain. printf reuses
# its format once per word of $patterns (deliberately unquoted so it splits);
# the dangling " -o" left after the last name is trimmed afterwards.
# shellcheck disable=SC2086
expr=$(printf ' -name %s -o' $patterns)
expr=${expr% -o}

echo "scanning $scan_root (min size: $(human "$min_kb"))..."
echo

# Project-scoped scan.
#   find  prints every directory whose basename is in the pattern list; -prune
#         stops it descending into a match, so nested matches (e.g. a target/
#         inside node_modules) are not reported separately. Its stderr is
#         discarded to hide permission-denied noise.
#   loop  measures each hit with du -sk and drops hits whose size could not
#         be read or is below $min_kb.
#   sort  orders the "<kb><TAB><path>" lines largest first.
# shellcheck disable=SC2086
results=$(
    find "$scan_root" -type d \( $expr \) -prune -print 2>/dev/null |
        while IFS= read -r hit; do
            size_kb=$(du -sk "$hit" 2>/dev/null | awk '{ print $1 }')
            [ -n "$size_kb" ] || continue
            if [ "$size_kb" -lt "$min_kb" ]; then
                continue
            fi
            printf '%s\t%s\n' "$size_kb" "$hit"
        done |
        sort -rn
)

# Render the project-scoped hits as a right-aligned SIZE / PATH table, or a
# placeholder line when nothing met the threshold.
if [ -z "$results" ]; then
    echo "  (no project-scoped entries >= $(human "$min_kb"))"
else
    printf '  %8s  %s\n' "SIZE" "PATH"
    printf '  %8s  %s\n' "----" "----"
    # printf rather than echo: some /bin/sh echo builtins expand backslash
    # escapes, which would corrupt any path that contains a backslash.
    # IFS is a single tab so the path (the last field) keeps its spaces.
    printf '%s\n' "$results" | while IFS="$(printf '\t')" read -r kb path; do
        printf '  %8s  %s\n' "$(human "$kb")" "$path"
    done
fi

echo
echo "top-level cache roots:"
# Fixed per-user cache locations under $HOME, each measured once with du -sk.
# Locations that do not exist, cannot be measured, or are below $min_kb are
# skipped. Output lines are "<kb><TAB><path>", largest first.
cache_results=$(
    for p in \
        "$HOME/Library/Caches" \
        "$HOME/Library/Developer/Xcode/DerivedData" \
        "$HOME/.cache" \
        "$HOME/.npm" \
        "$HOME/.pnpm-store" \
        "$HOME/.yarn/cache" \
        "$HOME/.cargo/registry" \
        "$HOME/.cargo/git"
    do
        [ -d "$p" ] || continue
        kb=$(du -sk "$p" 2>/dev/null | awk '{print $1}')
        [ -z "$kb" ] && continue
        [ "$kb" -lt "$min_kb" ] && continue
        printf '%s\t%s\n' "$kb" "$p"
    done | sort -rn
)
if [ -z "$cache_results" ]; then
    echo "  (none >= $(human "$min_kb"))"
else
    # printf rather than echo: some /bin/sh echo builtins expand backslash
    # escapes, which would corrupt a $HOME path that contains a backslash.
    printf '%s\n' "$cache_results" | while IFS="$(printf '\t')" read -r kb path; do
        printf '  %8s  %s\n' "$(human "$kb")" "$path"
    done
fi

# Totals per category (category = the matched directory's basename), largest
# first. Skipped with --no-summary or when the scan found nothing.
if [ "$show_summary" = 1 ] && [ -n "$results" ]; then
    echo
    echo "totals by category:"
    # One awk pass over the "<kb><TAB><path>" lines instead of one awk run per
    # pattern. The basename is the text after the last "/" of the line, and
    # only basenames listed in $patterns are counted. Zero-size categories are
    # dropped. %.0f prints large sums without integer clamping.
    totals=$(
        printf '%s\n' "$results" | awk -F'\t' -v pats="$patterns" '
            BEGIN {
                n = split(pats, p, " ")
                for (i = 1; i <= n; i++) want[p[i]] = 1
            }
            {
                k = split($0, part, "/")
                if (part[k] in want) sum[part[k]] += $1
            }
            END {
                for (name in sum)
                    if (sum[name] > 0) printf "%.0f\t%s\n", sum[name], name
            }
        ' | sort -rn
    )
    # Guard: if every match was 0 KB (possible with --all) there is nothing
    # to list; piping an empty string would otherwise print a bogus "0K" row.
    if [ -n "$totals" ]; then
        printf '%s\n' "$totals" | while IFS="$(printf '\t')" read -r kb name; do
            printf '  %8s  %s\n' "$(human "$kb")" "$name"
        done
    fi
fi

# Closing reminder, preceded by a blank line: this script only reports, and
# deleting anything is left to the user.
printf '\n'
printf '%s\n' "review carefully before rm -rf. some dirs (.venv, target, node_modules) need a rebuild after deletion."
