#!/bin/sh
# rvoice — push-to-talk dictation for remote rclaude sessions.
#
# Designed for the case where claude runs on another host (apricot) and the
# mic + keyboard are on the local Mac. /voice doesn't work over ssh because
# the claude binary tries to open the *remote* host's microphone. This
# helper records locally, transcribes via apricot's LAN speech-synthesis
# service (Whisper, GPU-accelerated, no external API dependency), and
# injects the transcript into the active remote tmux session via
# `tmux send-keys` over ssh.
#
# Usage:
#   rvoice start        Begin recording (called by Hammerspoon on key-down)
#   rvoice stop         Stop, transcribe, inject (called by Hammerspoon on key-up)
#   rvoice cancel       Stop without transcribing (key-up after very short hold)
#   rvoice target       Resolve and echo `<host>\t<tmux-session>` for the
#                       active iTerm2 tab (debug)
#   rvoice log          Show the last 50 lines of the rvoice log (debug)
#
# Config: $XDG_CONFIG_HOME/rvoice/config (sourced; same convention as rclaude).
# Transcription uses the LAN-resident speech-synthesis service on apricot
# (Whisper, GPU-accelerated, no external API dependency).
#
# Optional env:
#   RVOICE_STT_URL=http://apricot.lan:8000   (speech-synthesis service base URL)
#   RVOICE_MODEL=base                         (tiny|base|small|medium|large-v2|large-v3)
#   RVOICE_LANG=en                            (language hint sent to the service; default en)
#   RVOICE_HOST=apricot.lan                   (overrides iTerm2 detection)
#   RVOICE_SESSION=claude-natalie-...         (overrides iTerm2 detection)
#   RVOICE_AUTOSEND=1                         (append Enter; default 0)
#   RVOICE_MIN_MS=200                         (ignore taps shorter than this)
#   RVOICE_MAX_S=60                           (hard cap on recording length)
#   RVOICE_AUDIO_INPUT=":default"             (avfoundation input; numeric
#                                              index or "default". List devices
#                                              with: ffmpeg -f avfoundation
#                                              -list_devices true -i "")
#
# State lives in $TMPDIR/rvoice/ — one recording at a time.

# Strict mode: abort on unhandled command failures (-e) and on expansion of
# unset variables (-u).
set -eu

# The user config is optional. When readable it is sourced, so it may set any
# of the RVOICE_* variables consumed below.
CONF_DIR=${XDG_CONFIG_HOME:-$HOME/.config}/rvoice
if [ -r "$CONF_DIR/config" ]; then
    . "$CONF_DIR/config"
fi

STT_URL=${RVOICE_STT_URL:-http://apricot.lan:8000}
MODEL=${RVOICE_MODEL:-base}
# Language hint. Unset RVOICE_LANG → "en". `-` is used instead of `:-` so an
# explicitly empty RVOICE_LANG= stays empty; cmd_stop then omits the
# `language` form field (its `${LANG_HINT:+…}` guard) and the service can
# auto-detect. With `:-` that guard could never trigger.
LANG_HINT=${RVOICE_LANG-en}
AUTOSEND=${RVOICE_AUTOSEND:-0}
MIN_MS=${RVOICE_MIN_MS:-200}
MAX_S=${RVOICE_MAX_S:-60}
# avfoundation input spec. ":default" → macOS system default input
# (controlled via Sound Settings / Control Center → Sound). Numeric ":N"
# pins to a specific device index from `ffmpeg -f avfoundation -list_devices`.
AUDIO_INPUT=${RVOICE_AUDIO_INPUT:-:default}

# Runtime state (one recording at a time): recorder pid, captured audio,
# key-down timestamp in ms, and the append-only log.
STATE_DIR=${TMPDIR:-/tmp}/rvoice
mkdir -p "$STATE_DIR"
PID_FILE=$STATE_DIR/ffmpeg.pid
WAV_FILE=$STATE_DIR/recording.wav
START_FILE=$STATE_DIR/start-ms
LOG_FILE=$STATE_DIR/log

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Append one line of the form "[rvoice HH:MM:SS] <message>" to $LOG_FILE.
# The message is all arguments joined via "$*".
log() {
    printf '[rvoice %s] %s\n' "$(date +%H:%M:%S)" "$*" >>"$LOG_FILE"
}

# Print the current wall-clock time as whole milliseconds since the Unix
# epoch. Delegates to python3 (presumably because BSD `date` has no
# sub-second format — confirm before swapping implementations).
now_ms() {
    python3 -c 'import time; print(int(time.time() * 1000))'
}

# Print the injection target as "<host>\t<tmux-session>" on stdout.
#
# Resolution order:
#   1. RVOICE_HOST + RVOICE_SESSION, when BOTH are non-empty.
#   2. The name of the active iTerm2 session, expected to follow the title
#      format from session-tools/tmux.conf: "<host> · <session>".
#   3. Fallback: host is $RVOICE_HOST or "apricot.lan"; session is
#      $RVOICE_SESSION or, failing that, the most recently created tmux
#      session on that host whose `tmux ls` line matches "claude-".
# Returns 1 (after logging) when no session could be determined.
resolve_target() {
    # 1. Full explicit override.
    if [ -n "${RVOICE_HOST:-}" ] && [ -n "${RVOICE_SESSION:-}" ]; then
        printf '%s\t%s\n' "$RVOICE_HOST" "$RVOICE_SESSION"
        return
    fi

    # 2. Parse the iTerm2 session title; osascript errors are ignored and
    #    simply leave the title empty.
    _rt_title=$(osascript -e 'tell application "iTerm2" to tell current session of current window to return name' 2>/dev/null || true)
    _rt_host=$(printf '%s' "$_rt_title" | awk -F' · ' -v col=1 'NF>=2 {print $col}')
    _rt_sess=$(printf '%s' "$_rt_title" | awk -F' · ' -v col=2 'NF>=2 {print $col}')
    if [ -n "$_rt_host" ] && [ -n "$_rt_sess" ]; then
        printf '%s\t%s\n' "$_rt_host" "$_rt_sess"
        return
    fi

    # 3. Fallback: ask the host for its newest claude-* tmux session unless
    #    the session name was pinned through the environment.
    _rt_host=${RVOICE_HOST:-apricot.lan}
    _rt_sess=${RVOICE_SESSION:-}
    if [ -z "$_rt_sess" ]; then
        _rt_sess=$(ssh -o BatchMode=yes -o ConnectTimeout=3 "$_rt_host" \
                'tmux ls -F "#{session_created} #{session_name}" 2>/dev/null \
                 | sort -n | awk "/claude-/{n=\$2} END{print n}"' 2>/dev/null || true)
    fi
    if [ -z "$_rt_sess" ]; then
        log "no target session resolvable"
        return 1
    fi
    printf '%s\t%s\n' "$_rt_host" "$_rt_sess"
}

# Best-effort macOS notification plus an optional audible cue.
#   $1  message text, shown verbatim
#   $2  optional cue: "ok" | "err" | "go"; anything else plays no sound
# Always returns 0: osascript/afplay failures are deliberately ignored so a
# missing notification can never abort the caller under `set -e`.
notify() {
    # Hand the message to the script's `run` handler as an argument instead
    # of splicing it into the AppleScript source. Transcripts containing
    # double quotes or backslashes would otherwise produce invalid (or
    # attacker-shaped) AppleScript and the notification would be lost.
    osascript \
        -e 'on run argv' \
        -e 'display notification (item 1 of argv) with title "rvoice"' \
        -e 'end run' \
        "$1" 2>/dev/null || true

    # macOS ships its system sounds as .aiff files (there is no Pop.wav),
    # so the cue must reference the .aiff name to be audible.
    case ${2:-} in
        ok)  _notify_snd=Pop ;;
        err) _notify_snd=Funk ;;
        go)  _notify_snd=Tink ;;
        *)   _notify_snd= ;;
    esac
    if [ -n "$_notify_snd" ]; then
        # Played in the background so the caller is not delayed by the sound.
        afplay "/System/Library/Sounds/${_notify_snd}.aiff" 2>/dev/null &
    fi
    :
}

# ---------------------------------------------------------------------------
# Commands
# ---------------------------------------------------------------------------

# Begin a new recording (key-down handler): stop any leftover recorder,
# record the start time, launch ffmpeg in the background and save its pid.
cmd_start() {
    # A recorder that is still alive means a key release was missed —
    # terminate it before starting over.
    if [ -f "$PID_FILE" ]; then
        _cs_old=$(cat "$PID_FILE") || _cs_old=
        if kill -0 "$_cs_old" 2>/dev/null; then
            kill "$_cs_old" 2>/dev/null || true
            rm -f "$PID_FILE"
        fi
    fi
    rm -f "$WAV_FILE"

    # Publish the start timestamp with write-to-temp + rename so a
    # concurrent cmd_stop never reads a half-written START_FILE. (Only
    # START_FILE gets this treatment; PID_FILE is written after ffmpeg has
    # been launched.)
    _cs_now=$(now_ms)
    _cs_tmp=${START_FILE}.tmp
    printf '%s' "$_cs_now" > "$_cs_tmp"
    mv -f "$_cs_tmp" "$START_FILE"

    # Capture 16 kHz mono audio, hard-capped at MAX_S seconds. AUDIO_INPUT
    # is the avfoundation input spec (":default" = macOS system default
    # input, ":N" = a specific device index). ffmpeg's stderr is appended
    # to the log.
    nohup ffmpeg -hide_banner -loglevel error -nostdin \
        -f avfoundation -i "$AUDIO_INPUT" \
        -ac 1 -ar 16000 -t "$MAX_S" \
        -y "$WAV_FILE" >/dev/null 2>>"$LOG_FILE" &
    _cs_pid=$!
    printf '%s\n' "$_cs_pid" > "$PID_FILE"

    notify "listening…" go
    log "start pid=$_cs_pid"
}

# Quote $1 as one single-quoted word for a POSIX shell command line;
# embedded single quotes are rewritten as '\''.
_rvoice_shquote() {
    printf "'%s'" "$(printf '%s' "$1" | sed "s/'/'\\\\''/g")"
}

# Stop the current recording, transcribe it and type the transcript into the
# target tmux session (key-up handler).
#
# Arguments:
#   --print-text  (optional, must be $1) additionally print the transcript
#                 on stdout after it has been injected. Used by the
#                 Hammerspoon module to surface a transcript toast.
# Reads:   PID_FILE, START_FILE, WAV_FILE, LOG_FILE, STT_URL, MODEL,
#          LANG_HINT, MIN_MS, AUTOSEND.
# Returns: 0 on success, when no recording is in progress, or when the hold
#          was shorter than MIN_MS (recording discarded);
#          1 on empty recording, STT failure, empty transcript, unresolved
#          target or failed injection — each with an "err" notification.
cmd_stop() {
    _print_text=0
    if [ "${1:-}" = "--print-text" ]; then _print_text=1; fi
    [ -f "$PID_FILE" ] || { log "stop: no recording in progress"; return 0; }
    # Tolerate the pid file vanishing between the check and the read
    # (concurrent cancel/stop) instead of aborting under `set -e`.
    _pid=$(cat "$PID_FILE" 2>/dev/null || true)
    # START_FILE may be missing/empty if a concurrent start/stop raced us;
    # treat anything that isn't a plain number as "unknown" (duration 0)
    # and let the min-ms / empty-recording guards handle the rest.
    _start=$(cat "$START_FILE" 2>/dev/null || true)
    case $_start in
        ''|*[!0-9]*) _start=$(now_ms) ;;
    esac
    _dur_ms=$(( $(now_ms) - _start ))
    [ "$_dur_ms" -lt 0 ] && _dur_ms=0
    # `q` on stdin is ffmpeg's clean-stop signal but with -nostdin we use
    # SIGINT — ffmpeg flushes the wav header on SIGINT.
    kill -INT "$_pid" 2>/dev/null || true
    # Wait briefly (up to 30 × 0.1 s) for ffmpeg to finalize the file.
    _i=0
    while kill -0 "$_pid" 2>/dev/null && [ "$_i" -lt 30 ]; do
        sleep 0.1
        _i=$((_i+1))
    done
    rm -f "$PID_FILE" "$START_FILE"
    if [ "$_dur_ms" -lt "$MIN_MS" ]; then
        log "stop: too short (${_dur_ms}ms < ${MIN_MS}ms), discarding"
        rm -f "$WAV_FILE"
        return 0
    fi
    [ -s "$WAV_FILE" ] || { log "stop: empty recording"; notify "empty recording" err; return 1; }

    log "transcribing ${_dur_ms}ms via $STT_URL (model=$MODEL lang=$LANG_HINT)"
    # The language field is only sent when LANG_HINT is non-empty; the
    # expansion is intentionally unquoted so it yields two words or none.
    _resp=$(curl -sS --fail-with-body \
        -F "audio=@$WAV_FILE" \
        -F "model=$MODEL" \
        ${LANG_HINT:+-F "language=$LANG_HINT"} \
        -F "task=transcribe" \
        "$STT_URL/stt/transcribe" 2>>"$LOG_FILE") || {
        notify "STT request failed" err
        log "curl failed against $STT_URL/stt/transcribe"
        return 1
    }
    # A non-JSON body makes jq fail; treat that like an empty transcript so
    # the user gets a notification instead of a silent `set -e` abort.
    _txt=$(printf '%s' "$_resp" | jq -r '.text // empty' 2>>"$LOG_FILE") || _txt=
    rm -f "$WAV_FILE"
    if [ -z "$_txt" ]; then
        notify "transcription empty" err
        log "empty transcription"
        return 1
    fi
    log "text: $_txt"

    _target=$(resolve_target) || { notify "no target session" err; return 1; }
    _host=$(printf '%s' "$_target" | cut -f1)
    _sess=$(printf '%s' "$_target" | cut -f2)
    log "inject → $_host/$_sess"

    # `tmux send-keys -l` types the text literally (no key-name lookup);
    # `--` stops option parsing so a transcript starting with "-" is still
    # treated as text. Enter is sent afterwards when AUTOSEND=1.
    _inject_ok=1
    if is_local_host "$_host"; then
        tmux send-keys -t "$_sess" -l -- "$_txt" 2>>"$LOG_FILE" || _inject_ok=0
        if [ "$_inject_ok" = "1" ] && [ "$AUTOSEND" = "1" ]; then
            tmux send-keys -t "$_sess" Enter 2>>"$LOG_FILE" || _inject_ok=0
        fi
    else
        # Both the session name and the text are shell-quoted for the remote
        # command line. Text and optional Enter go out in one ssh round trip.
        _q_sess=$(_rvoice_shquote "$_sess")
        _q_txt=$(_rvoice_shquote "$_txt")
        _remote="tmux send-keys -t $_q_sess -l -- $_q_txt"
        if [ "$AUTOSEND" = "1" ]; then
            _remote="$_remote && tmux send-keys -t $_q_sess Enter"
        fi
        ssh -o BatchMode=yes -o ConnectTimeout=5 "$_host" "$_remote" \
            2>>"$LOG_FILE" || _inject_ok=0
    fi
    if [ "$_inject_ok" != "1" ]; then
        notify "inject failed" err
        log "inject failed for $_host/$_sess"
        return 1
    fi

    notify "✓ $_txt" ok
    # Use `if`, not `[ … ] && …`: as the final statement a false test would
    # make the function return 1 and the script exit 1 on a successful run.
    if [ "$_print_text" = "1" ]; then
        printf '%s\n' "$_txt"
    fi
    return 0
}

# Abort the current recording without transcribing: signal the recorder
# with SIGINT, then delete the pid, start-time and audio files and log
# "cancel". Does nothing (and returns 0) when no pid file exists.
cmd_cancel() {
    if [ ! -f "$PID_FILE" ]; then
        return 0
    fi
    _cc_pid=$(cat "$PID_FILE") || _cc_pid=
    # The recorder may already be gone; that is not an error.
    kill -INT "$_cc_pid" 2>/dev/null || true
    rm -f \
        "$PID_FILE" \
        "$START_FILE" \
        "$WAV_FILE"
    log "cancel"
}

# Debug entry point: print the resolved "<host>\t<tmux-session>" pair and
# pass resolve_target's exit status through.
cmd_target() {
    resolve_target
}

# Succeed (return 0) when $1 names this machine: one of the literal aliases
# local / localhost / 127.0.0.1 / ::1, or the output of `hostname` or
# `hostname -s`. Returns 1 otherwise.
is_local_host() {
    _ilh_name=$1
    # Literal loopback spellings need no hostname lookup.
    for _ilh_alias in local localhost 127.0.0.1 ::1; do
        if [ "$_ilh_name" = "$_ilh_alias" ]; then
            return 0
        fi
    done
    if [ "$_ilh_name" = "$(hostname)" ]; then
        return 0
    fi
    if [ "$_ilh_name" = "$(hostname -s 2>/dev/null)" ]; then
        return 0
    fi
    return 1
}

# Library mode: when RVOICE_LIB_ONLY=1 (set by tests/run-tests.sh before
# sourcing this file) stop here so only the functions get defined. `return`
# works when sourced; `exit 0` covers the case of direct execution.
case ${RVOICE_LIB_ONLY:-0} in
    1) return 0 2>/dev/null || exit 0 ;;
esac

# Command dispatch. Anything unrecognised (including no argument) prints the
# usage text on stderr and exits with status 2.
case ${1:-} in
    start)
        cmd_start
        ;;
    stop)
        # Drop the subcommand so cmd_stop sees only its own flags.
        shift
        cmd_stop "$@"
        ;;
    cancel)
        cmd_cancel
        ;;
    target)
        cmd_target
        ;;
    log)
        tail -50 "$LOG_FILE" 2>/dev/null
        ;;
    *)
        cat >&2 <<EOF
usage: rvoice {start|stop|cancel|target|log}

This script is meant to be driven by a PTT key binding (Hammerspoon).
See ~/Code/@scripts/session-tools/hammerspoon/rvoice.lua for the
companion config.
EOF
        exit 2
        ;;
esac