#!/bin/sh
# rvoice — push-to-talk dictation for remote rclaude sessions.
#
# Designed for the case where claude runs on another host (apricot) and the
# mic + keyboard are on the local Mac. /voice doesn't work over ssh because
# the claude binary tries to open the *remote* host's microphone. This
# helper records locally, transcribes via apricot's LAN speech-synthesis
# service (Whisper, GPU-accelerated, no external API dependency), and
# injects the transcript into the active remote tmux session via
# `tmux send-keys` over ssh.
#
# Usage:
#   rvoice start                Begin recording (called by Hammerspoon on key-down)
#   rvoice stop [--print-text]  Stop, transcribe, inject (called by Hammerspoon
#                               on key-up); --print-text also writes the
#                               transcript to stdout
#   rvoice cancel               Stop without transcribing (key-up after very
#                               short hold)
#   rvoice target               Resolve and echo `<host>\t<session>` for the
#                               active iTerm2 tab (debug)
#   rvoice log                  Show the last 50 lines of the log file
#
# Config: $XDG_CONFIG_HOME/rvoice/config (sourced; same convention as rclaude).
# Transcription uses the LAN-resident speech-synthesis service on apricot
# (Whisper, GPU-accelerated, no external API dependency).
#
# Optional env:
#   RVOICE_STT_URL=http://apricot.lan:8000  (speech-synthesis service base URL)
#   RVOICE_MODEL=base                 (tiny|base|small|medium|large-v2|large-v3)
#   RVOICE_LANG=en                    (language hint; defaults to "en" when
#                                      unset. Set it to the empty string to
#                                      send no hint, i.e. auto-detect)
#   RVOICE_HOST=apricot.lan           (with RVOICE_SESSION: skips iTerm2
#                                      detection; alone: host used by the
#                                      last-resort fallback)
#   RVOICE_SESSION=claude-natalie-... (see RVOICE_HOST)
#   RVOICE_AUTOSEND=1                 (append Enter; default 0)
#   RVOICE_MIN_MS=200                 (ignore taps shorter than this)
#   RVOICE_MAX_S=60                   (hard cap on recording length)
#   RVOICE_AUDIO_INPUT=":default"     (avfoundation input; numeric
#                                      index or "default". List devices
#                                      with: ffmpeg -f avfoundation
#                                      -list_devices true -i "")
#
# State lives in $TMPDIR/rvoice/ — one recording at a time.

set -eu

CONF_DIR=${XDG_CONFIG_HOME:-$HOME/.config}/rvoice
[ -r "$CONF_DIR/config" ] && . "$CONF_DIR/config"

STT_URL=${RVOICE_STT_URL:-http://apricot.lan:8000}
MODEL=${RVOICE_MODEL:-base}
# `-` (not `:-`): unset → "en"; explicitly empty → no language field is sent,
# which is how auto-detect is requested.
LANG_HINT=${RVOICE_LANG-en}
AUTOSEND=${RVOICE_AUTOSEND:-0}
MIN_MS=${RVOICE_MIN_MS:-200}
MAX_S=${RVOICE_MAX_S:-60}
# avfoundation input spec. ":default" → macOS system default input
# (controlled via Sound Settings / Control Center → Sound). Numeric ":N"
# pins to a specific device index from `ffmpeg -f avfoundation -list_devices`.
AUDIO_INPUT=${RVOICE_AUDIO_INPUT:-:default}

STATE_DIR=${TMPDIR:-/tmp}/rvoice
mkdir -p "$STATE_DIR"
PID_FILE=$STATE_DIR/ffmpeg.pid
WAV_FILE=$STATE_DIR/recording.wav
START_FILE=$STATE_DIR/start-ms
LOG_FILE=$STATE_DIR/log

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Append a timestamped line to $LOG_FILE.
log() {
  printf '[rvoice %s] %s\n' "$(date +%H:%M:%S)" "$*" >> "$LOG_FILE"
}

# Print the current Unix time in milliseconds.
now_ms() {
  python3 -c 'import time; print(int(time.time() * 1000))'
}

# Escape $1 for embedding between single quotes in a shell command line:
# every ' becomes '\''. The surrounding quotes are added by the caller.
sq_escape() {
  printf %s "$1" | sed "s/'/'\\\\''/g"
}

# Succeed when $1 names this machine (loopback aliases or `hostname`).
is_local_host() {
  case $1 in
    local|localhost|127.0.0.1|::1) return 0 ;;
  esac
  [ "$1" = "$(hostname)" ] && return 0
  [ "$1" = "$(hostname -s 2>/dev/null)" ] && return 0
  return 1
}

# Resolve the (host, tmux-session) pair and print it as "<host>\t<session>".
# Order of precedence:
#   1. RVOICE_HOST + RVOICE_SESSION, when both are set.
#   2. The active iTerm2 session's title, in the format set by our canonical
#      tmux config (session-tools/tmux.conf): "<host> · <session>".
#   3. ${RVOICE_HOST:-apricot.lan} plus RVOICE_SESSION, or else the most
#      recently created claude-* tmux session on that host.
# Returns 1 (after logging) when no session can be determined.
resolve_target() {
  if [ -n "${RVOICE_HOST:-}" ] && [ -n "${RVOICE_SESSION:-}" ]; then
    printf '%s\t%s\n' "$RVOICE_HOST" "$RVOICE_SESSION"
    return
  fi

  _title=$(osascript -e 'tell application "iTerm2" to tell current session of current window to return name' 2>/dev/null || true)
  # awk prints nothing unless the title splits into at least two fields.
  _host=$(printf '%s' "$_title" | awk -F' · ' 'NF>=2 {print $1}')
  _sess=$(printf '%s' "$_title" | awk -F' · ' 'NF>=2 {print $2}')
  if [ -n "$_host" ] && [ -n "$_sess" ]; then
    printf '%s\t%s\n' "$_host" "$_sess"
    return
  fi

  # Fallback: pick the most recently created claude-* session on the host.
  _host=${RVOICE_HOST:-apricot.lan}
  _sess=${RVOICE_SESSION:-}
  if [ -z "$_sess" ]; then
    # Sessions are sorted by creation time; awk keeps the last claude-* name.
    _sess=$(ssh -o BatchMode=yes -o ConnectTimeout=3 "$_host" \
      'tmux ls -F "#{session_created} #{session_name}" 2>/dev/null | sort -n | awk "/claude-/{n=\$2} END{print n}"' \
      2>/dev/null || true)
  fi
  if [ -z "$_sess" ]; then
    log "no target session resolvable"
    return 1
  fi
  printf '%s\t%s\n' "$_host" "$_sess"
}

# Best-effort macOS notification + audible cue. Never fails.
#   $1 - notification text
#   $2 - optional cue: ok | err | go
notify() {
  # The message is handed to AppleScript as a run-handler argument instead of
  # being spliced into the script source, so quotes or backslashes in a
  # transcript cannot break (or alter) the AppleScript.
  osascript \
    -e 'on run argv' \
    -e 'display notification (item 1 of argv) with title "rvoice"' \
    -e 'end run' \
    "$1" >/dev/null 2>&1 || true

  case ${2:-} in
    ok)  _snd=Pop ;;
    err) _snd=Funk ;;
    go)  _snd=Tink ;;
    *)   _snd= ;;
  esac
  if [ -n "$_snd" ]; then
    # System sounds ship as .aiff. stdout is detached so the background
    # player cannot hold open a pipe that a --print-text consumer reads.
    afplay "/System/Library/Sounds/$_snd.aiff" >/dev/null 2>&1 &
  fi
  return 0
}

# ---------------------------------------------------------------------------
# Commands
# ---------------------------------------------------------------------------

# Start a background ffmpeg recording into $WAV_FILE and record its pid and
# start time in the state directory.
cmd_start() {
  # If an old ffmpeg is still alive (key release missed), kill it first.
  if [ -f "$PID_FILE" ] && kill -0 "$(cat "$PID_FILE")" 2>/dev/null; then
    kill "$(cat "$PID_FILE")" 2>/dev/null || true
    rm -f "$PID_FILE"
  fi
  rm -f "$WAV_FILE"

  # Write the start timestamp via tmp file + mv so a concurrent cmd_stop
  # can't observe a half-written START_FILE. It is written before the pid
  # file, whose presence is what cmd_stop keys on.
  _start_ts=$(now_ms)
  printf '%s' "$_start_ts" > "${START_FILE}.tmp"
  mv -f "${START_FILE}.tmp" "$START_FILE"

  # 16kHz mono PCM, capped at MAX_S. AUDIO_INPUT defaults to ":default"
  # which honors the macOS system default input (configurable via Sound
  # Settings / Control Center). Numeric ":N" pins a specific device.
  nohup ffmpeg -hide_banner -loglevel error -nostdin \
    -f avfoundation -i "$AUDIO_INPUT" \
    -ac 1 -ar 16000 -t "$MAX_S" \
    -y "$WAV_FILE" >/dev/null 2>>"$LOG_FILE" &
  _rec_pid=$!
  printf '%s\n' "$_rec_pid" > "$PID_FILE"

  notify "listening…" go
  log "start pid=$_rec_pid"
}

# Stop the recording, transcribe it, and type the text into the target tmux
# session.
#   $1 - optional "--print-text": also write the transcript to stdout
#        (used by the Hammerspoon module to surface a transcript toast).
# Returns 0 on success, when nothing was recording, or when the hold was
# shorter than MIN_MS; 1 on any failure (each is logged and notified).
cmd_stop() {
  _print_text=0
  if [ "${1:-}" = "--print-text" ]; then _print_text=1; fi

  if [ ! -f "$PID_FILE" ]; then
    log "stop: no recording in progress"
    return 0
  fi
  _pid=$(cat "$PID_FILE")

  # START_FILE may be missing/empty if a concurrent start/stop raced us;
  # anything that isn't purely digits is replaced by "now", which yields a
  # ~0ms duration and lets the min-ms guard discard the recording.
  _start=$(cat "$START_FILE" 2>/dev/null || true)
  case $_start in
    ''|*[!0-9]*) _start=$(now_ms) ;;
  esac
  _dur_ms=$(( $(now_ms) - _start ))
  if [ "$_dur_ms" -lt 0 ]; then _dur_ms=0; fi

  # `q` on stdin is ffmpeg's clean-stop signal but with -nostdin we use
  # SIGINT — ffmpeg flushes the wav header on SIGINT.
  kill -INT "$_pid" 2>/dev/null || true
  # Wait briefly (up to 30 × 0.1s) for ffmpeg to finalize the file.
  _i=0
  while kill -0 "$_pid" 2>/dev/null && [ "$_i" -lt 30 ]; do
    sleep 0.1
    _i=$((_i + 1))
  done
  rm -f "$PID_FILE" "$START_FILE"

  if [ "$_dur_ms" -lt "$MIN_MS" ]; then
    log "stop: too short (${_dur_ms}ms < ${MIN_MS}ms), discarding"
    rm -f "$WAV_FILE"
    return 0
  fi
  if [ ! -s "$WAV_FILE" ]; then
    log "stop: empty recording"
    notify "empty recording" err
    return 1
  fi

  log "transcribing ${_dur_ms}ms via $STT_URL (model=$MODEL lang=$LANG_HINT)"
  # The language field is only added when LANG_HINT is non-empty.
  _resp=$(curl -sS --fail-with-body \
    -F "audio=@$WAV_FILE" \
    -F "model=$MODEL" \
    ${LANG_HINT:+-F "language=$LANG_HINT"} \
    -F "task=transcribe" \
    "$STT_URL/stt/transcribe" 2>>"$LOG_FILE") || {
    notify "STT request failed" err
    log "curl failed against $STT_URL/stt/transcribe"
    return 1
  }

  # A non-JSON body must not abort the script under `set -e` before the
  # recording is cleaned up; treat it like an empty transcription.
  _txt=$(printf '%s' "$_resp" | jq -r '.text // empty' 2>>"$LOG_FILE") || _txt=
  rm -f "$WAV_FILE"
  if [ -z "$_txt" ]; then
    notify "transcription empty" err
    log "empty transcription"
    return 1
  fi
  log "text: $_txt"

  _target=$(resolve_target) || { notify "no target session" err; return 1; }
  _host=$(printf '%s' "$_target" | cut -f1)
  _sess=$(printf '%s' "$_target" | cut -f2)
  log "inject → $_host/$_sess"

  # Use `tmux send-keys -l` to send the text literally (no key-name lookup),
  # with `--` so text beginning with "-" is not parsed as an option.
  # Then optional Enter if autosend.
  _rc=0
  if is_local_host "$_host"; then
    tmux send-keys -t "$_sess" -l -- "$_txt" 2>>"$LOG_FILE" || _rc=$?
    if [ "$_rc" -eq 0 ] && [ "$AUTOSEND" = "1" ]; then
      tmux send-keys -t "$_sess" Enter 2>>"$LOG_FILE" || _rc=$?
    fi
  else
    # Both values end up inside single quotes in the remote command line, so
    # both are escaped — the session name may come from a window title.
    _q_sess=$(sq_escape "$_sess")
    _q_txt=$(sq_escape "$_txt")
    ssh -o BatchMode=yes -o ConnectTimeout=5 "$_host" \
      "tmux send-keys -t '$_q_sess' -l -- '$_q_txt'" 2>>"$LOG_FILE" || _rc=$?
    if [ "$_rc" -eq 0 ] && [ "$AUTOSEND" = "1" ]; then
      ssh -o BatchMode=yes -o ConnectTimeout=5 "$_host" \
        "tmux send-keys -t '$_q_sess' Enter" 2>>"$LOG_FILE" || _rc=$?
    fi
  fi
  if [ "$_rc" -ne 0 ]; then
    notify "inject failed" err
    log "inject failed (rc=$_rc) → $_host/$_sess"
    return 1
  fi

  notify "✓ $_txt" ok
  # An `if`, not `[ … ] && printf`: the latter would make a successful stop
  # without --print-text return 1 and exit the script non-zero.
  if [ "$_print_text" = "1" ]; then
    printf '%s\n' "$_txt"
  fi
  return 0
}

# Abort the current recording, if any, and discard all of its state.
cmd_cancel() {
  [ -f "$PID_FILE" ] || return 0
  kill -INT "$(cat "$PID_FILE")" 2>/dev/null || true
  rm -f "$PID_FILE" "$START_FILE" "$WAV_FILE"
  log "cancel"
}

# Debug entry point: print the resolved "<host>\t<session>" pair.
cmd_target() {
  resolve_target
}

# Guard: when sourced as a library (by tests/run-tests.sh), skip dispatch.
if [ "${RVOICE_LIB_ONLY:-0}" = "1" ]; then
  return 0 2>/dev/null || exit 0
fi

case ${1:-} in
  start)  cmd_start ;;
  stop)   shift; cmd_stop "$@" ;;
  cancel) cmd_cancel ;;
  target) cmd_target ;;
  log)    tail -n 50 "$LOG_FILE" 2>/dev/null ;;
  *)
    cat <<'EOF' >&2
usage: rvoice {start|stop|cancel|target|log}

This script is meant to be driven by a PTT key binding (Hammerspoon).
See ~/Code/@scripts/session-tools/hammerspoon/rvoice.lua for the
companion config.
EOF
    exit 2
    ;;
esac