2026-05-17 17:23:54 -07:00
|
|
|
#!/bin/sh
|
|
|
|
|
# rvoice — push-to-talk dictation for remote rclaude sessions.
|
|
|
|
|
#
|
|
|
|
|
# Designed for the case where claude runs on another host (apricot) and the
|
|
|
|
|
# mic + keyboard are on the local Mac. /voice doesn't work over ssh because
|
|
|
|
|
# the claude binary tries to open the *remote* host's microphone. This
|
2026-05-17 18:12:14 -07:00
|
|
|
# helper records locally, transcribes via apricot's LAN speech-synthesis
|
|
|
|
|
# service (Whisper, GPU-accelerated, no external API dependency), and
|
|
|
|
|
# injects the transcript into the active remote tmux session via
|
2026-05-17 17:23:54 -07:00
|
|
|
# `tmux send-keys` over ssh.
|
|
|
|
|
#
|
|
|
|
|
# Usage:
#   rvoice start                Begin recording (called by Hammerspoon on key-down)
#   rvoice stop [--print-text]  Stop, transcribe, inject (called by Hammerspoon on
#                               key-up); --print-text also prints the transcript
#                               to stdout
#   rvoice cancel               Stop without transcribing (key-up after very short hold)
#   rvoice target               Resolve and echo `<host>\t<tmux-session>` for the
#                               active iTerm2 tab (debug)
#   rvoice log                  Show the last 50 lines of the rvoice log
|
|
|
|
|
#
|
|
|
|
|
# Config: $XDG_CONFIG_HOME/rvoice/config (sourced; same convention as rclaude).
# Transcription uses the LAN-resident speech-synthesis service on apricot
# (Whisper, GPU-accelerated, no external API dependency).
#
|
|
|
|
|
# Optional env:
#   RVOICE_STT_URL=http://apricot.lan:8000   (speech-synthesis service base URL)
#   RVOICE_MODEL=base                        (tiny|base|small|medium|large-v2|large-v3)
#   RVOICE_LANG=en                           (force language; "en" when unset. Set
#                                             it to the empty string to send no
#                                             language and let the service
#                                             auto-detect)
#   RVOICE_HOST=apricot.lan                  (set together with RVOICE_SESSION to
#   RVOICE_SESSION=claude-natalie-...         override iTerm2 detection; either one
#                                             alone is only used as a fallback)
#   RVOICE_AUTOSEND=1                        (append Enter; default 0)
#   RVOICE_MIN_MS=200                        (ignore taps shorter than this)
#   RVOICE_MAX_S=60                          (hard cap on recording length)
#   RVOICE_AUDIO_INPUT=":default"            (avfoundation input; ":default" or a
#                                             numeric ":N" index. List devices
#                                             with: ffmpeg -f avfoundation
#                                             -list_devices true -i "")
#
# State lives in $TMPDIR/rvoice/ — one recording at a time.

set -eu

# The config file is sourced before the defaults below are applied, so it can
# set any of the RVOICE_* variables.
CONF_DIR=${XDG_CONFIG_HOME:-$HOME/.config}/rvoice
if [ -r "$CONF_DIR/config" ]; then
  . "$CONF_DIR/config"
fi

STT_URL=${RVOICE_STT_URL:-http://apricot.lan:8000}
MODEL=${RVOICE_MODEL:-base}
# `-` (not `:-`) on purpose: an explicitly empty RVOICE_LANG must stay empty so
# cmd_stop omits the language field; only an unset variable defaults to "en".
LANG_HINT=${RVOICE_LANG-en}
AUTOSEND=${RVOICE_AUTOSEND:-0}
MIN_MS=${RVOICE_MIN_MS:-200}
MAX_S=${RVOICE_MAX_S:-60}
# avfoundation input spec. ":default" → macOS system default input
# (controlled via Sound Settings / Control Center → Sound). Numeric ":N"
# pins to a specific device index from `ffmpeg -f avfoundation -list_devices`.
AUDIO_INPUT=${RVOICE_AUDIO_INPUT:-:default}

# Per-user scratch area holding the single in-flight recording and the log.
STATE_DIR=${TMPDIR:-/tmp}/rvoice
mkdir -p "$STATE_DIR"
PID_FILE=$STATE_DIR/ffmpeg.pid
WAV_FILE=$STATE_DIR/recording.wav
START_FILE=$STATE_DIR/start-ms
LOG_FILE=$STATE_DIR/log
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
# Helpers
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
# Append one line of the form "[rvoice HH:MM:SS] <message>" to $LOG_FILE.
# All arguments are joined with spaces to form the message.
log() {
  _log_stamp=$(date +%H:%M:%S) || _log_stamp=
  printf '[rvoice %s] %s\n' "$_log_stamp" "$*" >> "$LOG_FILE"
}
|
|
|
|
|
|
|
|
|
|
# Print the current Unix time in whole milliseconds (python3 does the work).
now_ms() {
  python3 -c 'from time import time; print(int(time() * 1000))'
}
|
|
|
|
|
|
|
|
|
|
# Print "<host>\t<tmux-session>" for the place a transcript should be typed.
#
# Resolution order, first match wins:
#   1. RVOICE_HOST and RVOICE_SESSION, when both are set and non-empty.
#   2. The name of the current iTerm2 session, when it has the
#      "<host> · <session>" shape (title format from session-tools/tmux.conf).
#   3. ${RVOICE_HOST:-apricot.lan} together with $RVOICE_SESSION or, when that
#      is empty, the most recently created tmux session on that host whose
#      listing line contains "claude-" (looked up over ssh).
# Returns 1, after writing a log line, when no session can be determined.
resolve_target() {
  _rt_env_host=${RVOICE_HOST:-}
  _rt_env_sess=${RVOICE_SESSION:-}

  # 1. Explicit override.
  if [ -n "$_rt_env_host" ] && [ -n "$_rt_env_sess" ]; then
    printf '%s\t%s\n' "$_rt_env_host" "$_rt_env_sess"
    return
  fi

  # 2. Parse the iTerm2 title; osascript failures just yield an empty title.
  _rt_title=$(osascript -e 'tell application "iTerm2" to tell current session of current window to return name' 2>/dev/null || true)
  _rt_host=
  _rt_sess=
  case $_rt_title in
    *' · '*)
      # First field is the host, second field is the session; anything after
      # a further separator is ignored.
      _rt_host=${_rt_title%% · *}
      _rt_sess=${_rt_title#* · }
      _rt_sess=${_rt_sess%% · *}
      ;;
  esac
  if [ -n "$_rt_host" ] && [ -n "$_rt_sess" ]; then
    printf '%s\t%s\n' "$_rt_host" "$_rt_sess"
    return
  fi

  # 3. Fallback: pick the most recently created claude-* session on the host.
  _rt_host=${_rt_env_host:-apricot.lan}
  _rt_sess=$_rt_env_sess
  if [ -z "$_rt_sess" ]; then
    _rt_sess=$(ssh -o BatchMode=yes -o ConnectTimeout=3 "$_rt_host" \
      'tmux ls -F "#{session_created} #{session_name}" 2>/dev/null \
        | sort -n | awk "/claude-/{n=\$2} END{print n}"' 2>/dev/null || true)
  fi
  if [ -z "$_rt_sess" ]; then
    log "no target session resolvable"
    return 1
  fi
  printf '%s\t%s\n' "$_rt_host" "$_rt_sess"
}
|
|
|
|
|
|
|
|
|
|
# Best-effort macOS notification plus an optional audible cue.
#
# Arguments:
#   $1 - message text shown in the notification (title is always "rvoice")
#   $2 - optional cue: "ok" (Pop), "err" (Funk) or "go" (Tink); anything else
#        plays no sound
# Always returns 0; failures of osascript/afplay are deliberately ignored.
notify() {
  # Hand the message to AppleScript as an argument instead of splicing it into
  # the script source: transcripts may contain `"` or `\`, which would break
  # (or inject into) the generated script. Callers never pass a message that
  # starts with "-", so no option terminator is needed.
  osascript \
    -e 'on run argv' \
    -e 'display notification (item 1 of argv) with title "rvoice"' \
    -e 'end run' \
    "$1" 2>/dev/null || true

  case ${2:-} in
    ok)  _notify_snd=Pop ;;
    err) _notify_snd=Funk ;;
    go)  _notify_snd=Tink ;;
    *)   return 0 ;;
  esac

  # macOS ships its system sounds as .aiff; the .wav name is kept as a
  # fallback. The sound plays in the background so the caller is not delayed.
  for _notify_file in \
    "/System/Library/Sounds/$_notify_snd.aiff" \
    "/System/Library/Sounds/$_notify_snd.wav"; do
    if [ -f "$_notify_file" ]; then
      afplay "$_notify_file" 2>/dev/null &
      break
    fi
  done
  return 0
}
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
# Commands
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
# Begin a recording: start ffmpeg in the background writing $WAV_FILE and
# record its pid in $PID_FILE and the start time (ms) in $START_FILE.
cmd_start() {
  # A previous ffmpeg may still be alive if the key release was missed;
  # terminate it before starting over.
  if [ -f "$PID_FILE" ]; then
    _cs_old_pid=$(cat "$PID_FILE") || true
    if kill -0 "$_cs_old_pid" 2>/dev/null; then
      kill "$_cs_old_pid" 2>/dev/null || true
      rm -f "$PID_FILE"
    fi
  fi
  rm -f "$WAV_FILE"

  # The start timestamp goes to a temp file first and is renamed into place,
  # so a concurrent cmd_stop can't observe a half-written START_FILE.
  _cs_started=$(now_ms)
  printf '%s' "$_cs_started" > "${START_FILE}.tmp"
  mv -f "${START_FILE}.tmp" "$START_FILE"

  # 16kHz mono PCM, capped at MAX_S. AUDIO_INPUT defaults to ":default",
  # which honors the macOS system default input (configurable via Sound
  # Settings / Control Center). Numeric ":N" pins a specific device.
  nohup ffmpeg -hide_banner -loglevel error -nostdin \
    -f avfoundation -i "$AUDIO_INPUT" \
    -ac 1 -ar 16000 -t "$MAX_S" \
    -y "$WAV_FILE" >/dev/null 2>>"$LOG_FILE" &
  _cs_new_pid=$!
  printf '%s\n' "$_cs_new_pid" > "$PID_FILE"

  notify "listening…" go
  log "start pid=$_cs_new_pid"
}
|
|
|
|
|
|
|
|
|
|
# Quote $1 as a single POSIX-shell word: wrap it in single quotes and rewrite
# every embedded ' as '\''. Used for the command line handed to the remote
# shell by ssh.
_rvoice_shquote() {
  printf "'%s'" "$(printf '%s' "$1" | sed "s/'/'\\\\''/g")"
}

# Stop the current recording, transcribe it and type the text into the target
# tmux session.
#
# Arguments:
#   $1 - optional "--print-text": after a successful injection, also print
#        the transcript to stdout (used by the Hammerspoon module to surface
#        a transcript toast).
# Returns:
#   0 when there was nothing to do (no recording, or shorter than MIN_MS) or
#     the text was injected;
#   1 on an empty recording, a failed STT request, an empty transcript, an
#     unresolvable target or a failed injection.
cmd_stop() {
  _print_text=0
  if [ "${1:-}" = "--print-text" ]; then _print_text=1; fi

  if [ ! -f "$PID_FILE" ]; then
    log "stop: no recording in progress"
    return 0
  fi
  _pid=$(cat "$PID_FILE" 2>/dev/null || true)

  # START_FILE may be missing/empty if a concurrent start/stop raced us;
  # treat anything that isn't a plain number as "unknown" and let the
  # empty-recording / min-ms guards handle the rest.
  _start=$(cat "$START_FILE" 2>/dev/null || true)
  case $_start in
    ''|*[!0-9]*) _start=$(now_ms) ;;
  esac
  _dur_ms=$(( $(now_ms) - _start ))
  if [ "$_dur_ms" -lt 0 ]; then _dur_ms=0; fi

  # `q` on stdin is ffmpeg's clean-stop signal but with -nostdin we use
  # SIGINT — ffmpeg flushes the wav header on SIGINT.
  kill -INT "$_pid" 2>/dev/null || true
  # Wait briefly (up to 30 x 0.1s) for ffmpeg to finalize the file.
  _i=0
  while kill -0 "$_pid" 2>/dev/null && [ "$_i" -lt 30 ]; do
    sleep 0.1
    _i=$((_i + 1))
  done
  rm -f "$PID_FILE" "$START_FILE"

  if [ "$_dur_ms" -lt "$MIN_MS" ]; then
    log "stop: too short (${_dur_ms}ms < ${MIN_MS}ms), discarding"
    rm -f "$WAV_FILE"
    return 0
  fi
  if [ ! -s "$WAV_FILE" ]; then
    log "stop: empty recording"
    notify "empty recording" err
    return 1
  fi

  log "transcribing ${_dur_ms}ms via $STT_URL (model=$MODEL lang=$LANG_HINT)"
  # The ${LANG_HINT:+...} word is intentionally unquoted: it expands to the two
  # words `-F language=...` when a language is set and to nothing otherwise.
  _resp=$(curl -sS --fail-with-body \
    -F "audio=@$WAV_FILE" \
    -F "model=$MODEL" \
    ${LANG_HINT:+-F "language=$LANG_HINT"} \
    -F "task=transcribe" \
    "$STT_URL/stt/transcribe" 2>>"$LOG_FILE") || {
    notify "STT request failed" err
    log "curl failed against $STT_URL/stt/transcribe"
    # --fail-with-body still captures the error body; keep it for debugging.
    if [ -n "$_resp" ]; then log "response body: $_resp"; fi
    return 1
  }

  # A non-JSON response makes jq fail; treat that as an empty transcript so
  # the user gets a notification instead of a silent `set -e` exit.
  _txt=$(printf '%s' "$_resp" | jq -r '.text // empty' 2>>"$LOG_FILE") || _txt=
  rm -f "$WAV_FILE"
  if [ -z "$_txt" ]; then
    notify "transcription empty" err
    log "empty transcription"
    return 1
  fi
  log "text: $_txt"

  _target=$(resolve_target) || { notify "no target session" err; return 1; }
  _host=$(printf '%s' "$_target" | cut -f1)
  _sess=$(printf '%s' "$_target" | cut -f2)
  log "inject → $_host/$_sess"

  # `tmux send-keys -l` sends the text literally (no key-name interpretation);
  # `--` stops option parsing so text starting with "-" is still treated as
  # keys. Enter is sent afterwards only when autosend is on.
  _inject_ok=1
  if is_local_host "$_host"; then
    tmux send-keys -t "$_sess" -l -- "$_txt" 2>>"$LOG_FILE" || _inject_ok=0
    if [ "$_inject_ok" = "1" ] && [ "$AUTOSEND" = "1" ]; then
      tmux send-keys -t "$_sess" Enter 2>>"$LOG_FILE" || _inject_ok=0
    fi
  else
    # Both the session name and the text are shell-quoted for the remote
    # side; Enter is chained into the same ssh call to avoid a second
    # connection.
    _q_sess=$(_rvoice_shquote "$_sess")
    _remote_cmd="tmux send-keys -t $_q_sess -l -- $(_rvoice_shquote "$_txt")"
    if [ "$AUTOSEND" = "1" ]; then
      _remote_cmd="$_remote_cmd && tmux send-keys -t $_q_sess Enter"
    fi
    ssh -o BatchMode=yes -o ConnectTimeout=5 "$_host" "$_remote_cmd" \
      2>>"$LOG_FILE" || _inject_ok=0
  fi
  if [ "$_inject_ok" != "1" ]; then
    notify "inject failed" err
    log "inject failed for $_host/$_sess"
    return 1
  fi

  notify "✓ $_txt" ok
  # Written as `if` so the function returns 0 when --print-text is absent
  # (a trailing `[ ... ] && printf` would make a successful stop return 1).
  if [ "$_print_text" = "1" ]; then
    printf '%s\n' "$_txt"
  fi
  return 0
}
|
|
|
|
|
|
|
|
|
|
# Abort the current recording without transcribing: signal ffmpeg with SIGINT
# and remove the pid, start-time and audio files. A no-op (status 0) when no
# pid file exists.
cmd_cancel() {
  if [ ! -f "$PID_FILE" ]; then
    return 0
  fi
  _cc_pid=$(cat "$PID_FILE") || true
  kill -INT "$_cc_pid" 2>/dev/null || true
  rm -f "$PID_FILE" "$START_FILE" "$WAV_FILE"
  log "cancel"
}
|
|
|
|
|
|
|
|
|
|
# `rvoice target`: print the resolved "<host>\t<tmux-session>" pair (debug aid).
cmd_target() {
  resolve_target
}
|
|
|
|
|
|
|
|
|
|
# Succeed (status 0) when $1 names this machine.
#
# Arguments:
#   $1 - host name as resolved for the target session
# Matches the loopback aliases local/localhost/127.0.0.1/::1 and the output of
# `hostname` and `hostname -s`. Comparison ignores letter case, since host
# names are case-insensitive. An empty argument is never local.
is_local_host() {
  # Without this guard an empty host would compare equal to an empty
  # `hostname -s` result below.
  [ -n "${1:-}" ] || return 1

  _ilh_host=$(printf '%s' "$1" | tr '[:upper:]' '[:lower:]')
  case $_ilh_host in
    local|localhost|127.0.0.1|::1) return 0 ;;
  esac

  for _ilh_name in \
    "$(hostname 2>/dev/null || true)" \
    "$(hostname -s 2>/dev/null || true)"; do
    _ilh_name=$(printf '%s' "$_ilh_name" | tr '[:upper:]' '[:lower:]')
    if [ -n "$_ilh_name" ] && [ "$_ilh_host" = "$_ilh_name" ]; then
      return 0
    fi
  done
  return 1
}
|
|
|
|
|
|
2026-05-17 17:54:08 -07:00
|
|
|
# Library mode: tests/run-tests.sh sources this file with RVOICE_LIB_ONLY=1 to
# get the functions without running a command. `return` works when sourced;
# `exit` covers the case where the file is executed directly.
if [ "${RVOICE_LIB_ONLY:-0}" = "1" ]; then
  return 0 2>/dev/null || exit 0
fi

# Command dispatch: the first argument selects the subcommand; anything else
# (including no argument) prints usage to stderr and exits with status 2.
_rvoice_cmd=${1:-}
case $_rvoice_cmd in
  start)
    cmd_start
    ;;
  stop)
    # Remaining arguments (e.g. --print-text) are passed through.
    shift
    cmd_stop "$@"
    ;;
  cancel)
    cmd_cancel
    ;;
  target)
    cmd_target
    ;;
  log)
    tail -n 50 "$LOG_FILE" 2>/dev/null
    ;;
  *)
    cat >&2 <<EOF
usage: rvoice {start|stop|cancel|target|log}

This script is meant to be driven by a PTT key binding (Hammerspoon).
See ~/Code/@scripts/session-tools/hammerspoon/rvoice.lua for the
companion config.
EOF
    exit 2
    ;;
esac
|