#!/bin/sh
# Source path: session-tools/bin/rvoice
# rvoice — push-to-talk dictation for remote rclaude sessions.
#
# Designed for the case where claude runs on another host (apricot) and the
# mic + keyboard are on the local Mac. /voice doesn't work over ssh because
# the claude binary tries to open the *remote* host's microphone. This
# helper records locally, transcribes via Groq Whisper (no local RAM hit),
# and injects the transcript into the active remote tmux session via
# `tmux send-keys` over ssh.
#
# Usage:
#   rvoice start    Begin recording (called by Hammerspoon on key-down)
#   rvoice stop     Stop, transcribe, inject (called by Hammerspoon on key-up)
#   rvoice cancel   Stop without transcribing (key-up after very short hold)
#   rvoice target   Resolve and echo `<host>\t<tmux-session>` for the
#                   active iTerm2 tab (debug)
#   rvoice log      Print the last 50 lines of the rvoice log (debug)
#
# Config: $XDG_CONFIG_HOME/rvoice/config (sourced; same convention as rclaude).
# Required env: GROQ_API_KEY. Optional:
# RVOICE_MODEL=whisper-large-v3-turbo (default; very fast)
# RVOICE_HOST=apricot.lan (overrides iTerm2 detection)
# RVOICE_SESSION=claude-natalie-... (overrides iTerm2 detection)
# RVOICE_AUTOSEND=1 (append Enter; default 0)
# RVOICE_MIN_MS=200 (ignore taps shorter than this)
# RVOICE_MAX_S=60 (hard cap on recording length)
#
# State lives in ${TMPDIR:-/tmp}/rvoice/ — one recording at a time.
# Abort on any unhandled command failure and on use of an unset variable.
set -eu

# Optional user configuration; sourced so it can set any RVOICE_* variable
# (and GROQ_API_KEY) before the defaults below are applied.
CONF_DIR="${XDG_CONFIG_HOME:-$HOME/.config}/rvoice"
if [ -r "$CONF_DIR/config" ]; then
  . "$CONF_DIR/config"
fi

# Tunables, each overridable through the matching RVOICE_* variable.
MODEL="${RVOICE_MODEL:-whisper-large-v3-turbo}"
AUTOSEND="${RVOICE_AUTOSEND:-0}"
MIN_MS="${RVOICE_MIN_MS:-200}"
MAX_S="${RVOICE_MAX_S:-60}"

# Per-recording state: the recorder's pid, the captured audio, the start
# timestamp (ms) and an append-only log all live in one directory.
STATE_DIR="${TMPDIR:-/tmp}/rvoice"
mkdir -p "$STATE_DIR"
PID_FILE="$STATE_DIR/ffmpeg.pid"
WAV_FILE="$STATE_DIR/recording.wav"
START_FILE="$STATE_DIR/start-ms"
LOG_FILE="$STATE_DIR/log"
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
# Append one timestamped line ("[rvoice HH:MM:SS] <args joined by spaces>")
# to $LOG_FILE.
log() {
  printf '[rvoice %s] %s\n' "$(date +%H:%M:%S)" "$*" \
    >> "$LOG_FILE"
}
# Print the current Unix time in whole milliseconds. Delegates to python3
# to obtain sub-second resolution.
now_ms() {
  python3 -c 'import time; print(int(time.time() * 1000))'
}
# Resolve the (host, tmux-session) pair that dictated text should go to and
# print it as "<host>\t<session>". Sources are tried in this order:
#   1. RVOICE_HOST and RVOICE_SESSION, when both are set.
#   2. The active iTerm2 session name, expected to look like
#      "<host> · <session>" (title format from session-tools/tmux.conf).
#   3. RVOICE_HOST (default apricot.lan) plus RVOICE_SESSION or, failing
#      that, the most recently created claude-* tmux session on that host.
# Returns 1 (after logging) when no session can be determined.
resolve_target() {
  # 1. Explicit overrides short-circuit all detection.
  [ -z "${RVOICE_HOST:-}" ] || [ -z "${RVOICE_SESSION:-}" ] || {
    printf '%s\t%s\n' "$RVOICE_HOST" "$RVOICE_SESSION"
    return
  }

  # 2. Parse the iTerm2 title; a failing osascript simply yields an empty
  #    title, which falls through to step 3.
  _title=$(
    osascript -e 'tell application "iTerm2" to tell current session of current window to return name' 2>/dev/null \
      || true
  )
  _host=$(printf '%s' "$_title" | awk -F' · ' 'NF>=2 {print $1}')
  _sess=$(printf '%s' "$_title" | awk -F' · ' 'NF>=2 {print $2}')
  if [ -n "$_host" ] && [ -n "$_sess" ]; then
    printf '%s\t%s\n' "$_host" "$_sess"
    return
  fi

  # 3. Fallback: ask the host for its sessions, oldest first, and keep the
  #    last name containing "claude-" (i.e. the newest such session).
  _host=${RVOICE_HOST:-apricot.lan}
  _sess=${RVOICE_SESSION:-}
  [ -n "$_sess" ] || _sess=$(ssh -o BatchMode=yes -o ConnectTimeout=3 "$_host" \
    'tmux ls -F "#{session_created} #{session_name}" 2>/dev/null \
| sort -n | awk "/claude-/{n=\$2} END{print n}"' 2>/dev/null || true)

  if [ -z "$_sess" ]; then
    log "no target session resolvable"
    return 1
  fi
  printf '%s\t%s\n' "$_host" "$_sess"
}
# Best-effort macOS notification plus an optional audible cue. Never fails:
# every external call is allowed to error out silently.
#   $1  message text (arbitrary; may contain quotes and backslashes)
#   $2  optional cue name: ok | err | go (anything else plays no sound)
notify() {
  # The message is embedded in an AppleScript string literal, so escape
  # backslashes first and then double quotes. Without this a transcript
  # containing `"` terminates the literal and breaks (or injects into) the
  # script handed to osascript.
  _note=$(printf '%s' "$1" | sed 's/\\/\\\\/g; s/"/\\"/g')
  osascript -e "display notification \"$_note\" with title \"rvoice\"" 2>/dev/null || true

  case ${2:-} in
    ok) _cue=Pop ;;
    err) _cue=Funk ;;
    go) _cue=Tink ;;
    *) return 0 ;;
  esac

  # macOS ships its system sounds as .aiff; the .wav name is kept as a
  # fallback. Play the first variant that exists, in the background so the
  # caller is not blocked for the duration of the sound.
  for _ext in aiff wav; do
    if [ -r "/System/Library/Sounds/$_cue.$_ext" ]; then
      afplay "/System/Library/Sounds/$_cue.$_ext" 2>/dev/null &
      break
    fi
  done
  return 0
}
# ---------------------------------------------------------------------------
# Commands
# ---------------------------------------------------------------------------
# Begin a new recording: clear any leftover recorder and audio file, stamp
# the start time, launch ffmpeg in the background and remember its pid.
cmd_start() {
  # A previous recorder can still be running if the key-up event was
  # missed; terminate it so only one recording exists at a time.
  if [ -f "$PID_FILE" ]; then
    if kill -0 "$(cat "$PID_FILE")" 2>/dev/null; then
      kill "$(cat "$PID_FILE")" 2>/dev/null || true
      rm -f "$PID_FILE"
    fi
  fi

  rm -f "$WAV_FILE"
  now_ms > "$START_FILE"

  # Capture 16 kHz mono audio, hard-capped at MAX_S seconds. ":0" is the
  # AVFoundation audio device at index 0 — presumably the default input;
  # list devices with AVFoundation if several mics are attached. ffmpeg's
  # own diagnostics are appended to the log.
  nohup ffmpeg -hide_banner -loglevel error -nostdin \
    -f avfoundation -i ":0" \
    -ac 1 -ar 16000 -t "$MAX_S" \
    -y "$WAV_FILE" > /dev/null 2>> "$LOG_FILE" &
  printf '%s\n' "$!" > "$PID_FILE"

  notify "listening…" go
  log "start pid=$(cat "$PID_FILE")"
}
# Stop the active recording, transcribe it via the Groq transcription API
# and type the transcript into the resolved tmux session.
#
# Reads:   PID_FILE, START_FILE, WAV_FILE, LOG_FILE, MIN_MS, MODEL,
#          AUTOSEND, GROQ_API_KEY
# Returns: 0 when nothing was recording, the hold was shorter than MIN_MS,
#          or the text was injected; 1 on any failure. Every failure path
#          logs and raises an "err" notification instead of letting
#          `set -e` terminate the script silently.
cmd_stop() {
  [ -f "$PID_FILE" ] || { log "stop: no recording in progress"; return 0; }
  _pid=$(cat "$PID_FILE")
  _start=$(cat "$START_FILE" 2>/dev/null || echo 0)
  # An empty or corrupt start stamp would be a syntax error in the
  # arithmetic below; treat it like a missing one.
  case $_start in
    '' | *[!0-9]*) _start=0 ;;
  esac
  _dur_ms=$(( $(now_ms) - _start ))

  # `q` on stdin is ffmpeg's clean-stop signal but with -nostdin we use
  # SIGINT — ffmpeg flushes the wav header on SIGINT.
  kill -INT "$_pid" 2>/dev/null || true
  # Give ffmpeg up to ~3 s (30 x 0.1 s) to finalize the file.
  _i=0
  while kill -0 "$_pid" 2>/dev/null && [ "$_i" -lt 30 ]; do
    sleep 0.1
    _i=$((_i + 1))
  done
  rm -f "$PID_FILE" "$START_FILE"

  if [ "$_dur_ms" -lt "$MIN_MS" ]; then
    log "stop: too short (${_dur_ms}ms < ${MIN_MS}ms), discarding"
    rm -f "$WAV_FILE"
    return 0
  fi
  [ -s "$WAV_FILE" ] || { log "stop: empty recording"; notify "empty recording" err; return 1; }
  if [ -z "${GROQ_API_KEY:-}" ]; then
    notify "GROQ_API_KEY not set" err
    log "GROQ_API_KEY missing"
    return 1
  fi

  log "transcribing ${_dur_ms}ms via $MODEL"
  # Fetch and parse in two guarded steps. A transport/HTTP error or a
  # non-JSON body (which makes jq exit non-zero) must end up in the
  # "transcription empty" path below rather than aborting under `set -e`.
  _resp=$(curl -sS --fail-with-body \
    -H "Authorization: Bearer $GROQ_API_KEY" \
    -F "file=@$WAV_FILE" \
    -F "model=$MODEL" \
    -F "response_format=json" \
    https://api.groq.com/openai/v1/audio/transcriptions 2>>"$LOG_FILE") \
    || log "transcription request failed (curl exit $?): $_resp"
  _txt=$(printf '%s' "$_resp" | jq -r '.text // empty' 2>>"$LOG_FILE") || _txt=
  rm -f "$WAV_FILE"
  if [ -z "$_txt" ]; then
    notify "transcription empty" err
    log "empty transcription"
    return 1
  fi
  log "text: $_txt"

  _target=$(resolve_target) || { notify "no target session" err; return 1; }
  _host=$(printf '%s' "$_target" | cut -f1)
  _sess=$(printf '%s' "$_target" | cut -f2)
  log "inject → $_host/$_sess"

  # Use `tmux send-keys -l` to send the text literally (no key-name
  # interpretation); `--` stops a transcript that begins with "-" from
  # being parsed as send-keys flags. Then optional Enter if autosend.
  _sent=1
  if is_local_host "$_host"; then
    tmux send-keys -t "$_sess" -l -- "$_txt" 2>>"$LOG_FILE" || _sent=0
    if [ "$_sent" = 1 ] && [ "$AUTOSEND" = "1" ]; then
      tmux send-keys -t "$_sess" Enter 2>>"$LOG_FILE" || _sent=0
    fi
  else
    # The command string is re-parsed by the remote shell, so both the
    # text and the session name are wrapped in single quotes with any
    # embedded single quote rewritten as '\''.
    _esc=$(printf %s "$_txt" | sed "s/'/'\\\\''/g")
    _esc_sess=$(printf %s "$_sess" | sed "s/'/'\\\\''/g")
    ssh -o BatchMode=yes -o ConnectTimeout=5 "$_host" \
      "tmux send-keys -t '$_esc_sess' -l -- '$_esc'" 2>>"$LOG_FILE" || _sent=0
    if [ "$_sent" = 1 ] && [ "$AUTOSEND" = "1" ]; then
      ssh -o BatchMode=yes -o ConnectTimeout=5 "$_host" \
        "tmux send-keys -t '$_esc_sess' Enter" 2>>"$LOG_FILE" || _sent=0
    fi
  fi

  if [ "$_sent" != 1 ]; then
    log "inject failed for $_host/$_sess"
    notify "inject failed" err
    return 1
  fi
  notify "✓ $_txt" ok
}
# Abort the current recording without transcribing: interrupt the recorder
# and discard all per-recording state. A no-op when nothing is recording.
cmd_cancel() {
  if [ ! -f "$PID_FILE" ]; then
    return 0
  fi
  # The recorder may already have exited; a failed signal is not an error.
  kill -INT "$(cat "$PID_FILE")" 2>/dev/null || true
  rm -f "$PID_FILE" \
    "$START_FILE" \
    "$WAV_FILE"
  log "cancel"
}
# Debug command: print the resolved "<host>\t<session>" target and pass
# through resolve_target's exit status.
cmd_target() {
  resolve_target
}
# Succeed (status 0) when $1 names this machine: one of the well-known
# loopback aliases, or the output of `hostname` / `hostname -s`.
# Returns 1 otherwise.
is_local_host() {
  # Quoted command substitutions are matched literally, and patterns are
  # only expanded until one matches, so `hostname` runs only when needed.
  case $1 in
    local | localhost | 127.0.0.1 | ::1) return 0 ;;
    "$(hostname)") return 0 ;;
    "$(hostname -s 2>/dev/null)") return 0 ;;
  esac
  return 1
}
# Library mode: when this file is sourced with RVOICE_LIB_ONLY=1 (as
# tests/run-tests.sh does) stop here so only the definitions are loaded.
# `return` works when sourced; `exit` covers direct execution.
case ${RVOICE_LIB_ONLY:-0} in
  1) return 0 2>/dev/null || exit 0 ;;
esac

# Print the usage text to stderr.
_rvoice_usage() {
  cat <<EOF >&2
usage: rvoice {start|stop|cancel|target|log}
This script is meant to be driven by a PTT key binding (Hammerspoon).
See ~/Code/@scripts/session-tools/hammerspoon/rvoice.lua for the
companion config.
EOF
}

# Dispatch on the first argument; anything unrecognised (including no
# argument at all) prints usage and exits with status 2.
case ${1:-} in
  start)
    cmd_start
    ;;
  stop)
    cmd_stop
    ;;
  cancel)
    cmd_cancel
    ;;
  target)
    cmd_target
    ;;
  log)
    tail -50 "$LOG_FILE" 2>/dev/null
    ;;
  *)
    _rvoice_usage
    exit 2
    ;;
esac