docs(@scripts): ✨ update rvoice docs to use LAN speech-synthesis
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
parent
4a5b2f7273
commit
fedabb0924
3 changed files with 69 additions and 51 deletions
40
bin/rvoice
40
bin/rvoice
|
|
@ -4,8 +4,9 @@
|
||||||
# Designed for the case where claude runs on another host (apricot) and the
|
# Designed for the case where claude runs on another host (apricot) and the
|
||||||
# mic + keyboard are on the local Mac. /voice doesn't work over ssh because
|
# mic + keyboard are on the local Mac. /voice doesn't work over ssh because
|
||||||
# the claude binary tries to open the *remote* host's microphone. This
|
# the claude binary tries to open the *remote* host's microphone. This
|
||||||
# helper records locally, transcribes via Groq Whisper (no local RAM hit),
|
# helper records locally, transcribes via apricot's LAN speech-synthesis
|
||||||
# and injects the transcript into the active remote tmux session via
|
# service (Whisper, GPU-accelerated, no external API dependency), and
|
||||||
|
# injects the transcript into the active remote tmux session via
|
||||||
# `tmux send-keys` over ssh.
|
# `tmux send-keys` over ssh.
|
||||||
#
|
#
|
||||||
# Usage:
|
# Usage:
|
||||||
|
|
@ -16,8 +17,12 @@
|
||||||
# active iTerm2 tab (debug)
|
# active iTerm2 tab (debug)
|
||||||
#
|
#
|
||||||
# Config: $XDG_CONFIG_HOME/rvoice/config (sourced; same convention as rclaude).
|
# Config: $XDG_CONFIG_HOME/rvoice/config (sourced; same convention as rclaude).
|
||||||
# Required env: GROQ_API_KEY. Optional:
|
# Transcription uses the LAN-resident speech-synthesis service on apricot
|
||||||
# RVOICE_MODEL=whisper-large-v3-turbo (default; very fast)
|
# (Whisper, GPU-accelerated, no external API dependency).
|
||||||
|
# Optional env:
|
||||||
|
# RVOICE_STT_URL=http://apricot.lan:8000 (speech-synthesis service base URL)
|
||||||
|
# RVOICE_MODEL=base (tiny|base|small|medium|large-v2|large-v3)
|
||||||
|
# RVOICE_LANG=en (language hint sent to the STT service; defaults to en)
|
||||||
# RVOICE_HOST=apricot.lan (overrides iTerm2 detection)
|
# RVOICE_HOST=apricot.lan (overrides iTerm2 detection)
|
||||||
# RVOICE_SESSION=claude-natalie-... (overrides iTerm2 detection)
|
# RVOICE_SESSION=claude-natalie-... (overrides iTerm2 detection)
|
||||||
# RVOICE_AUTOSEND=1 (append Enter; default 0)
|
# RVOICE_AUTOSEND=1 (append Enter; default 0)
|
||||||
|
|
@ -31,7 +36,9 @@ set -eu
|
||||||
CONF_DIR=${XDG_CONFIG_HOME:-$HOME/.config}/rvoice
|
CONF_DIR=${XDG_CONFIG_HOME:-$HOME/.config}/rvoice
|
||||||
[ -r "$CONF_DIR/config" ] && . "$CONF_DIR/config"
|
[ -r "$CONF_DIR/config" ] && . "$CONF_DIR/config"
|
||||||
|
|
||||||
MODEL=${RVOICE_MODEL:-whisper-large-v3-turbo}
|
STT_URL=${RVOICE_STT_URL:-http://apricot.lan:8000}
|
||||||
|
MODEL=${RVOICE_MODEL:-base}
|
||||||
|
# Language hint forwarded to the STT service.
#   RVOICE_LANG unset  -> "en"
#   RVOICE_LANG=""     -> empty; the ${LANG_HINT:+...} guard on the curl call
#                         then omits the language field (presumably letting the
#                         service auto-detect -- confirm against the service).
# "-" is used instead of ":-" so an explicitly empty value is preserved rather
# than being overwritten by the default.
LANG_HINT=${RVOICE_LANG-en}
|
||||||
AUTOSEND=${RVOICE_AUTOSEND:-0}
|
AUTOSEND=${RVOICE_AUTOSEND:-0}
|
||||||
MIN_MS=${RVOICE_MIN_MS:-200}
|
MIN_MS=${RVOICE_MIN_MS:-200}
|
||||||
MAX_S=${RVOICE_MAX_S:-60}
|
MAX_S=${RVOICE_MAX_S:-60}
|
||||||
|
|
@ -129,19 +136,18 @@ cmd_stop() {
|
||||||
fi
|
fi
|
||||||
[ -s "$WAV_FILE" ] || { log "stop: empty recording"; notify "empty recording" err; return 1; }
|
[ -s "$WAV_FILE" ] || { log "stop: empty recording"; notify "empty recording" err; return 1; }
|
||||||
|
|
||||||
if [ -z "${GROQ_API_KEY:-}" ]; then
|
log "transcribing ${_dur_ms}ms via $STT_URL (model=$MODEL lang=$LANG_HINT)"
|
||||||
notify "GROQ_API_KEY not set" err
|
_resp=$(curl -sS --fail-with-body \
|
||||||
log "GROQ_API_KEY missing"
|
-F "audio=@$WAV_FILE" \
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
log "transcribing ${_dur_ms}ms via $MODEL"
|
|
||||||
_txt=$(curl -sS --fail-with-body \
|
|
||||||
-H "Authorization: Bearer $GROQ_API_KEY" \
|
|
||||||
-F "file=@$WAV_FILE" \
|
|
||||||
-F "model=$MODEL" \
|
-F "model=$MODEL" \
|
||||||
-F "response_format=json" \
|
${LANG_HINT:+-F "language=$LANG_HINT"} \
|
||||||
https://api.groq.com/openai/v1/audio/transcriptions \
|
-F "task=transcribe" \
|
||||||
| jq -r '.text // empty')
|
"$STT_URL/stt/transcribe" 2>>"$LOG_FILE") || {
|
||||||
|
notify "STT request failed" err
|
||||||
|
log "curl failed against $STT_URL/stt/transcribe"
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
_txt=$(printf '%s' "$_resp" | jq -r '.text // empty')
|
||||||
rm -f "$WAV_FILE"
|
rm -f "$WAV_FILE"
|
||||||
if [ -z "$_txt" ]; then
|
if [ -z "$_txt" ]; then
|
||||||
notify "transcription empty" err
|
notify "transcription empty" err
|
||||||
|
|
|
||||||
|
|
@ -4,11 +4,12 @@
|
||||||
running on**. When you're sshed to apricot through `cc` / `rclaude resume`,
|
running on**. When you're sshed to apricot through `cc` / `rclaude resume`,
|
||||||
that's apricot — which has no mic. `rvoice` fills the gap.
|
that's apricot — which has no mic. `rvoice` fills the gap.
|
||||||
|
|
||||||
It records audio locally on macOS, transcribes via Groq Whisper (no local model
|
It records audio locally on macOS, transcribes via the **LAN speech-synthesis
|
||||||
RAM), and injects the transcript into the active remote tmux session via
|
service on apricot** (Whisper, GPU-accelerated, no API keys / no network
|
||||||
`tmux send-keys` over ssh. The target session is auto-detected from the
|
egress beyond the local LAN), and injects the transcript into the active
|
||||||
focused iTerm2 tab title (set by the canonical session-tools `tmux.conf` to
|
remote tmux session via `tmux send-keys` over ssh. The target session is
|
||||||
`<host> · <session>`).
|
auto-detected from the focused iTerm2 tab title (set by the canonical
|
||||||
|
session-tools `tmux.conf` to `<host> · <session>`).
|
||||||
|
|
||||||
## Architecture
|
## Architecture
|
||||||
|
|
||||||
|
|
@ -17,13 +18,14 @@ focused iTerm2 tab title (set by the canonical session-tools `tmux.conf` to
|
||||||
[ Right ⌥ up ] ──Hammerspoon──▶ rvoice stop
|
[ Right ⌥ up ] ──Hammerspoon──▶ rvoice stop
|
||||||
│
|
│
|
||||||
▼
|
▼
|
||||||
POST WAV → Groq /audio/transcriptions
|
POST WAV → http://apricot.lan:8000/stt/transcribe
|
||||||
|
(faster-whisper on GPU, ~base model)
|
||||||
│
|
│
|
||||||
▼
|
▼
|
||||||
iTerm2 active tab title → "apricot · claude-…"
|
iTerm2 active tab title → "apricot · claude-…"
|
||||||
│
|
│
|
||||||
▼
|
▼
|
||||||
ssh apricot tmux send-keys -t claude-… -l "<text>"
|
ssh apricot tmux send-keys -t claude-… -l "<text>"
|
||||||
```
|
```
|
||||||
|
|
||||||
## Files
|
## Files
|
||||||
|
|
@ -32,25 +34,23 @@ focused iTerm2 tab title (set by the canonical session-tools `tmux.conf` to
|
||||||
|------------------------------------------------------|---------------------------------------|
|
|------------------------------------------------------|---------------------------------------|
|
||||||
| `bin/rvoice` | CLI: `start`/`stop`/`cancel`/`target`/`log` |
|
| `bin/rvoice` | CLI: `start`/`stop`/`cancel`/`target`/`log` |
|
||||||
| `hammerspoon/rvoice.lua` | Right-⌥ hold detector → calls `rvoice` |
|
| `hammerspoon/rvoice.lua` | Right-⌥ hold detector → calls `rvoice` |
|
||||||
| `~/.config/rvoice/config` | Sourced at startup; holds `GROQ_API_KEY` and tweaks |
|
| `~/.config/rvoice/config` | Sourced at startup; overrides STT URL, model, etc. |
|
||||||
| `$TMPDIR/rvoice/` | Per-recording state (pid, wav, log) |
|
| `$TMPDIR/rvoice/` | Per-recording state (pid, wav, log) |
|
||||||
|
|
||||||
## Install
|
## Install
|
||||||
|
|
||||||
Prerequisites: `ffmpeg`, `jq`, `curl` (all `brew install`able), a Groq API key
|
Prerequisites: `ffmpeg`, `jq`, `curl` (all `brew install`able), Hammerspoon
|
||||||
(free tier — https://console.groq.com/keys), and Hammerspoon
|
(`brew install --cask hammerspoon`), and the LAN speech-synthesis service
|
||||||
(`brew install --cask hammerspoon`).
|
running on apricot (already deployed at `apricot.lan:8000`, exposes
|
||||||
|
`/stt/transcribe`). No API keys, no cloud round-trip.
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
# 1. Symlink rvoice (already done if you ran install.sh)
|
# 1. Symlink rvoice (already done if you ran install.sh)
|
||||||
ln -sfn ~/Code/@scripts/session-tools/bin/rvoice ~/.local/bin/rvoice
|
ln -sfn ~/Code/@scripts/session-tools/bin/rvoice ~/.local/bin/rvoice
|
||||||
|
|
||||||
# 2. Drop your Groq key
|
# 2. (Optional) override defaults in ~/.config/rvoice/config — see the
|
||||||
mkdir -p ~/.config/rvoice
|
# "Config" section below. The default is to POST to apricot.lan:8000 and
|
||||||
cat >> ~/.config/rvoice/config <<'EOF'
|
# use the `base` Whisper model.
|
||||||
export GROQ_API_KEY=gsk_...your_key...
|
|
||||||
# export RVOICE_AUTOSEND=1 # uncomment to auto-press Enter after injection
|
|
||||||
EOF
|
|
||||||
|
|
||||||
# 3. Wire up Hammerspoon
|
# 3. Wire up Hammerspoon
|
||||||
mkdir -p ~/.hammerspoon
|
mkdir -p ~/.hammerspoon
|
||||||
|
|
@ -60,6 +60,11 @@ open /Applications/Hammerspoon.app
|
||||||
|
|
||||||
# 4. From Hammerspoon's menu bar → Reload Config.
|
# 4. From Hammerspoon's menu bar → Reload Config.
|
||||||
# Grant Accessibility + Microphone permission when macOS prompts.
|
# Grant Accessibility + Microphone permission when macOS prompts.
|
||||||
|
|
||||||
|
# 5. Smoke-test the STT endpoint without Hammerspoon:
|
||||||
|
ffmpeg -f avfoundation -i ":0" -ac 1 -ar 16000 -t 5 /tmp/me.wav
|
||||||
|
curl -F "audio=@/tmp/me.wav" -F "model=base" -F "language=en" -F "task=transcribe" \
|
||||||
|
http://apricot.lan:8000/stt/transcribe | jq .text
|
||||||
```
|
```
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
@ -79,8 +84,9 @@ From any iTerm2 tab that's attached to a remote claude session via `cc` or
|
||||||
Plain shell fragment sourced at startup. Defaults shown.
|
Plain shell fragment sourced at startup. Defaults shown.
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
export GROQ_API_KEY=... # REQUIRED
|
export RVOICE_STT_URL=http://apricot.lan:8000 # speech-synthesis service
|
||||||
export RVOICE_MODEL=whisper-large-v3-turbo # Groq model id
|
export RVOICE_MODEL=base # tiny|base|small|medium|large-v2|large-v3
|
||||||
|
export RVOICE_LANG=en # omit/empty = auto-detect
|
||||||
export RVOICE_AUTOSEND=0 # 1 = press Enter after inject
|
export RVOICE_AUTOSEND=0 # 1 = press Enter after inject
|
||||||
export RVOICE_MIN_MS=200 # ignore taps shorter than this (debounce)
|
export RVOICE_MIN_MS=200 # ignore taps shorter than this (debounce)
|
||||||
export RVOICE_MAX_S=60 # hard cap on a single recording
|
export RVOICE_MAX_S=60 # hard cap on a single recording
|
||||||
|
|
@ -88,7 +94,12 @@ export RVOICE_HOST=apricot.lan # force target host (overri
|
||||||
export RVOICE_SESSION=claude-natalie-… # force target tmux session
|
export RVOICE_SESSION=claude-natalie-… # force target tmux session
|
||||||
```
|
```
|
||||||
|
|
||||||
Override any of these per-invocation: `RVOICE_AUTOSEND=1 rvoice stop`.
|
Override any of these per-invocation: `RVOICE_MODEL=small rvoice stop`.
|
||||||
|
|
||||||
|
**Model trade-offs** (apricot's GPU; latency rough):
|
||||||
|
- `tiny.en` / `base` — sub-second, fine for short prompts
|
||||||
|
- `small` — ~1s, noticeable quality bump
|
||||||
|
- `medium` / `large-v3` — 2-4s, near-perfect, worth it for paragraphs
|
||||||
|
|
||||||
## Subcommands
|
## Subcommands
|
||||||
|
|
||||||
|
|
@ -102,9 +113,9 @@ rvoice log # tail -50 of the action log
|
||||||
|
|
||||||
## Troubleshooting
|
## Troubleshooting
|
||||||
|
|
||||||
- **"GROQ_API_KEY not set"** — Hammerspoon's shell environment doesn't inherit
|
- **"STT request failed"** — apricot's speech service isn't reachable. Check
|
||||||
from your login shell. Make sure the key is exported in
|
`curl http://apricot.lan:8000/health` and `ssh apricot.lan systemctl --user
|
||||||
`~/.config/rvoice/config`; rvoice sources that file before each invocation.
|
status` for the relevant unit. Most likely you're off the LAN/VPN.
|
||||||
- **"no target session resolvable"** — the focused iTerm2 tab title isn't in
|
- **"no target session resolvable"** — the focused iTerm2 tab title isn't in
|
||||||
`<host> · <session>` format. Either: (a) you're not in an rclaude/ssh
|
`<host> · <session>` format. Either: (a) you're not in an rclaude/ssh
|
||||||
session, or (b) the remote tmux config didn't get the title-setting fragment.
|
session, or (b) the remote tmux config didn't get the title-setting fragment.
|
||||||
|
|
@ -113,16 +124,15 @@ rvoice log # tail -50 of the action log
|
||||||
- **Hammerspoon doesn't see Right ⌥** — System Settings → Privacy &
|
- **Hammerspoon doesn't see Right ⌥** — System Settings → Privacy &
|
||||||
Security → Accessibility → enable Hammerspoon. Also Microphone for the
|
Security → Accessibility → enable Hammerspoon. Also Microphone for the
|
||||||
recording step. Restart Hammerspoon after granting.
|
recording step. Restart Hammerspoon after granting.
|
||||||
- **Transcription returns nonsense** — Groq's `whisper-large-v3-turbo` is
|
- **Transcription returns empty / nonsense** — bump the model: `RVOICE_MODEL=small`
|
||||||
multilingual but English-biased. Set `RVOICE_MODEL=whisper-large-v3` for
|
or `medium`. Default `base` trades accuracy for sub-second latency. Models
|
||||||
the slower but more accurate variant.
|
list: `curl http://apricot.lan:8000/stt/models`.
|
||||||
- **Injection types into the wrong session** — `rvoice target` shows what it
|
- **Injection types into the wrong session** — `rvoice target` shows what it
|
||||||
will hit. If wrong, set `RVOICE_HOST` / `RVOICE_SESSION` in config to pin
|
will hit. If wrong, set `RVOICE_HOST` / `RVOICE_SESSION` in config to pin
|
||||||
the target.
|
the target.
|
||||||
- **Latency feels high** — Groq is fast (~500ms for short clips). Network
|
- **Latency feels high** — first call after service idle warms the model on
|
||||||
latency to plum + ssh round-trip to apricot adds ~200ms. Local Whisper
|
apricot's GPU (1-2s one-time). Subsequent calls are sub-second for `base`.
|
||||||
would be slower in practice on most laptops once you account for model
|
Switch to `tiny.en` for the lowest-latency tier.
|
||||||
load.
|
|
||||||
|
|
||||||
## Why this architecture (vs. /voice over ssh)
|
## Why this architecture (vs. /voice over ssh)
|
||||||
|
|
||||||
|
|
@ -137,6 +147,7 @@ remote rclaude session would be:
|
||||||
claude release)
|
claude release)
|
||||||
3. **Reproduce /voice's behavior with our own pieces** ← this is rvoice
|
3. **Reproduce /voice's behavior with our own pieces** ← this is rvoice
|
||||||
|
|
||||||
`rvoice` keeps the mic and the hotkey on the Mac, runs transcription on a
|
`rvoice` keeps the mic and the hotkey on the Mac, runs transcription on
|
||||||
hosted endpoint (zero local RAM), and uses tmux's existing send-keys
|
apricot's own LAN-resident speech-synthesis service (GPU Whisper, zero
|
||||||
|
local model RAM, no cloud egress), and uses tmux's existing send-keys
|
||||||
protocol to deliver text — every layer is well-understood and stable.
|
protocol to deliver text — every layer is well-understood and stable.
|
||||||
|
|
|
||||||
|
|
@ -41,7 +41,8 @@ local function run(cmd)
|
||||||
hs.printf("[rvoice] %s exited %d: %s", cmd, exit, err or "")
|
hs.printf("[rvoice] %s exited %d: %s", cmd, exit, err or "")
|
||||||
end
|
end
|
||||||
end, {"-c", RVOICE .. " " .. cmd})
|
end, {"-c", RVOICE .. " " .. cmd})
|
||||||
-- Inherit user shell env so GROQ_API_KEY (and PATH for ffmpeg/jq) work.
|
-- Inherit user shell env so PATH for ffmpeg/jq is set and rvoice can
|
||||||
|
-- source ~/.config/rvoice/config to pick up any user overrides.
|
||||||
t:setEnvironment(hs.execute("env", true):gsub("\n$", "") and nil or nil)
|
t:setEnvironment(hs.execute("env", true):gsub("\n$", "") and nil or nil)
|
||||||
t:start()
|
t:start()
|
||||||
end
|
end
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue