From 96a38d884cb236fdb779c332b24ca7c3a8f004db Mon Sep 17 00:00:00 2001 From: Natalie Date: Sun, 28 Jun 2026 07:33:30 -0400 Subject: [PATCH] Revert "feat: local macOS `say` fallback when remote Chatterbox is unreachable" This reverts commit 44a80003b00bc515267a4d1bbec4bf75b3345f17. --- package.json | 2 +- src/tools/synthesis.ts | 142 ++++++++++------------------------------- 2 files changed, 34 insertions(+), 110 deletions(-) diff --git a/package.json b/package.json index a702783..03f3c4d 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@lilith/speech-synthesis-mcp", - "version": "1.1.0", + "version": "1.0.1", "description": "MCP server for the Chatterbox TTS speech-synthesis service", "type": "module", "main": "./dist/index.js", diff --git a/src/tools/synthesis.ts b/src/tools/synthesis.ts index e06d6a6..659006e 100644 --- a/src/tools/synthesis.ts +++ b/src/tools/synthesis.ts @@ -3,7 +3,7 @@ import { writeFileSync, readFileSync, existsSync } from 'fs'; import { randomUUID } from 'crypto'; import { tmpdir, homedir } from 'os'; import { join } from 'path'; -import { rawFetch, BASE_URL } from '../client'; +import { rawFetch } from '../client'; import type { ToolEntry, ContentBlock } from '../types'; import { jsonContent } from '../types'; @@ -31,74 +31,6 @@ const PLAYBACK_SSH_OPTS = process.env['SPEECH_PLAYBACK_SSH_OPTS'] ?? '-o BatchMode=yes -o ServerAliveInterval=15 -o ServerAliveCountMax=4'; -// Local fallback: when the remote Chatterbox service is unreachable (e.g. the -// GPU host is offline or the mesh link is down), synthesize on the MCP host -// itself using macOS `say`. Lower fidelity, but it always works without a GPU -// and keeps spoken notifications flowing. macOS-only (no `say` on Linux). -// -// SPEECH_FALLBACK=off # disable local fallback entirely -// SPEECH_FALLBACK_VOICE= # macOS voice (e.g. "Samantha"); default system voice -// SPEECH_FALLBACK_RATE= # speaking rate in words/min (e.g. 180) -// SPEECH_PRIMARY_ATTEMPTS= # remote submit retries before failover (default 10) -const SAY_BIN = '/usr/bin/say'; -const FALLBACK_ENABLED = process.env['SPEECH_FALLBACK'] !== 'off'; -const FALLBACK_VOICE = process.env['SPEECH_FALLBACK_VOICE']; -const FALLBACK_RATE = process.env['SPEECH_FALLBACK_RATE']; -const PRIMARY_ATTEMPTS = (() => { - const n = Number(process.env['SPEECH_PRIMARY_ATTEMPTS']); - return Number.isInteger(n) && n > 0 ? n : 10; -})(); - -function fallbackAvailable(): boolean { - return FALLBACK_ENABLED && IS_MACOS && existsSync(SAY_BIN); -} - -function isNetworkError(err: unknown): boolean { - const message = err instanceof Error ? err.message : String(err); - return message.includes('Failed to fetch') || message.includes('TTS service unavailable'); -} - -// Generate speech locally via macOS `say` into an AIFF file (afplay-native). -// Strips Chatterbox inline tags like [laugh] since `say` would read them aloud. -function speakFallback(text: string): string { - const clean = text.replace(/\[[^\]]*\]/g, ' ').replace(/\s+/g, ' ').trim() || text; - const outFile = join(tmpdir(), `speech-fallback-${randomUUID()}.aiff`); - const sayArgs = ['-o', outFile]; - if (FALLBACK_VOICE) sayArgs.push('-v', FALLBACK_VOICE); - if (FALLBACK_RATE) sayArgs.push('-r', FALLBACK_RATE); - sayArgs.push(clean); - - const result = spawnSync(SAY_BIN, sayArgs, { encoding: 'utf8', timeout: 30000 }); - if (result.status !== 0) { - const detail = (result.stderr ?? '').trim() || `exit ${result.status ?? -1}`; - throw new Error(`Local fallback TTS (say) failed: ${detail}`); - } - return outFile; -} - -// Play a synthesized audio file in the background, cleaning up afterwards. -// Routes through the same playback machinery as primary synthesis: stream over -// ssh to PLAYBACK_HOST if set, else afplay (macOS) / pw-play under flock (Linux). -function playAudioFile(file: string): void { - let playCmd: string; - if (PLAYBACK_HOST) { - const remote = - 'f=$(mktemp -t splay.XXXXXX) && ' + - `mv "$f" "$f.wav" && f="$f.wav" && ` + - `cat > "$f" && ${PLAYBACK_PLAYER} "$f"; rm -f "$f"`; - const remoteEsc = remote.replace(/'/g, `'\\''`); - playCmd = - `cat ${file} | ssh ${PLAYBACK_SSH_OPTS} ${PLAYBACK_HOST} '${remoteEsc}'; ` + - `rm -f ${file}`; - } else if (IS_MACOS) { - playCmd = `${AUDIO_PLAYER} ${file}; rm -f ${file}`; - } else { - playCmd = `flock ${NOTIFY_LOCK} -c "${AUDIO_PLAYER} ${file}; rm -f ${file}"`; - } - const shell = spawn('/bin/bash', ['-c', playCmd], { detached: true, stdio: 'ignore' }); - shell.unref(); -} - interface Personality { voice_id: string | null; exaggeration: number; @@ -163,7 +95,7 @@ async function rawFetchWithRetry( throw lastError ?? new Error('TTS service unavailable'); } -async function submitAndPoll(body: Record, attempts: number = 10): Promise<{ +async function submitAndPoll(body: Record): Promise<{ audio_base64: string; format: string; sample_rate: number; @@ -174,7 +106,6 @@ async function submitAndPoll(body: Record, attempts: number = 1 const submitted = await rawFetchWithRetry<{ job_id: string; status: string; queue_position: number }>( '/jobs', { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify(body) }, - attempts, ); const { job_id: jobId } = submitted; @@ -197,7 +128,7 @@ export function synthesisTools(): ToolEntry[] { definition: { name: 'synthesize', description: - 'Synthesize speech from text using Chatterbox TTS. Plays automatically with cross-session queueing (no overlapping speech). Fire-and-forget: returns immediately while audio plays. Choose a personality to control voice character and emotional delivery. If the remote Chatterbox service is unreachable, automatically falls back to local macOS `say` so notifications still play (response reports engine: "local-fallback").', + 'Synthesize speech from text using Chatterbox TTS. Plays automatically with cross-session queueing (no overlapping speech). Fire-and-forget: returns immediately while audio plays. Choose a personality to control voice character and emotional delivery.', inputSchema: { type: 'object' as const, properties: { @@ -228,45 +159,45 @@ export function synthesisTools(): ToolEntry[] { if (personality.exaggeration !== undefined) body['exaggeration'] = personality.exaggeration; if (personality.cfg_weight !== undefined) body['cfg_weight'] = personality.cfg_weight; - let result: Awaited>; - try { - result = await submitAndPoll(body, PRIMARY_ATTEMPTS); - } catch (err) { - // Remote Chatterbox unreachable — fall back to local macOS `say` - // so notifications still get spoken. Only for network failures; - // a real synthesis error (job 'failed') is surfaced as-is. - if (fallbackAvailable() && isNetworkError(err)) { - const file = speakFallback(args['text'] as string); - playAudioFile(file); - return [ - { - type: 'text', - text: JSON.stringify({ - queued: true, - engine: 'local-fallback', - fallback_reason: `Chatterbox unreachable at ${BASE_URL}; spoke via macOS say`, - voice: FALLBACK_VOICE ?? '(system default)', - personality: personalityName, - }, null, 2), - }, - ]; - } - throw err; - } + const result = await submitAndPoll(body); const audioBuffer = Buffer.from(result.audio_base64, 'base64'); const tmpFile = join(tmpdir(), `speech-notify-${randomUUID()}.wav`); writeFileSync(tmpFile, audioBuffer); - // Play audio in the background then clean up (see playAudioFile). - playAudioFile(tmpFile); + // Spawn background process: play audio then cleanup + // Linux: flock serializes across sessions to prevent overlapping speech + // macOS: afplay blocks until done; flock unavailable but overlap unlikely (5-min nag interval) + // Remote: stream wav over ssh to PLAYBACK_HOST, where it's written to + // a remote tmp file and afplayed (afplay can't read from a pipe). + let playCmd: string; + if (PLAYBACK_HOST) { + const remote = + 'f=$(mktemp -t splay.XXXXXX) && ' + + `mv "$f" "$f.wav" && f="$f.wav" && ` + + `cat > "$f" && ${PLAYBACK_PLAYER} "$f"; rm -f "$f"`; + // Single-quote-escape the remote command for safe embedding. + const remoteEsc = remote.replace(/'/g, `'\\''`); + playCmd = + `cat ${tmpFile} | ssh ${PLAYBACK_SSH_OPTS} ${PLAYBACK_HOST} '${remoteEsc}'; ` + + `rm -f ${tmpFile}`; + } else if (IS_MACOS) { + playCmd = `${AUDIO_PLAYER} ${tmpFile}; rm -f ${tmpFile}`; + } else { + playCmd = `flock ${NOTIFY_LOCK} -c "${AUDIO_PLAYER} ${tmpFile}; rm -f ${tmpFile}"`; + } + const shell = spawn( + '/bin/bash', + ['-c', playCmd], + { detached: true, stdio: 'ignore' }, + ); + shell.unref(); return [ { type: 'text', text: JSON.stringify({ queued: true, - engine: 'chatterbox', personality: personalityName, estimated_duration_seconds: result.duration_seconds, text_processed: result.text_processed, @@ -322,15 +253,8 @@ export function synthesisTools(): ToolEntry[] { }, }, handler: async (): Promise => { - try { - const result = await rawFetch<{ ready: boolean }>('/ready'); - return [{ type: 'text', text: result.ready ? 'Model is loaded and ready.' : 'Model is NOT loaded (idle-stopped). First notify call will wake it — expect ~10s delay.' }]; - } catch (err) { - if (isNetworkError(err) && fallbackAvailable()) { - return [{ type: 'text', text: `Remote Chatterbox unreachable at ${BASE_URL}. Local fallback (macOS say) is available — synthesize will speak locally.` }]; - } - throw err; - } + const result = await rawFetch<{ ready: boolean }>('/ready'); + return [{ type: 'text', text: result.ready ? 'Model is loaded and ready.' : 'Model is NOT loaded (idle-stopped). First notify call will wake it — expect ~10s delay.' }]; }, }, ];