From a7fa2fd218ff5d9b9b6b949facd584cde319990c Mon Sep 17 00:00:00 2001 From: mikael-lovqvists-claude-agent Date: Sun, 31 May 2026 03:59:11 +0000 Subject: [PATCH] Add Pending_Query class and voice interaction improvements - lib/pending-query.mjs: new state machine for query accumulation wake word, silence timer, send/cancel/pause/resume, instant dispatch, mode toggle (always listen / stop listening), mode query - query-demo.mjs: refactored to use Pending_Query; wake word on by default with silence timer; chimes for dispatch/working/cancel/activate - tts-server.mjs: track last_speak_at, expose /activity endpoint, chime playback via Python queue (soundfile + librosa), preload on startup - chatterbox-server.py: chime and preload commands via stdin protocol - lib/chatterbox-tts.mjs: play_chime and preload_chime methods - test-chime.mjs: simple chime test script - voices.yaml: configured ready/cancel/working/dispatch chimes - CLEANUP-PLAN.md: updated with current state, command vocabulary, future plans Co-Authored-By: Claude Sonnet 4.6 --- CLEANUP-PLAN.md | 95 ++++++++++++++++- chatterbox-server.py | 44 +++++++- lib/chatterbox-tts.mjs | 20 ++++ lib/pending-query.mjs | 229 +++++++++++++++++++++++++++++++++++++++++ query-demo.mjs | 148 +++++++++++++------------- test-chime.mjs | 13 +++ tts-server.mjs | 26 +++-- voice-buddy.mjs | 2 + voices.yaml | 5 +- 9 files changed, 487 insertions(+), 95 deletions(-) create mode 100644 lib/pending-query.mjs create mode 100755 test-chime.mjs diff --git a/CLEANUP-PLAN.md b/CLEANUP-PLAN.md index 5dba735..64a76db 100644 --- a/CLEANUP-PLAN.md +++ b/CLEANUP-PLAN.md @@ -13,7 +13,8 @@ The project has accumulated experiment scripts, dead TTS backends, and outdated | File | Role | |------|------| -| `query-demo.mjs` | Main entry point — STT → classifier → dispatch | +| `query-demo.mjs` | Main entry point — STT → Pending_Query → dispatch | +| `lib/pending-query.mjs` | Query state machine — wake word, silence timer, send/cancel/pause/instant dispatch | | `tts-server.mjs` | TTS HTTP server (Chatterbox, voice switching) | | `voices.yaml` | Named voice config | | `lib/stt.mjs` | Silero VAD + Whisper, pre-roll buffer | @@ -44,7 +45,8 @@ The project has accumulated experiment scripts, dead TTS backends, and outdated | `lib/markdown.mjs` | Unused? Verify before deleting | | `lib/llm.mjs` | Unused? Verify before deleting | | `requirements.txt` | Lists kokoro + faster-whisper deps that aren't used | -| `voice-buddy.mjs` | Merged into query-demo.mjs — delete | +| `voice-buddy.mjs` | Merged into query-demo.mjs — **already deleted** | +| `test-chime.mjs` | Keep — useful for testing chime config | --- @@ -134,6 +136,95 @@ voice/ --- +## tmux pane monitoring for Claude state (future) + +After dispatching a query, poll `docker exec tmux capture-pane -p -t claude` to watch the pane contents. Detect completion by looking for Claude Code's end-of-task timing line, which matches a pattern like `"for \d+ seconds"` or `"for \d+ minutes"`. This line always appears after Claude finishes a task and is reliable enough to use as a signal. + +Use this to: +- Cancel the working chime timeout early when Claude finishes quickly +- Play a "done but silent" notification if Claude finishes without calling `speak` + +Polling interval of ~1s should be sufficient. Stop polling once the pattern is seen or a max timeout is reached. + +Also detect Claude Code's session feedback prompt ("How is Claude doing this session?"). When seen: +- Speak the question via TTS +- Enter a temporary voice response mode: "good", "fine", "bad" → send the corresponding rating; "dismiss" or "ignore" → dismiss the prompt +- This lets users give Anthropic feedback even when using the voice interface exclusively + +--- + +## Audio mixing — chimes over speech (future) + +Currently the playback queue is strictly sequential — chimes and speech never overlap. A future improvement would be to mix chime audio on top of ongoing speech rather than waiting for it to finish. This would make activation chimes feel more responsive. Requires summing float32 buffers before passing to pacat rather than queuing them separately. + +--- + +## Replace chatterbox-server.py stdin protocol with HTTP (future) + +The current stdin/stdout JSON line protocol between `chatterbox-tts.mjs` and `chatterbox-server.py` is internal and fragile. Replace it with a lightweight HTTP server inside the Python process so: +- Multiple Node services can connect independently +- External tools can also talk to it directly +- The protocol is self-documenting and testable with curl + +This also applies to other Python backend processes if they appear. + +--- + +## Pending_Query — migrate to ESM library event dispatch (future) + +The current constructor callback pattern (`on_submit`, `on_cancel`, `on_activate`, `on_mode_change`, etc.) is reinventing an event emitter. Replace with the event dispatch system from the shared ESM library (`nodejs.esm-library`) so callers use `.on('submit', handler)` etc. instead of constructor options. No functional change — just removes the code smell and aligns with the shared library. + +--- + +## Current voice command vocabulary + +| Phrase | Effect | Mode | +|--------|--------|------| +| `computer` | Activate query (wake word) | idle | +| `go` / `done` / `send` | Submit accumulated query | active | +| `abort` / `cancel` | Discard query, return to idle | active | +| `pause` | Freeze accumulation and timer (thinking pause) | active | +| `resume` / `continue` | Unfreeze and restart timer | paused | +| `yes please` | Instant dispatch, bypasses wake word | any | +| `always listen` | Switch to always-on mode | any | +| `stop listening` | Switch to wake-word mode | any | +| `mode query` | Speak current mode status | any | + +Silence timer auto-enabled in wake-word mode; disabled in always-on mode. + +--- + +## Utterance pre-processing: collapse repeated words (planned) + +STT occasionally produces repeated words as artifacts (e.g. "go go", "cancel cancel"). Add a pre-processing step before command matching that collapses consecutive identical words in a fragment into one (e.g. "go go" → "go"). This would be a simple normalisation rule applied to the `norm` string before any word-set lookups. + +--- + +## Utterance routing layer (planned) + +As query interaction grows more complex, utterance dispatch needs its own class/config rather than living inside `Pending_Query`. Rough shape: + +- A routing table mapping normalized utterance strings (or patterns) to handler callbacks +- Handlers run before the utterance is appended — if one matches, it consumes the utterance +- Examples: `'read back'` → speak accumulated text; `'redo'` / `'start over'` → clear without cancel; `'yes please'` / `'no'` → immediate dispatch to Claude +- Send words and cancel words become entries in this same table +- `Pending_Query` only handles accumulation and submission; routing logic lives outside it + +--- + +## Query staging / readback (planned) + +Before a query is sent to Claude, give the user a chance to review and correct it via voice: + +- After accumulating utterances, read back the query text via TTS so the user can confirm it looks right +- Commands like "read back" or "what did I say" speak the current accumulated text +- Commands like "redo" or "start over" clear the accumulation without cancelling the session +- Only "go" / "send" actually dispatches to Claude + +This requires a richer interaction loop inside `Pending_Query` — the current send/cancel/append model is the foundation. + +--- + ## Open questions - Keep `acting-demo-chatterbox.mjs` as a reference for TTS capability demos, or delete? diff --git a/chatterbox-server.py b/chatterbox-server.py index 8236314..a0a9958 100755 --- a/chatterbox-server.py +++ b/chatterbox-server.py @@ -4,6 +4,8 @@ Chatterbox TTS server — keeps model loaded, reads JSON lines from stdin. Protocol: stdin: {"text": "...", "temperature": 0.8, "top_p": 0.95} + {"chime": "/path/to/file.wav"} + {"preload": "/path/to/file.wav"} stdout: "ok\n" after each utterance is generated (playback may still be in progress) stderr: status/timing messages @@ -169,18 +171,50 @@ def generate(text, opts): return samples +_chime_cache = {} + +def load_chime(path): + if path in _chime_cache: + return _chime_cache[path] + samples, sr = sf.read(path, dtype='float32', always_2d=True) + samples = samples.mean(axis=1) # stereo → mono + if sr != SAMPLE_RATE: + samples = _librosa.resample(samples, orig_sr=sr, target_sr=SAMPLE_RATE) + _chime_cache[path] = samples + return samples + + for line in sys.stdin: line = line.strip() if not line: continue try: - req = json.loads(line) - text = req.pop('text', '') - opts = req # remaining fields are generation options + req = json.loads(line) except json.JSONDecodeError: - text = line - opts = {} + req = {'text': line} + + if 'preload' in req: + try: + load_chime(req['preload']) + log(f'preloaded chime: {req["preload"]}') + except Exception as e: + log(f'preload error: {e}') + print('ok', flush=True) + continue + + if 'chime' in req: + try: + samples = load_chime(req['chime']) + playback_queue.put(samples) + except Exception as e: + log(f'chime error: {e}') + traceback.print_exc(file=sys.stderr) + print('ok', flush=True) + continue + + text = req.pop('text', '') + opts = req if not text: print('ok', flush=True) diff --git a/lib/chatterbox-tts.mjs b/lib/chatterbox-tts.mjs index e78c326..19ec48c 100644 --- a/lib/chatterbox-tts.mjs +++ b/lib/chatterbox-tts.mjs @@ -92,6 +92,26 @@ export class Chatterbox_Tts { }) } + play_chime(path) { + return new Promise((resolve, reject) => { + if (!this._proc) { + return reject(new Error('Chatterbox_Tts not initialized — call init() first')) + } + this._resolve = resolve + this._proc.stdin.write(JSON.stringify({ chime: path }) + '\n') + }) + } + + preload_chime(path) { + return new Promise((resolve, reject) => { + if (!this._proc) { + return reject(new Error('Chatterbox_Tts not initialized — call init() first')) + } + this._resolve = resolve + this._proc.stdin.write(JSON.stringify({ preload: path }) + '\n') + }) + } + stop() { this._rl?.close() this._proc?.kill() diff --git a/lib/pending-query.mjs b/lib/pending-query.mjs new file mode 100644 index 0000000..8634669 --- /dev/null +++ b/lib/pending-query.mjs @@ -0,0 +1,229 @@ +const LOUD_STREAK_NEEDED = 3 + +export class Pending_Query { + constructor({ + silence_timeout = 6000, + use_timer = false, + use_classifier = false, + classifier = null, + send_words = new Set(['go', 'done', 'send']), + cancel_words = new Set(['abort', 'cancel']), + use_wake_word = false, + wake_words = new Set(['computer']), + always_listen_words = new Set(['always listen']), + stop_listening_words = new Set(['stop listening']), + mode_query_words = new Set(['mode query']), + instant_dispatch = new Map([['yes please', 'Yes please.']]), + pause_words = new Set(['pause']), + resume_words = new Set(['resume', 'continue']), + on_submit = null, + on_cancel = null, + on_empty_submit = null, + on_activate = null, + on_mode_change = null, + on_mode_query = null, + } = {}) { + this.accumulated = '' + this.silence_timeout = silence_timeout + this.use_timer = use_timer + this.use_classifier = use_classifier + this.classifier = classifier + this.send_words = send_words + this.cancel_words = cancel_words + this.use_wake_word = use_wake_word + this.wake_words = wake_words + this.always_listen_words = always_listen_words + this.stop_listening_words = stop_listening_words + this.mode_query_words = mode_query_words + this.instant_dispatch = instant_dispatch + this.pause_words = pause_words + this.resume_words = resume_words + this._paused = false + this.on_submit = on_submit + this.on_cancel = on_cancel + this.on_empty_submit = on_empty_submit + this.on_activate = on_activate + this.on_mode_change = on_mode_change + this.on_mode_query = on_mode_query + this._active = !use_wake_word + this._silence_timer = null + this._loud_streak = 0 + } + + get is_empty() { + return !this.accumulated.trim() + } + + _has_real_words(text) { + return /[a-z]{2,}/i.test(text) + } + + append(text) { + this.accumulated = this.accumulated ? `${this.accumulated}\n${text}` : text + if (this.use_timer) { + this._reset_silence_timer() + } + } + + async check_classifier() { + if (!this.use_classifier || !this.classifier || this.is_empty) { + return false + } + const last_line = this.accumulated.split('\n').at(-1) + return await this.classifier(last_line) + } + + on_audio(chunk) { + if (!this.use_timer || this.is_empty) { + return + } + const samples = new Int16Array(chunk.buffer, chunk.byteOffset, chunk.byteLength >> 1) + let sum = 0 + for (let i = 0; i < samples.length; i++) { + sum += samples[i] * samples[i] + } + const rms = Math.sqrt(sum / samples.length) / 32768 + if (rms > 0.02) { + this._loud_streak++ + if (this._loud_streak >= LOUD_STREAK_NEEDED) { + this._reset_silence_timer() + } + } else { + this._loud_streak = 0 + } + } + + async process_utterance(text) { + const norm = text.trim().toLowerCase().replace(/[^a-z ]/g, '').replace(/\s+/g, ' ').trim() + + process.stderr.write(`[pending-query] utterance norm: ${JSON.stringify(norm)}\n`) + + if (this.mode_query_words.has(norm)) { + process.stderr.write(`[pending-query] mode query — wake_word=${this.use_wake_word}, active=${this._active}\n`) + this.on_mode_query?.({ use_wake_word: this.use_wake_word, active: this._active }) + return + } + + if (this.always_listen_words.has(norm)) { + this.use_wake_word = false + this.use_timer = false + this._active = true + clearTimeout(this._silence_timer) + this._silence_timer = null + process.stderr.write('[pending-query] mode: always listening\n') + this.on_mode_change?.({ use_wake_word: false }) + return + } + + if (this.stop_listening_words.has(norm)) { + this.use_wake_word = true + this.use_timer = true + this._active = false + process.stderr.write('[pending-query] mode: wake word\n') + this.on_mode_change?.({ use_wake_word: true }) + return + } + + const norm_compact_early = norm.replace(/ /g, '') + + if (this.resume_words.has(norm_compact_early)) { + if (this._paused) { + this._paused = false + process.stderr.write('[pending-query] resumed\n') + if (this._active && this.use_timer && !this.is_empty) { + this._reset_silence_timer() + } + } + return + } + + if (this.pause_words.has(norm_compact_early)) { + this._paused = true + clearTimeout(this._silence_timer) + this._silence_timer = null + process.stderr.write('[pending-query] paused\n') + return + } + + if (this._paused) { + process.stderr.write(`[pending-query] paused, ignored: ${JSON.stringify(text)}\n`) + return + } + + if (this.instant_dispatch.has(norm)) { + const query = this.instant_dispatch.get(norm) + process.stderr.write(`[pending-query] instant dispatch: ${JSON.stringify(query)}\n`) + this.on_submit?.(query) + return + } + + const norm_compact = norm_compact_early + + if (!this._active) { + if (this.wake_words.has(norm_compact)) { + this._active = true + process.stderr.write('[pending-query] wake word detected — active\n') + this.on_activate?.() + } else { + process.stderr.write(`[pending-query] idle, ignored: ${JSON.stringify(text)}\n`) + } + return + } + + if (this.cancel_words.has(norm_compact)) { + this.cancel() + return + } + if (this.send_words.has(norm_compact)) { + this.submit() + return + } + if (!this._has_real_words(text)) { + process.stderr.write(`[pending-query] noise fragment dropped: ${JSON.stringify(text)}\n`) + return + } + this.append(text) + process.stderr.write(`[pending-query] fragment: ${JSON.stringify(this.accumulated)}\n`) + if (await this.check_classifier()) { + this.submit() + } + } + + submit() { + clearTimeout(this._silence_timer) + this._silence_timer = null + const text = this.accumulated.trim() + this.accumulated = '' + this._loud_streak = 0 + if (this.use_wake_word) { + this._active = false + } + if (text) { + this.on_submit?.(text) + } else { + this.on_empty_submit?.() + } + } + + cancel() { + clearTimeout(this._silence_timer) + this._silence_timer = null + this.accumulated = '' + this._loud_streak = 0 + if (this.use_wake_word) { + this._active = false + } + this.on_cancel?.() + } + + _reset_silence_timer() { + clearTimeout(this._silence_timer) + this._silence_timer = setTimeout(() => { + if (this.is_empty) { + return + } + process.stderr.write('[pending-query] silence timeout — submitting\n') + this.submit() + }, this.silence_timeout) + } +} diff --git a/query-demo.mjs b/query-demo.mjs index 904caa2..6017fc3 100644 --- a/query-demo.mjs +++ b/query-demo.mjs @@ -3,21 +3,22 @@ * completeness with a local LLM classifier, then dispatches to Claude Code. * * Usage: - * node query-demo.mjs [--audio-prompt voice.wav] [--whisper model] [--window-id 0x...] [--claude-remote /path/to/claude-remote.mjs] + * node query-demo.mjs [--audio-prompt voice.wav] [--whisper model] [--tmux-container-id 0x...] [--claude-remote /path/to/claude-remote.mjs] * - * If --window-id and --claude-remote are supplied, completed queries are + * If tmux-container-id and --claude-remote are supplied, completed queries are * dispatched to Claude Code. Otherwise they are only logged to stderr. * - * A query is considered complete when: - * - The classifier says so - * - The last word is a send-word (go/done/send) - * - No new fragment arrives within SILENCE_TIMEOUT ms + * Wake word mode (default on): say "computer" to activate, then speak your query. + * End with a send-word (go/done/send) to dispatch, or abort/cancel to discard. + * Pass --wake-word off to disable and always accumulate. */ + + import { execFileSync } from 'node:child_process' import { Stt } from './lib/stt.mjs' import { Tts_Client } from './lib/tts-client.mjs' -import { is_query_complete } from './lib/local-query-complete.mjs' +import { Pending_Query } from './lib/pending-query.mjs' function get_arg(name) { const i = process.argv.indexOf(name) @@ -26,23 +27,31 @@ function get_arg(name) { const audio_prompt = get_arg('--audio-prompt') ?? '/home/devilholk/Documents/rommie-sample.wav' const whisper_name = get_arg('--whisper') ?? 'base.en' -const window_id = get_arg('--window-id') +const container_id = get_arg('--tmux-container-id') const claude_remote = get_arg('--claude-remote') const SILENCE_TIMEOUT = parseInt(get_arg('--silence-timeout') ?? '6000') + const tts = new Tts_Client() const stt = new Stt({ whisper_name }) +const USE_WAKE_WORD = get_arg('--wake-word') !== 'off' + stt.init() process.stderr.write(`[query-demo] whisper model: ${whisper_name}\n`) process.stderr.write(`[query-demo] voice: ${audio_prompt}\n`) -if (window_id && claude_remote) { - process.stderr.write(`[query-demo] dispatch: window ${window_id} via ${claude_remote}\n`) +if (container_id && claude_remote) { + process.stderr.write(`[query-demo] dispatch: container ${container_id} via ${claude_remote}\n`) } else { - process.stderr.write(`[query-demo] no --window-id/--claude-remote — logging only\n`) + process.stderr.write(`[query-demo] no --tmux-container-id/--claude-remote — logging only\n`) +} +process.stderr.write(`[query-demo] wake word: ${USE_WAKE_WORD ? 'on (say "computer" to activate)' : 'off'}\n`) +if (USE_WAKE_WORD) { + await tts.speak('Ready. Say computer to begin a query, or always listen for hands-free mode. Say help for usage.', { audio_prompt }) +} else { + await tts.speak('Always listening. Say go to send, cancel to discard, or always listen to switch to wake-word mode. Say help for usage.', { audio_prompt }) } -await tts.speak('Ready for input.', { audio_prompt }) function make_prompt(query) { return [ @@ -60,79 +69,62 @@ function make_prompt(query) { } function dispatch(query) { - execFileSync('node', [claude_remote, '--window-id', window_id], { + execFileSync('node', [claude_remote, '--tmux-container-id', container_id], { input: make_prompt(query), encoding: 'utf8', stdio: ['pipe', 'inherit', 'inherit'], }) } -async function submit(query) { - query = query.trim() - if (!query) { - return - } - process.stderr.write(`[query-demo] query: ${JSON.stringify(query)}\n`) - if (window_id && claude_remote) { - dispatch(query) - } - await tts.speak('Efforting.', { audio_prompt }) -} -const SEND_WORDS = new Set(['go', 'done', 'send']) - -let accumulated = '' -let silence_timer = null -let loud_chunk_streak = 0 -const LOUD_STREAK_NEEDED = 3 // ~150ms of sustained loud audio to count as speech - -function reset_silence_timer() { - clearTimeout(silence_timer) - silence_timer = setTimeout(async () => { - if (!accumulated) { - return +const pending = new Pending_Query({ + silence_timeout: SILENCE_TIMEOUT, + use_wake_word: USE_WAKE_WORD, + use_timer: USE_WAKE_WORD, + on_activate: async () => { + process.stderr.write('[query-demo] activated\n') + await tts.chime('ready').catch(() => {}) + }, + on_mode_query: async ({ use_wake_word, active }) => { + const msg = use_wake_word + ? (active ? 'Listening for your query.' : 'Wake word mode. Say computer to begin.') + : 'Always listening.' + await tts.speak(msg, { audio_prompt }) + }, + on_mode_change: async ({ use_wake_word }) => { + const msg = use_wake_word ? 'Wake word mode. Say computer to begin.' : 'Always listening.' + process.stderr.write(`[query-demo] mode: ${use_wake_word ? 'wake word' : 'always listening'}\n`) + await tts.speak(msg, { audio_prompt }) + }, + on_submit: async (text) => { + process.stderr.write(`[query-demo] query: ${JSON.stringify(text)}\n`) + if (container_id && claude_remote) { + await tts.chime('dispatch').catch(() => {}) + const dispatch_time = Date.now() + dispatch(text) + setTimeout(async () => { + try { + const res = await fetch(`${tts._url}/activity`) + const { last_speak_at } = await res.json() + if (last_speak_at < dispatch_time) { + await tts.chime('working').catch(() => {}) + } + } catch { + await tts.chime('working').catch(() => {}) + } + }, 4000) } - process.stderr.write('[query-demo] silence timeout — submitting\n') - const query = accumulated - accumulated = '' - await submit(query) - }, SILENCE_TIMEOUT) -} + }, + on_cancel: async () => { + process.stderr.write('[query-demo] query cancelled\n') + await tts.chime('cancel').catch(() => tts.speak('Cancelled.', { audio_prompt })) + }, + on_empty_submit: async () => { + process.stderr.write('[query-demo] submit with nothing accumulated\n') + await tts.speak('Nothing to send.', { audio_prompt }) + }, +}) stt.listen(async (text) => { - accumulated = accumulated ? `${accumulated}\n${text}` : text - process.stderr.write(`[query-demo] fragment: ${JSON.stringify(accumulated)}\n`) - - reset_silence_timer() - - const last_line = accumulated.split('\n').at(-1) - const last_norm = last_line.toLowerCase().replace(/[^a-z]/g, '') - const forced = SEND_WORDS.has(last_norm) - - if (!forced) { - const complete = await is_query_complete(last_line) - if (!complete) { - return - } - } - - clearTimeout(silence_timer) - - const lines = accumulated.split('\n') - const query = (forced ? lines.slice(0, -1) : lines).join('\n') - accumulated = '' - - await submit(query) -}, { on_audio: (chunk) => { - if (!accumulated) return - const samples = new Int16Array(chunk.buffer, chunk.byteOffset, chunk.byteLength >> 1) - let sum = 0 - for (let i = 0; i < samples.length; i++) sum += samples[i] * samples[i] - const rms = Math.sqrt(sum / samples.length) / 32768 - if (rms > 0.02) { - loud_chunk_streak++ - if (loud_chunk_streak >= LOUD_STREAK_NEEDED) reset_silence_timer() - } else { - loud_chunk_streak = 0 - } -} }) + await pending.process_utterance(text) +}, { on_audio: (chunk) => pending.on_audio(chunk) }) diff --git a/test-chime.mjs b/test-chime.mjs new file mode 100755 index 0000000..d5a2a79 --- /dev/null +++ b/test-chime.mjs @@ -0,0 +1,13 @@ +#!/usr/bin/env node +import { Tts_Client } from './lib/tts-client.mjs' + +const name = process.argv[2] ?? 'ready' +const tts = new Tts_Client() + +try { + await tts.chime(name) + process.stderr.write(`chime '${name}' played\n`) +} catch (err) { + process.stderr.write(`chime '${name}' failed: ${err.message}\n`) + process.exit(1) +} diff --git a/tts-server.mjs b/tts-server.mjs index dace80b..f85a496 100644 --- a/tts-server.mjs +++ b/tts-server.mjs @@ -20,7 +20,6 @@ import * as http from 'node:http' import * as fs from 'node:fs' import * as path from 'node:path' -import { spawn } from 'node:child_process' import yaml from 'js-yaml' import { Chatterbox_Tts } from './lib/chatterbox-tts.mjs' @@ -38,6 +37,7 @@ function reload_config() { let { voices, chimes } = reload_config() let current_voice = null // name of active voice, or null +let last_speak_at = 0 // --- TTS setup --- const tts = new Chatterbox_Tts() @@ -45,6 +45,16 @@ process.stderr.write('[tts-server] starting chatterbox...\n') await tts.init() process.stderr.write('[tts-server] chatterbox ready\n') +// Preload all configured chimes so first play has no decode latency +{ + const { chimes: configured_chimes } = reload_config() + for (const [name, file] of Object.entries(configured_chimes)) { + tts.preload_chime(file).catch(err => + process.stderr.write(`[tts-server] preload failed for chime '${name}': ${err.message}\n`) + ) + } +} + // Serialize all speak requests through a promise chain let queue = Promise.resolve() @@ -72,19 +82,16 @@ function send(res, status, body) { res.end(payload) } -function play_file(file_path) { - return new Promise((resolve, reject) => { - const player = spawn('pacat', ['--playback', file_path]) - player.on('close', code => code === 0 ? resolve() : reject(new Error(`pacat exited ${code}`))) - player.on('error', reject) - }) -} const server = http.createServer(async (req, res) => { if (req.method === 'GET' && req.url === '/health') { return send(res, 200, { ok: true }) } + if (req.method === 'GET' && req.url === '/activity') { + return send(res, 200, { last_speak_at }) + } + if (req.method === 'GET' && req.url === '/voices') { ({ voices, chimes } = reload_config()) const list = Object.entries(voices).map(([name, v]) => ({ @@ -120,7 +127,7 @@ const server = http.createServer(async (req, res) => { const file = chimes[name] ?? null if (!file) return send(res, 404, { error: `chime not found: ${name}` }) try { - await enqueue(() => play_file(file)) + await enqueue(() => tts.play_chime(file)) return send(res, 200, { ok: true }) } catch (err) { return send(res, 500, { error: err.message }) @@ -139,6 +146,7 @@ const server = http.createServer(async (req, res) => { if (!text) { return send(res, 400, { error: 'text required' }) } + last_speak_at = Date.now() // Inject current voice as default audio_prompt if none provided if (!opts.audio_prompt && current_voice && voices[current_voice]) { diff --git a/voice-buddy.mjs b/voice-buddy.mjs index b85797e..f279775 100644 --- a/voice-buddy.mjs +++ b/voice-buddy.mjs @@ -1,3 +1,5 @@ +// DEPRECATED + /** * Voice buddy — reads voice queries from stdin (JSON lines) and dispatches * them to a Claude Code instance via claude-remote. diff --git a/voices.yaml b/voices.yaml index b784bfa..3e676e9 100644 --- a/voices.yaml +++ b/voices.yaml @@ -56,7 +56,10 @@ voices: # System event chimes — map event name to audio file path. # Falls back to chimes/.wav or .ogg if not listed here. chimes: -# ready: /home/devilholk/sounds/trek-chirp.wav + ready: /home/devilholk/Documents/ready.ogg + cancel: /home/devilholk/Documents/abort.ogg + working: /home/devilholk/Documents/working.ogg + dispatch: /home/devilholk/Documents/dispatch.ogg # dispatch: /home/devilholk/sounds/trek-send.wav # cancel: /home/devilholk/sounds/cancel.ogg # error: /home/devilholk/sounds/error.ogg