- README explaining experimental/transparency purpose - faster-whisper STT backend (fw-stt.mjs, faster-whisper-server.py, install-faster-whisper.sh) - Bug fixes: Buffer alignment in on_audio, --debug-waveform URL parsing, silent fetch errors, instant dispatch timer leak - Global uncaughtException/unhandledRejection handlers in query-demo.mjs - Design docs: CHANGELOG, COMMAND-DISPATCH, INTERFACE-THEORY, VOICE-POLICY - Systemd service unit templates Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
153 lines
5.7 KiB
JavaScript
153 lines
5.7 KiB
JavaScript
/**
|
||
* Voice query demo — accumulates STT utterances into a query, checks
|
||
* completeness with a local LLM classifier, then dispatches to Claude Code.
|
||
*
|
||
* Usage:
|
||
* node query-demo.mjs [--audio-prompt voice.wav] [--whisper model] [--tmux-container-id 0x...] [--claude-remote /path/to/claude-remote.mjs]
|
||
*
|
||
* If --tmux-container-id and --claude-remote are supplied, completed queries are
|
||
* dispatched to Claude Code. Otherwise they are only logged to stderr.
|
||
*
|
||
* Wake word mode (default on): say "computer" to activate, then speak your query.
|
||
* End with a send-word (go/done/send) to dispatch, or abort/cancel to discard.
|
||
* Pass --wake-word off to disable and always accumulate.
|
||
*/
|
||
|
||
|
||
|
||
import { execFileSync } from 'node:child_process'
|
||
|
||
// Last-resort handlers: report otherwise-unhandled failures on stderr.
// Each handler only logs; neither one exits the process.
process.on('uncaughtException', (err) => {
  // Any value can be thrown (a string, null, ...), so do not assume an Error:
  // prefer the stack, then the message, then the raw value itself.
  process.stderr.write(`[uncaughtException] ${err?.stack ?? err?.message ?? err}\n`)
})
process.on('unhandledRejection', (reason) => {
  // Rejection reasons are likewise arbitrary values.
  process.stderr.write(`[unhandledRejection] ${reason?.stack ?? reason}\n`)
})
|
||
import { Tts_Client } from './lib/tts-client.mjs'
|
||
import { Pending_Query } from './lib/pending-query.mjs'
|
||
|
||
/**
 * Look up the value that follows a command-line flag in `process.argv`.
 *
 * The token immediately after the first occurrence of `name` is returned
 * verbatim; it is not checked for looking like another flag, so callers that
 * accept a bare flag must do that check themselves.
 *
 * @param {string} name - Flag to search for, including its leading dashes.
 * @returns {string|null} The following token, or `null` when the flag is
 *   absent or is the last token on the command line.
 */
function get_arg(name) {
  const flag_index = process.argv.indexOf(name)
  if (flag_index === -1) return null
  // A trailing flag has no following token; report that as null as well,
  // rather than leaking `undefined` from the out-of-range index.
  return process.argv[flag_index + 1] ?? null
}
|
||
|
||
// Command-line configuration. Each option falls back to a default (or null)
// when its flag is not given.
const audio_prompt = get_arg('--audio-prompt') ?? '/home/devilholk/Documents/rommie-sample.wav'
const whisper_name = get_arg('--whisper') ?? 'base.en'
const container_id = get_arg('--tmux-container-id')
const claude_remote = get_arg('--claude-remote')

// --debug-waveform takes an optional URL. When the flag is given bare (it is
// the last token, or the next token is another flag) the default URL is used;
// when the flag is absent the feature is off (null).
const _dw = get_arg('--debug-waveform')
const debug_waveform = (_dw && !_dw.startsWith('--'))
  ? _dw
  : (process.argv.includes('--debug-waveform') ? 'http://localhost:3888/emit/audio-debug' : null)

const stt_backend = get_arg('--stt') ?? 'sherpa-onnx'

// Parse --silence-timeout as a base-10 integer. A value that does not parse
// would otherwise become NaN and be handed on as the timeout, so fall back to
// the default and say so on stderr.
const DEFAULT_SILENCE_TIMEOUT = 6000
const _silence_parsed = Number.parseInt(
  get_arg('--silence-timeout') ?? String(DEFAULT_SILENCE_TIMEOUT),
  10,
)
if (Number.isNaN(_silence_parsed)) {
  process.stderr.write(`[query-demo] invalid --silence-timeout value — using default ${DEFAULT_SILENCE_TIMEOUT}\n`)
}
const SILENCE_TIMEOUT = Number.isNaN(_silence_parsed) ? DEFAULT_SILENCE_TIMEOUT : _silence_parsed
|
||
|
||
// Choose the speech-to-text implementation: 'faster-whisper' selects
// fw-stt.mjs; every other --stt value loads the default stt.mjs module.
const stt_module_path = stt_backend === 'faster-whisper'
  ? './lib/fw-stt.mjs'
  : './lib/stt.mjs'
const { Stt } = await import(stt_module_path)

// Wake-word mode is on unless the user passes exactly `--wake-word off`.
const USE_WAKE_WORD = get_arg('--wake-word') !== 'off'

const tts = new Tts_Client()
const stt = new Stt({ whisper_name, debug_url: debug_waveform })

// NOTE(review): init() is called without `await`; if it is asynchronous in
// either backend, confirm that starting up without waiting is intended.
stt.init()
|
||
// Startup banner: one stderr write per setting, always in this order.
const banner_lines = [
  `[query-demo] whisper model: ${whisper_name}\n`,
  `[query-demo] voice: ${audio_prompt}\n`,
  // Dispatch needs both the container id and the claude-remote script path.
  (container_id && claude_remote)
    ? `[query-demo] dispatch: container ${container_id} via ${claude_remote}\n`
    : `[query-demo] no --tmux-container-id/--claude-remote — logging only\n`,
  `[query-demo] wake word: ${USE_WAKE_WORD ? 'on (say "computer" to activate)' : 'off'}\n`,
  `[query-demo] stt backend: ${stt_backend}\n`,
]
if (debug_waveform) {
  banner_lines.push(`[query-demo] debug waveform: ${debug_waveform}\n`)
}
for (const line of banner_lines) {
  process.stderr.write(line)
}

// Spoken greeting, worded for whichever listening mode is active at startup.
const greeting = USE_WAKE_WORD
  ? 'Ready. Say computer to begin a query, or always listen for hands-free mode. Say help for usage.'
  : 'Always listening. Say go to send, cancel to discard, or always listen to switch to wake-word mode. Say help for usage.'
await tts.speak(greeting, { audio_prompt })
|
||
|
||
/**
 * Wrap a transcribed voice query in the instruction text sent along with it.
 *
 * @param {string} query - The accumulated query text.
 * @returns {string} The instructions, then the query between
 *   `--- Query ---` and `--- End of query ---` markers, newline-separated.
 */
function make_prompt(query) {
  const instructions = [
    '[voice-buddy] You have received a voice query.',
    '',
    'This is a voice interface. When you are done, use the `speak` shell command to inform the user of the outcome — keep it brief and spoken-word natural.',
    'If the task is a question, speak the answer directly.',
    'If the task is an action, carry it out and then speak a short confirmation.',
    'If the task will take more than a few seconds, use `speak` to say a brief acknowledgement BEFORE starting work, so the user knows you received it.',
    '',
  ]
  const framed_query = ['--- Query ---', query, '--- End of query ---']
  // Array join (not a template literal) so a null/undefined query renders as
  // an empty line rather than the text "null"/"undefined".
  return [...instructions, ...framed_query].join('\n')
}
|
||
|
||
/**
 * Send a query to Claude Code by running the claude-remote script under
 * `node`, with the full prompt supplied on the child's stdin.
 *
 * The call is synchronous: it returns only after the child process exits,
 * and execFileSync throws if the child fails. The child's stdout and stderr
 * are inherited from this process.
 *
 * @param {string} query - The query text to wrap and send.
 */
function dispatch(query) {
  const remote_args = [claude_remote, '--tmux-container-id', container_id]
  const spawn_options = {
    input: make_prompt(query),
    encoding: 'utf8',
    // stdin is a pipe so `input` can be written to it.
    stdio: ['pipe', 'inherit', 'inherit'],
  }
  execFileSync('node', remote_args, spawn_options)
}
|
||
|
||
|
||
// Query accumulator. Each callback below gives audible and/or stderr feedback
// for one event reported by Pending_Query.
const pending = new Pending_Query({
  silence_timeout: SILENCE_TIMEOUT,
  use_wake_word: USE_WAKE_WORD,
  // The timer option follows the wake-word setting.
  use_timer: USE_WAKE_WORD,

  // Log the activation and play the "ready" chime (chime failures are ignored).
  async on_activate() {
    process.stderr.write('[query-demo] activated\n')
    await tts.chime('ready').catch(() => {})
  },

  // Speak the current listening mode.
  async on_mode_query({ use_wake_word, active }) {
    let phrase = 'Always listening.'
    if (use_wake_word) {
      phrase = active ? 'Listening for your query.' : 'Wake word mode. Say computer to begin.'
    }
    await tts.speak(phrase, { audio_prompt })
  },

  // Log and announce the mode that was just switched to.
  async on_mode_change({ use_wake_word }) {
    process.stderr.write(`[query-demo] mode: ${use_wake_word ? 'wake word' : 'always listening'}\n`)
    const phrase = use_wake_word ? 'Wake word mode. Say computer to begin.' : 'Always listening.'
    await tts.speak(phrase, { audio_prompt })
  },

  // Log the finished query and, when dispatch is configured, send it on.
  async on_submit(text) {
    process.stderr.write(`[query-demo] query: ${JSON.stringify(text)}\n`)
    // Without both dispatch settings the query is only logged.
    if (!(container_id && claude_remote)) return

    // Best-effort "still working" chime; a failed chime is ignored.
    const chime_working = () => tts.chime('working').catch(() => {})

    await tts.chime('dispatch').catch(() => {})
    const dispatch_time = Date.now()
    dispatch(text)

    // Four seconds after dispatch() returns, ask the TTS service when it last
    // spoke. If nothing has been spoken since the dispatch, or the check
    // itself fails, play the "working" chime.
    setTimeout(async () => {
      try {
        const response = await fetch(`${tts._url}/activity`)
        if (!response.ok) {
          await chime_working()
          return
        }
        const { last_speak_at } = await response.json()
        // Written as a negated >= so a missing or non-numeric last_speak_at
        // also counts as "has not spoken yet".
        if (!(last_speak_at >= dispatch_time)) {
          await chime_working()
        }
      } catch {
        await chime_working()
      }
    }, 4000)
  },

  // Log the cancellation; if the chime fails, fall back to saying it aloud.
  async on_cancel() {
    process.stderr.write('[query-demo] query cancelled\n')
    await tts.chime('cancel').catch(() => tts.speak('Cancelled.', { audio_prompt }))
  },

  // A submit arrived with nothing accumulated.
  async on_empty_submit() {
    process.stderr.write('[query-demo] submit with nothing accumulated\n')
    await tts.speak('Nothing to send.', { audio_prompt })
  },
})
|
||
|
||
// Start listening: each recognised utterance goes to the pending query, and
// each audio chunk is passed to the pending query's on_audio hook.
const handle_utterance = async (text) => {
  await pending.process_utterance(text)
}
const listen_hooks = {
  on_audio: (chunk) => pending.on_audio(chunk),
}
stt.listen(handle_utterance, listen_hooks)
|