claude-voice-experiment/query-demo.mjs

/**
 * Voice query demo — accumulates STT utterances into a query, checks
 * completeness with a local LLM classifier, then dispatches to Claude Code.
 *
 * Usage:
 *   node query-demo.mjs [--audio-prompt voice.wav] [--whisper model] [--silence-timeout ms] [--window-id 0x...] [--claude-remote /path/to/claude-remote.mjs]
 *
 * If --window-id and --claude-remote are supplied, completed queries are
 * dispatched to Claude Code. Otherwise they are only logged to stderr.
 *
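 * A query is considered complete when any of the following holds:
 *   - The classifier says the latest fragment completes the query
 *   - The latest fragment consists solely of a send-word (go/done/send);
 *     the send-word itself is dropped from the submitted query
 *   - No new fragment or speech-level audio arrives within SILENCE_TIMEOUT ms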
 */

import { execFileSync }    from 'node:child_process'
import { Stt }              from './lib/stt.mjs'
import { Tts_Client }       from './lib/tts-client.mjs'
import { is_query_complete } from './lib/local-query-complete.mjs'

/**
 * Look up the value that follows a command-line flag in process.argv.
 *
 * @param {string} name - flag to search for, e.g. '--whisper'
 * @returns {string|null} the argument immediately after the first occurrence
 *   of the flag, or null when the flag is absent or is the last argument
 */
function get_arg(name) {
	const i = process.argv.indexOf(name)
	if (i === -1) {
		return null
	}
	// A flag given as the very last argument has no value; report that as
	// null too, so callers see a single "missing" value.
	return process.argv[i + 1] ?? null
}

// Voice sample passed as `audio_prompt` to every tts.speak() call.
const audio_prompt   = get_arg('--audio-prompt') ?? '/home/devilholk/Documents/rommie-sample.wav'
// Whisper model name handed to the Stt constructor.
const whisper_name   = get_arg('--whisper')      ?? 'base.en'
// Both must be supplied for queries to be dispatched; otherwise they are only logged.
const window_id      = get_arg('--window-id')
const claude_remote  = get_arg('--claude-remote')

// Milliseconds of quiet after which the accumulated query is submitted as-is.
const DEFAULT_SILENCE_TIMEOUT = 6000
// Largest delay setTimeout() honours; Node replaces anything above it (and
// NaN or negative values) with 1 ms, which would submit every fragment
// almost immediately.
const MAX_TIMER_DELAY = 2 ** 31 - 1
const silence_timeout_arg = get_arg('--silence-timeout')
// Explicit radix: the value is documented as decimal milliseconds.
const silence_timeout_ms  = Number.parseInt(silence_timeout_arg ?? '', 10)
// NaN fails both comparisons, so unparsable input is rejected here as well.
const silence_timeout_ok  = silence_timeout_ms >= 0 && silence_timeout_ms <= MAX_TIMER_DELAY
if (silence_timeout_arg != null && !silence_timeout_ok) {
	process.stderr.write(`[query-demo] ignoring invalid --silence-timeout ${JSON.stringify(silence_timeout_arg)}, using ${DEFAULT_SILENCE_TIMEOUT}\n`)
}
const SILENCE_TIMEOUT = silence_timeout_ok ? silence_timeout_ms : DEFAULT_SILENCE_TIMEOUT

// Speech output client and speech-to-text front end used for the whole session.
const tts = new Tts_Client()
const stt = new Stt({ whisper_name })

stt.init()

// Startup banner on stderr: active model, voice sample, and whether completed
// queries are dispatched or only logged.
const dispatch_enabled = Boolean(window_id && claude_remote)
const startup_banner = [
	`[query-demo] whisper model: ${whisper_name}\n`,
	`[query-demo] voice: ${audio_prompt}\n`,
	dispatch_enabled
		? `[query-demo] dispatch: window ${window_id} via ${claude_remote}\n`
		: `[query-demo] no --window-id/--claude-remote — logging only\n`,
]
for (const banner_line of startup_banner) {
	process.stderr.write(banner_line)
}

// Audible cue that the demo has started.
await tts.speak('Ready for input.', { audio_prompt })

/**
 * Build the text prompt sent to Claude Code for one voice query.
 *
 * The prompt is a fixed block of voice-interface instructions followed by the
 * query framed between '--- Query ---' and '--- End of query ---' markers,
 * all joined with newlines.
 *
 * @param {string} query - the transcribed query text
 * @returns {string} the complete prompt
 */
function make_prompt(query) {
	const instructions = [
		'[voice-buddy] You have received a voice query.',
		'',
		'This is a voice interface. When you are done, use the `speak` shell command to inform the user of the outcome — keep it brief and spoken-word natural.',
		'If the task is a question, speak the answer directly.',
		'If the task is an action, carry it out and then speak a short confirmation.',
		'If the task will take more than a few seconds, use `speak` to say a brief acknowledgement BEFORE starting work, so the user knows you received it.',
		'',
	]
	const framed_query = ['--- Query ---', query, '--- End of query ---']
	return instructions.concat(framed_query).join('\n')
}

/**
 * Hand a query to Claude Code by running the claude-remote script.
 *
 * Runs `node <claude_remote> --window-id <window_id>` synchronously, feeding
 * the generated prompt on stdin; the child's stdout and stderr are inherited.
 *
 * @param {string} query - the query text to wrap with make_prompt()
 */
function dispatch(query) {
	const remote_argv = [claude_remote, '--window-id', window_id]
	const spawn_options = {
		input:    make_prompt(query),
		encoding: 'utf8',
		// stdin must be a pipe for `input` to be delivered to the child.
		stdio:    ['pipe', 'inherit', 'inherit'],
	}
	execFileSync('node', remote_argv, spawn_options)
}

/**
 * Submit a finished query: log it, dispatch it when a target is configured,
 * and give spoken feedback.
 *
 * Blank queries (empty after trimming) are ignored. When both --window-id and
 * --claude-remote were supplied the query is passed to dispatch(); otherwise
 * it is only logged. A dispatch failure is logged and announced instead of
 * being thrown, so one failed hand-off does not reject the promise of the
 * timer or STT callback that called this function.
 *
 * @param {string} query - accumulated query text
 * @returns {Promise<void>} resolves once the spoken feedback has finished
 */
async function submit(query) {
	const trimmed = query.trim()
	if (!trimmed) {
		return
	}
	process.stderr.write(`[query-demo] query: ${JSON.stringify(trimmed)}\n`)
	if (window_id && claude_remote) {
		try {
			dispatch(trimmed)
		} catch (err) {
			// dispatch() uses execFileSync, which throws when the child cannot
			// be started or exits with a non-zero status.
			process.stderr.write(`[query-demo] dispatch failed: ${err?.message ?? err}\n`)
			await tts.speak('Dispatch failed.', { audio_prompt })
			return
		}
	}
	await tts.speak('Efforting.', { audio_prompt })
}

// Words that force submission when a fragment consists of nothing else
// (the fragment is lowercased and stripped of non-letters before the lookup).
const SEND_WORDS = new Set(['go', 'done', 'send'])

// Fragments of the query in progress, joined with '\n'; '' means nothing is pending.
let accumulated    = ''
// Handle of the most recently armed silence timeout; null until the first one is set.
let silence_timer  = null

/**
 * (Re)arm the silence timeout.
 *
 * Cancels any pending timer and starts a new one of SILENCE_TIMEOUT ms. When
 * it fires with a query still accumulated, that query is cleared from the
 * shared state and submitted.
 */
function reset_silence_timer() {
	const on_silence = async () => {
		if (accumulated) {
			process.stderr.write('[query-demo] silence timeout — submitting\n')
			// Take ownership of the text before awaiting, so fragments that
			// arrive during submit() start a fresh query.
			const pending = accumulated
			accumulated = ''
			await submit(pending)
		}
	}

	clearTimeout(silence_timer)
	silence_timer = setTimeout(on_silence, SILENCE_TIMEOUT)
}

/**
 * RMS level of a chunk of 16-bit audio, scaled so that full scale is 1.
 *
 * @param {ArrayBufferView} chunk - raw audio bytes; read as signed 16-bit
 *   samples in platform byte order (assumed to match what lib/stt.mjs
 *   delivers — TODO confirm)
 * @returns {number} RMS in [0, 1]; 0 when the chunk holds no complete sample
 */
function chunk_rms(chunk) {
	const sample_count = chunk.byteLength >> 1
	if (sample_count === 0) {
		return 0
	}
	// An Int16Array view must start on an even byte offset, otherwise the
	// constructor throws RangeError. Copy the bytes out when the chunk is an
	// odd-offset slice of a larger buffer.
	const samples = chunk.byteOffset % 2 === 0
		? new Int16Array(chunk.buffer, chunk.byteOffset, sample_count)
		: new Int16Array(chunk.buffer.slice(chunk.byteOffset, chunk.byteOffset + sample_count * 2))
	let sum = 0
	for (let i = 0; i < samples.length; i++) {
		sum += samples[i] * samples[i]
	}
	return Math.sqrt(sum / samples.length) / 32768
}

// Main loop: every transcribed fragment is appended to the pending query,
// which is submitted when a send-word or the classifier says it is complete.
// Raw audio with speech-level energy keeps the silence timeout from firing
// while the user is still talking.
stt.listen(async (text) => {
	accumulated = accumulated ? `${accumulated}\n${text}` : text
	process.stderr.write(`[query-demo] fragment: ${JSON.stringify(accumulated)}\n`)

	reset_silence_timer()

	// Work on a snapshot: `accumulated` can change while the classifier is
	// awaited (a newer fragment arrives, or the silence timer submits).
	const snapshot  = accumulated
	const lines     = snapshot.split('\n')
	const last_line = lines.at(-1)
	const last_norm = last_line.toLowerCase().replace(/[^a-z]/g, '')
	const forced    = SEND_WORDS.has(last_norm)

	if (!forced) {
		let complete = false
		try {
			complete = await is_query_complete(last_line)
		} catch (err) {
			// Classifier unavailable: leave the query pending and let the
			// silence timeout submit it.
			process.stderr.write(`[query-demo] completeness check failed: ${err?.message ?? err}\n`)
			return
		}
		if (!complete) {
			return
		}
		if (accumulated !== snapshot) {
			// Stale verdict: the pending query changed during the check, so
			// the callback for the newer fragment (or the silence timer)
			// decides instead of submitting unchecked text.
			return
		}
	}

	clearTimeout(silence_timer)

	// A send-word is a command, not part of the query — drop it.
	const query = (forced ? lines.slice(0, -1) : lines).join('\n')
	accumulated = ''

	await submit(query)
}, {
	on_audio: (chunk) => {
		if (!accumulated) {
			return
		}
		// Only reset the silence timer if the audio has significant energy (not silence)
		if (chunk_rms(chunk) > 0.02) {
			reset_silence_timer()
		}
	},
})