/**
 * Voice demo: microphone → Whisper STT → (optional LLM cleanup) → Kokoro TTS
 *
 * Environment variables:
 *   WHISPER_MODEL   default: base.en
 *                   options: tiny.en base.en small.en medium.en large-v3 large-v3-turbo
 *   VOICE           default: af_heart
 *                   options: af_heart af_bella af_nicole af_sarah af_sky am_adam am_michael
 *   STT_PROVIDER    default: cuda
 *   USE_LLM         set to 0 to disable (default: 1)
 *   OLLAMA_MODEL    default: phi3:mini
 *                   (not read in this file; presumably consumed by ./lib/llm.mjs — confirm)
 *
 * Status and diagnostic messages go to stderr; transcripts ([raw] / [llm]) go to stdout.
 */

import { Stt } from './lib/stt.mjs'
import { Tts, VOICES } from './lib/tts.mjs'
import { llm_available, list_models, cleanup } from './lib/llm.mjs'

// --- Configuration ---------------------------------------------------------
// `||` (not `??`) is deliberate: an empty environment variable falls back to
// the default as well.
const WHISPER_MODEL = process.env.WHISPER_MODEL || 'base.en'
const VOICE = process.env.VOICE || 'af_heart'
const STT_PROVIDER = process.env.STT_PROVIDER || 'cuda'
// Only the exact string '0' disables the LLM step; anything else (or unset) enables it.
const USE_LLM = process.env.USE_LLM !== '0'

// Validate the voice against VOICES' own keys. The `in` operator would also
// accept inherited names such as "constructor" or "toString".
if (!Object.hasOwn(VOICES, VOICE)) {
  process.stderr.write(`[voice] unknown voice "${VOICE}". Available: ${Object.keys(VOICES).join(', ')}\n`)
  process.exit(1)
}

// --- Speech-to-text --------------------------------------------------------
// NOTE(review): init() is called without `await`; this assumes it is
// synchronous — confirm against ./lib/stt.mjs.
process.stderr.write(`[stt] loading whisper-${WHISPER_MODEL} on ${STT_PROVIDER}...\n`)
const stt = new Stt({ whisper_name: WHISPER_MODEL, provider: STT_PROVIDER })
stt.init()
process.stderr.write('[stt] ready\n')

// --- Text-to-speech --------------------------------------------------------
// NOTE(review): same assumption as above for Tts#init() — confirm against ./lib/tts.mjs.
process.stderr.write('[tts] loading Kokoro...\n')
const tts = new Tts({ voice: VOICE })
tts.init()
process.stderr.write('[tts] ready\n')

// --- Optional LLM cleanup --------------------------------------------------
// has_llm stays false when USE_LLM is off or when llm_available() reports false.
let has_llm = false
if (USE_LLM) {
  has_llm = await llm_available()
  if (has_llm) {
    const models = await list_models()
    // Only the first six model names are printed to keep the banner short.
    process.stderr.write(`[llm] ollama available. models: ${models.slice(0, 6).join(', ')}\n`)
  } else {
    process.stderr.write('[llm] ollama not reachable, skipping cleanup\n')
  }
}

process.stderr.write(`\n[ready] whisper=${WHISPER_MODEL} voice=${VOICE} llm=${has_llm}\n`)
process.stderr.write('[ready] speak into your microphone. Ctrl+C to stop.\n\n')

// --- Main loop -------------------------------------------------------------
// `speaking` is true while one utterance is being processed (cleanup + TTS).
// Any transcript delivered during that window is logged as [skipped] and dropped.
let speaking = false

const stop = stt.listen(async (raw_text) => {
  if (speaking) {
    process.stderr.write(`[skipped] ${raw_text}\n`)
    return
  }

  speaking = true
  try {
    process.stdout.write(`[raw] ${raw_text}\n`)

    let text = raw_text
    if (has_llm) {
      text = await cleanup(raw_text)
      // Print the cleaned-up text only when the LLM actually changed it.
      if (text !== raw_text) {
        process.stdout.write(`[llm] ${text}\n`)
      }
    }

    await tts.speak_streaming(text)
    process.stdout.write('\n')
  } catch (err) {
    // A thrown value is not guaranteed to be an Error (it may even be
    // null/undefined), so never dereference `.message` unguarded here.
    const reason = err?.message ?? String(err)
    process.stderr.write(`[error] ${reason}\n`)
  } finally {
    // Always release the flag so one failed utterance does not block the rest.
    speaking = false
  }
})

// --- Shutdown --------------------------------------------------------------
// On Ctrl+C, invoke the function returned by stt.listen() and exit with status 0.
process.on('SIGINT', () => {
  process.stderr.write('\n[voice] stopping\n')
  stop()
  process.exit(0)
})