/**
 * Voice demo: microphone → Whisper STT → (optional LLM cleanup) → Bark TTS
 *
 * Bark supports emphasis via UPPERCASE, paralinguistic tokens ([laughs], [sighs],
 * [clears throat]), and hesitation (...). Markdown is preprocessed automatically:
 * **bold** → UPPERCASE, headers get a pause, etc.
 *
 * Environment variables:
 *   WHISPER_MODEL   default: base.en
 *                   options: tiny.en base.en small.en medium.en large-v3 large-v3-turbo
 *   BARK_MODEL      default: suno/bark (use suno/bark-small for faster/smaller)
 *   BARK_VOICE      default: v2/en_speaker_6
 *                   options: v2/en_speaker_0 .. v2/en_speaker_9
 *   STT_PROVIDER    default: cuda
 *   USE_LLM         set to 0 to disable (default: 1)
 *   OLLAMA_MODEL    default: phi3:mini
 */

import { Stt } from './lib/stt.mjs'
import { Bark_Tts } from './lib/bark-tts.mjs'
import { llm_available, list_models, cleanup } from './lib/llm.mjs'

// `||` (not `??`) on purpose: an empty-string env var also falls back to the default.
const WHISPER_MODEL = process.env.WHISPER_MODEL || 'base.en'
const STT_PROVIDER = process.env.STT_PROVIDER || 'cuda'
// Only the exact string '0' disables the LLM step; unset or any other value enables it.
const USE_LLM = process.env.USE_LLM !== '0'

// --- Speech-to-text -------------------------------------------------------
process.stderr.write(`[stt] loading whisper-${WHISPER_MODEL} on ${STT_PROVIDER}...\n`)
const stt = new Stt({ whisper_name: WHISPER_MODEL, provider: STT_PROVIDER })
// Awaited for parity with tts.init(). If init() is synchronous this is a no-op;
// if it returns a promise, "[stt] ready" is no longer printed before it settles.
await stt.init()
process.stderr.write('[stt] ready\n')

// --- Text-to-speech -------------------------------------------------------
process.stderr.write('[tts] loading Bark (this takes a moment)...\n')
const tts = new Bark_Tts()
await tts.init()
process.stderr.write('[tts] Bark ready\n')

// --- Optional LLM cleanup -------------------------------------------------
// has_llm stays false when USE_LLM is off or when llm_available() reports false.
let has_llm = false
if (USE_LLM) {
  has_llm = await llm_available()
  if (has_llm) {
    const models = await list_models()
    // Only the first six model names are shown to keep the banner short.
    process.stderr.write(`[llm] ollama available. models: ${models.slice(0, 6).join(', ')}\n`)
  } else {
    process.stderr.write('[llm] ollama not reachable, skipping cleanup\n')
  }
}

process.stderr.write(`\n[ready] whisper=${WHISPER_MODEL} bark=${process.env.BARK_MODEL || 'suno/bark'} llm=${has_llm}\n`)
process.stderr.write('[ready] speak into your microphone. Ctrl+C to stop.\n\n')

// True while one transcript is being cleaned up and spoken. Transcripts that
// arrive in the meantime are logged as skipped and dropped, not queued.
let speaking = false

// stt.listen() returns a function; it is called on SIGINT below to stop listening.
const stop = stt.listen(async (raw_text) => {
  if (speaking) {
    process.stderr.write(`[skipped] ${raw_text}\n`)
    return
  }

  speaking = true
  try {
    process.stdout.write(`[raw] ${raw_text}\n`)

    let text = raw_text
    if (has_llm) {
      text = await cleanup(raw_text)
      // Echo the cleaned text only when the LLM actually changed something.
      if (text !== raw_text) {
        process.stdout.write(`[llm] ${text}\n`)
      }
    }

    await tts.speak_streaming(text)
    process.stdout.write('\n')
  } catch (err) {
    // Non-Error throws (strings, null, undefined) must not crash the handler itself.
    process.stderr.write(`[error] ${err?.message ?? String(err)}\n`)
  } finally {
    // Always release the flag so one failed utterance does not block all later ones.
    speaking = false
  }
})

// Ctrl+C: stop listening, stop TTS, then exit with status 0.
process.on('SIGINT', () => {
  process.stderr.write('\n[voice] stopping\n')
  stop()
  tts.stop()
  process.exit(0)
})