Initial commit — voice pipeline experiment
STT (Silero VAD + Whisper via sherpa-onnx), Chatterbox TTS HTTP server, query completeness classifier (Ollama), multi-voice demo scripts, and planning docs. Kept as reference; clean rewrite planned in separate repos. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
85
demo-kokoro.mjs
Normal file
85
demo-kokoro.mjs
Normal file
@@ -0,0 +1,85 @@
|
||||
/**
 * Voice demo: microphone → Whisper STT → (optional LLM cleanup) → Kokoro TTS
 *
 * Environment variables:
 *   WHISPER_MODEL   default: base.en
 *                   options: tiny.en base.en small.en medium.en large-v3 large-v3-turbo
 *   VOICE           default: af_heart
 *                   options: af_heart af_bella af_nicole af_sarah af_sky am_adam am_michael
 *   STT_PROVIDER    default: cuda
 *   USE_LLM         set to 0 to disable (default: 1)
 *   OLLAMA_MODEL    default: phi3:mini
 */
|
||||
|
||||
import { Stt } from './lib/stt.mjs'
|
||||
import { Tts, VOICES } from './lib/tts.mjs'
|
||||
import { llm_available, list_models, cleanup } from './lib/llm.mjs'
|
||||
|
||||
// Runtime configuration, read once from the environment. Because `||` is
// used, an unset OR empty variable falls back to the default.
const WHISPER_MODEL = process.env.WHISPER_MODEL || 'base.en'
const VOICE = process.env.VOICE || 'af_heart'
const STT_PROVIDER = process.env.STT_PROVIDER || 'cuda'
// LLM cleanup stays enabled unless USE_LLM is exactly the string '0'.
const USE_LLM = process.env.USE_LLM !== '0'

// Reject unknown voices before any model is loaded. An own-property check is
// used instead of `in`, because `in` also matches keys inherited through the
// prototype chain (e.g. "constructor", "toString"), which would slip through
// validation as if they were voice names.
if (!Object.hasOwn(VOICES, VOICE)) {
  process.stderr.write(`[voice] unknown voice "${VOICE}". Available: ${Object.keys(VOICES).join(', ')}\n`)
  process.exit(1)
}
|
||||
|
||||
// --- Speech-to-text -------------------------------------------------------
// Progress goes to stderr; stdout is used further down for transcripts.
const stt_options = { whisper_name: WHISPER_MODEL, provider: STT_PROVIDER }
process.stderr.write(`[stt] loading whisper-${WHISPER_MODEL} on ${STT_PROVIDER}...\n`)
const stt = new Stt(stt_options)
stt.init()
process.stderr.write('[stt] ready\n')

// --- Text-to-speech -------------------------------------------------------
const tts_options = { voice: VOICE }
process.stderr.write('[tts] loading Kokoro...\n')
const tts = new Tts(tts_options)
tts.init()
process.stderr.write('[tts] ready\n')
|
||||
|
||||
// Whether transcripts are passed through the LLM cleanup step. Stays false
// when USE_LLM is off; otherwise it holds the result of the availability probe.
let has_llm = false

if (USE_LLM) {
  has_llm = await llm_available()

  if (!has_llm) {
    process.stderr.write('[llm] ollama not reachable, skipping cleanup\n')
  } else {
    // Only the first six model names are shown to keep the line short.
    const available = await list_models()
    const preview = available.slice(0, 6).join(', ')
    process.stderr.write(`[llm] ollama available. models: ${preview}\n`)
  }
}

// Startup banner summarising the effective configuration.
process.stderr.write(`\n[ready] whisper=${WHISPER_MODEL} voice=${VOICE} llm=${has_llm}\n`)
process.stderr.write('[ready] speak into your microphone. Ctrl+C to stop.\n\n')
|
||||
|
||||
// True while one utterance is being cleaned up and spoken. Transcripts that
// arrive during that window are logged as "[skipped]" and dropped, not queued.
// NOTE(review): presumably this keeps the demo from transcribing its own TTS
// output — confirm against the Stt/Tts implementations.
let speaking = false

/**
 * Format a caught value for logging.
 *
 * Rejections are not guaranteed to be Error instances; reading `.message`
 * directly prints "undefined" for strings/objects and throws for
 * null/undefined.
 *
 * @param {unknown} err - value caught from a try/catch
 * @returns {string} the error message when present, otherwise String(err)
 */
function describe_error(err) {
  return err?.message ?? String(err)
}

/**
 * Best-effort LLM cleanup of a transcript.
 *
 * A failing cleanup call must not cost the user the utterance: when the LLM
 * is unreachable at startup the demo simply skips cleanup, so a failure
 * mid-session is handled the same way by falling back to the raw transcript.
 *
 * @param {string} raw_text - transcript as passed to the listen callback
 * @returns {Promise<string>} result of cleanup(), or raw_text if it rejects
 */
async function clean_transcript(raw_text) {
  try {
    return await cleanup(raw_text)
  } catch (err) {
    process.stderr.write(`[llm] cleanup failed, using raw text: ${describe_error(err)}\n`)
    return raw_text
  }
}

// Start listening. The value returned by stt.listen is kept as `stop` and is
// invoked by the SIGINT handler. The callback receives one transcript string.
const stop = stt.listen(async (raw_text) => {
  if (speaking) {
    process.stderr.write(`[skipped] ${raw_text}\n`)
    return
  }
  speaking = true

  try {
    process.stdout.write(`[raw] ${raw_text}\n`)

    let text = raw_text
    if (has_llm) {
      text = await clean_transcript(raw_text)
      // Only echo the cleaned text when the LLM actually changed something.
      if (text !== raw_text) {
        process.stdout.write(`[llm] ${text}\n`)
      }
    }

    await tts.speak_streaming(text)
    process.stdout.write('\n')
  } catch (err) {
    // Never let the handler reject: report the failure and keep listening.
    process.stderr.write(`[error] ${describe_error(err)}\n`)
  } finally {
    // Always release the flag, otherwise every later utterance is skipped.
    speaking = false
  }
})
|
||||
|
||||
/**
 * Ctrl+C handler: announce shutdown on stderr, invoke the function returned
 * by stt.listen, then exit with status 0.
 */
function handle_sigint() {
  process.stderr.write('\n[voice] stopping\n')
  stop()
  process.exit(0)
}

process.on('SIGINT', handle_sigint)
|
||||
Reference in New Issue
Block a user