Files
claude-voice-experiment/demo-bark.mjs
mikael-lovqvists-claude-agent db8889aeed Initial commit — voice pipeline experiment
STT (Silero VAD + Whisper via sherpa-onnx), Chatterbox TTS HTTP server,
query completeness classifier (Ollama), multi-voice demo scripts, and
planning docs. Kept as reference; clean rewrite planned in separate repos.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-30 04:48:54 +00:00

86 lines
2.6 KiB
JavaScript

/**
* Voice demo: microphone → Whisper STT → (optional LLM cleanup) → Bark TTS
*
* Bark supports emphasis via UPPERCASE, paralinguistic tokens ([laughs], [sighs],
* [clears throat]), and hesitation (...). Markdown is preprocessed automatically:
* **bold** → UPPERCASE, headers get a pause, etc.
*
* Environment variables:
* WHISPER_MODEL default: base.en
* options: tiny.en base.en small.en medium.en large-v3 large-v3-turbo
* BARK_MODEL default: suno/bark (use suno/bark-small for faster/smaller)
* BARK_VOICE default: v2/en_speaker_6
* options: v2/en_speaker_0 .. v2/en_speaker_9
* STT_PROVIDER default: cuda
* USE_LLM set to 0 to disable the LLM cleanup step (default: 1; unset or any value other than 0 leaves it enabled)
* OLLAMA_MODEL default: phi3:mini
*/
import { Stt } from './lib/stt.mjs'
import { Bark_Tts } from './lib/bark-tts.mjs'
import { llm_available, list_models, cleanup } from './lib/llm.mjs'
/** Write one startup status line (newline appended) to stderr. */
function startup_log(line) {
  process.stderr.write(line + '\n')
}

// Settings read once from the environment; an unset or empty variable falls
// back to the default on the right-hand side.
const WHISPER_MODEL = process.env.WHISPER_MODEL || 'base.en'
const STT_PROVIDER = process.env.STT_PROVIDER || 'cuda'
// LLM cleanup stays enabled unless USE_LLM is exactly the string '0'.
const USE_LLM = process.env.USE_LLM !== '0'

// Speech-to-text engine.
const stt_options = { whisper_name: WHISPER_MODEL, provider: STT_PROVIDER }
startup_log('[stt] loading whisper-' + WHISPER_MODEL + ' on ' + STT_PROVIDER + '...')
const stt = new Stt(stt_options)
// NOTE(review): not awaited, unlike tts.init() below — presumably synchronous;
// confirm against lib/stt.mjs.
stt.init()
startup_log('[stt] ready')

// Text-to-speech engine; initialisation is awaited before continuing.
startup_log('[tts] loading Bark (this takes a moment)...')
const tts = new Bark_Tts()
await tts.init()
startup_log('[tts] Bark ready')
// Whether the optional LLM cleanup step will be applied to transcripts.
// Stays false when USE_LLM is off or when llm_available() reports false.
let has_llm = false
if (USE_LLM) {
  has_llm = await llm_available()
  if (has_llm) {
    // The model list is only printed for information, so a failure while
    // fetching or formatting it must not abort startup of the whole demo.
    try {
      const models = await list_models()
      // Only the first six model names are shown to keep the line short.
      process.stderr.write(`[llm] ollama available. models: ${models.slice(0, 6).join(', ')}\n`)
    } catch (err) {
      process.stderr.write(`[llm] ollama available, but listing models failed: ${err?.message ?? String(err)}\n`)
    }
  } else {
    process.stderr.write('[llm] ollama not reachable, skipping cleanup\n')
  }
}

// Summary banner. BARK_MODEL is read here only for display, with the same
// 'suno/bark' fallback the header comment documents.
process.stderr.write(`\n[ready] whisper=${WHISPER_MODEL} bark=${process.env.BARK_MODEL || 'suno/bark'} llm=${has_llm}\n`)
process.stderr.write('[ready] speak into your microphone. Ctrl+C to stop.\n\n')
// Busy flag: true from the moment an utterance is accepted until its
// processing (cleanup + speech output) has finished or failed.
let speaking = false

/**
 * Register the per-utterance callback with the STT engine.
 *
 * For each text delivered by stt.listen (presumably one transcribed
 * utterance — confirm against lib/stt.mjs) the callback:
 *   1. drops it with a "[skipped]" note on stderr if a previous one is
 *      still being processed,
 *   2. echoes it to stdout as "[raw] ...",
 *   3. runs it through cleanup() when has_llm is true, echoing the result
 *      as "[llm] ..." only when it differs from the raw text,
 *   4. passes the resulting text to tts.speak_streaming().
 *
 * Errors from any step are reported on stderr and never propagate out of
 * the callback; the busy flag is always cleared afterwards.
 *
 * `stop` holds whatever stt.listen returns; the SIGINT handler calls it.
 */
const stop = stt.listen(async (raw_text) => {
  if (speaking) {
    process.stderr.write(`[skipped] ${raw_text}\n`)
    return
  }

  speaking = true
  try {
    process.stdout.write(`[raw] ${raw_text}\n`)

    let text = raw_text
    if (has_llm) {
      text = await cleanup(raw_text)
      if (text !== raw_text) {
        process.stdout.write(`[llm] ${text}\n`)
      }
    }

    await tts.speak_streaming(text)
    process.stdout.write('\n')
  } catch (err) {
    // A thrown value is not guaranteed to be an Error. Reading `.message`
    // off null/undefined would throw inside this handler and turn into an
    // unhandled rejection, so fall back to the stringified value instead.
    process.stderr.write(`[error] ${err?.message ?? String(err)}\n`)
  } finally {
    speaking = false
  }
})
/**
 * Ctrl+C handler: announce the shutdown on stderr, invoke the function
 * returned by stt.listen, stop the TTS engine, then exit with status 0.
 */
function on_sigint() {
  process.stderr.write('\n[voice] stopping\n')
  stop()
  // NOTE(review): the result of tts.stop() is not awaited before exiting —
  // presumably synchronous; confirm against lib/bark-tts.mjs.
  tts.stop()
  process.exit(0)
}

process.on('SIGINT', on_sigint)