Files
claude-voice-experiment/lib/tts.mjs
mikael-lovqvists-claude-agent db8889aeed Initial commit — voice pipeline experiment
STT (Silero VAD + Whisper via sherpa-onnx), Chatterbox TTS HTTP server,
query completeness classifier (Ollama), multi-voice demo scripts, and
planning docs. Kept as reference; clean rewrite planned in separate repos.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-30 04:48:54 +00:00

108 lines
2.7 KiB
JavaScript

/**
* Text-to-Speech using sherpa-onnx-node with the Kokoro model.
*
* Model layout expected under models_dir:
* kokoro-en-v0_19/
* model.onnx
* voices.bin
* tokens.txt
* lexicon-us-en.txt
* espeak-ng-data/
*
* Audio output goes to pacat (PulseAudio) as float32le @ 24kHz.
* speak_streaming() splits on sentence boundaries and synthesizes + plays one
* sentence at a time, so the first sentence starts playing before the
* remaining sentences have been synthesized.
*/
import { spawn } from 'node:child_process'
import { createRequire } from 'node:module'
import * as path from 'node:path'
const require = createRequire(import.meta.url)
const DEFAULT_MODELS_DIR = path.join(import.meta.dirname, '..', 'models')
// Voice name -> speaker id (`sid`) used with kokoro-en-v0_19.
// A voice's id is its position in the list below, so the order matters.
// NOTE(review): the upstream model docs appear to name speaker 0 `af` and to
// list further speakers after id 6 — confirm before renaming or extending.
export const VOICES = Object.fromEntries(
  [
    'af_heart',
    'af_bella',
    'af_nicole',
    'af_sarah',
    'af_sky',
    'am_adam',
    'am_michael',
  ].map((voice_name, sid) => [voice_name, sid]),
)
/**
 * Kokoro text-to-speech through sherpa-onnx-node, played back with `pacat`.
 *
 * Lifecycle: construct, call init() once to load the model, then use
 * speak() or speak_streaming().
 */
export class Tts {
  /**
   * @param {object} [options]
   * @param {string} [options.models_dir] Directory that contains `kokoro-en-v0_19/`.
   * @param {string} [options.voice] Key of VOICES; an unknown name falls back to speaker 0.
   * @param {number} [options.speed] Speed value handed to the synthesizer.
   * @param {string} [options.provider] Execution provider handed to sherpa-onnx.
   */
  constructor({
    models_dir = DEFAULT_MODELS_DIR,
    voice = 'af_heart',
    speed = 1.0,
    provider = 'cpu', // Kokoro synthesis is fast enough on CPU
  } = {}) {
    this._kokoro_dir = path.join(models_dir, 'kokoro-en-v0_19')
    this._voice = voice
    this._speed = speed
    this._provider = provider
    this._tts = null
  }

  /** Load sherpa-onnx-node and create the OfflineTts engine. Call before speaking. */
  init() {
    const sherpa = require('sherpa-onnx-node')
    const { _kokoro_dir: dir, _provider } = this
    this._tts = new sherpa.OfflineTts({
      model: {
        kokoro: {
          model: path.join(dir, 'model.onnx'),
          voices: path.join(dir, 'voices.bin'),
          tokens: path.join(dir, 'tokens.txt'),
          dataDir: path.join(dir, 'espeak-ng-data'),
        },
      },
      maxNumSentences: 2,
      numThreads: 2,
      debug: false,
      provider: _provider,
    })
  }

  /**
   * Synthesize and play a single piece of text. Resolves when playback ends.
   *
   * @param {string} text
   * @throws {Error} If init() has not been called yet.
   */
  async speak(text) {
    // Fail with an actionable message instead of "cannot read properties of null".
    if (this._tts === null) {
      throw new Error('Tts.init() must be called before speak()')
    }
    const sid = VOICES[this._voice] ?? 0
    const audio = this._tts.generate({ text, sid, speed: this._speed })
    await this._play(audio.samples, audio.sampleRate)
  }

  /**
   * Split text on sentence boundaries and synthesize + play each sentence
   * in sequence. Lower latency than waiting for full synthesis first.
   *
   * @param {string} text
   */
  async speak_streaming(text) {
    for (const sentence of split_sentences(text)) {
      const trimmed = sentence.trim()
      if (trimmed) {
        await this.speak(trimmed)
      }
    }
  }

  /**
   * Pipe mono float32 samples into `pacat` and wait for it to exit.
   *
   * @param {Float32Array} samples
   * @param {number} sample_rate Passed to pacat as `--rate`.
   * @returns {Promise<number|null>} Resolves with pacat's exit code (0, or
   *   null when it was ended by a signal); rejects if pacat cannot be
   *   spawned, its stdin fails, or it exits with a non-zero code.
   */
  _play(samples, sample_rate) {
    return new Promise((resolve, reject) => {
      const proc = spawn('pacat', [
        '--format=float32le',
        `--rate=${sample_rate}`,
        '--channels=1',
      ], { stdio: ['pipe', 'ignore', 'inherit'] })
      proc.on('error', reject)
      proc.on('close', (code) => {
        if (code) {
          reject(new Error(`pacat exited with code ${code}`))
        } else {
          resolve(code)
        }
      })

      const { stdin } = proc
      if (!stdin) {
        // Spawn failed before the pipe existed; the 'error' event rejects.
        return
      }
      // A stream 'error' with no listener (e.g. EPIPE when pacat dies early)
      // would be thrown as an uncaught exception; surface it as a rejection.
      stdin.on('error', reject)
      stdin.end(Tts._pcm_bytes(samples))
    })
  }

  /**
   * Expose the bytes of a typed array as a Buffer without copying.
   * Honors byteOffset/byteLength so that a typed array which is a window
   * into a larger ArrayBuffer contributes only its own samples.
   *
   * @param {Float32Array} samples
   * @returns {Buffer}
   */
  static _pcm_bytes(samples) {
    return Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength)
  }
}
/**
 * Break text into sentences at any whitespace run that directly follows
 * `.`, `!` or `?`. The punctuation stays attached to the sentence it ends;
 * the separating whitespace is dropped.
 *
 * @param {string} text
 * @returns {string[]} At least one element; the whole text when nothing splits.
 */
function split_sentences(text) {
  const sentences = []
  let start = 0
  for (const gap of text.matchAll(/(?<=[.!?])\s+/g)) {
    sentences.push(text.slice(start, gap.index))
    start = gap.index + gap[0].length
  }
  // Whatever follows the last boundary (possibly an empty string).
  sentences.push(text.slice(start))
  return sentences
}