STT (Silero VAD + Whisper via sherpa-onnx), Chatterbox TTS HTTP server, query completeness classifier (Ollama), multi-voice demo scripts, and planning docs. Kept as reference; clean rewrite planned in separate repos. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
108 lines
2.7 KiB
JavaScript
108 lines
2.7 KiB
JavaScript
/**
|
|
* Text-to-Speech using sherpa-onnx-node with the Kokoro model.
|
|
*
|
|
* Model layout expected under models_dir:
|
|
* kokoro-en-v0_19/
|
|
* model.onnx
|
|
* voices.bin
|
|
* tokens.txt
|
|
* lexicon-us-en.txt
|
|
* espeak-ng-data/
|
|
*
|
|
* Audio output goes to pacat (PulseAudio) as float32le @ 24kHz.
|
|
* speak_streaming() splits on sentence boundaries and synthesizes + plays one
* sentence at a time, so playback of the first sentence starts without waiting
* for the whole text to be synthesized.
|
|
*/
|
|
|
|
import { spawn } from 'node:child_process'
|
|
import { createRequire } from 'node:module'
|
|
import * as path from 'node:path'
|
|
|
|
// ES modules have no built-in `require`; create one that resolves relative to
// this file. Tts.init() uses it to load 'sherpa-onnx-node'.
const require = createRequire(import.meta.url)
|
|
// Default models directory: `models/` one level above this file's directory.
// Used when the Tts constructor is not given `models_dir`.
const DEFAULT_MODELS_DIR = path.join(import.meta.dirname, '..', 'models')
|
|
|
|
// Voice name -> speaker ID (`sid`) for kokoro-en-v0_19. A voice's ID is its
// position in the list below, so the order must not change.
// NOTE(review): upstream's speaker list for this model appears to call
// speaker 0 `af` rather than `af_heart` — confirm against the model docs.
export const VOICES = Object.fromEntries(
  [
    'af_heart',
    'af_bella',
    'af_nicole',
    'af_sarah',
    'af_sky',
    'am_adam',
    'am_michael',
  ].map((voice_name, sid) => [voice_name, sid]),
)
|
|
|
|
/**
 * Kokoro text-to-speech via sherpa-onnx-node, played back through a `pacat`
 * child process.
 *
 * Usage: construct, call init() once to load the model, then call speak() or
 * speak_streaming().
 */
export class Tts {
  /**
   * @param {object} [options]
   * @param {string} [options.models_dir] - Directory containing `kokoro-en-v0_19/`.
   * @param {string} [options.voice] - A key of VOICES; unknown names fall back to speaker 0.
   * @param {number} [options.speed] - Forwarded to generate() as `speed`.
   * @param {string} [options.provider] - Forwarded to OfflineTts as the model `provider`.
   */
  constructor({
    models_dir = DEFAULT_MODELS_DIR,
    voice = 'af_heart',
    speed = 1.0,
    provider = 'cpu', // Kokoro synthesis is fast enough on CPU
  } = {}) {
    this._kokoro_dir = path.join(models_dir, 'kokoro-en-v0_19')
    this._voice = voice
    this._speed = speed
    this._provider = provider
    this._tts = null // set by init()
  }

  /**
   * Load the Kokoro model. Must be called before speak() / speak_streaming().
   * The sherpa-onnx-node module is required here rather than at import time,
   * so constructing a Tts does not load it.
   */
  init() {
    const sherpa = require('sherpa-onnx-node')
    const { _kokoro_dir: dir, _provider } = this

    this._tts = new sherpa.OfflineTts({
      model: {
        kokoro: {
          model: path.join(dir, 'model.onnx'),
          voices: path.join(dir, 'voices.bin'),
          tokens: path.join(dir, 'tokens.txt'),
          dataDir: path.join(dir, 'espeak-ng-data'),
        },
        // numThreads / debug / provider are model-level settings in the
        // sherpa-onnx TTS config; placed next to maxNumSentences they are
        // not picked up and the `provider` option has no effect.
        numThreads: 2,
        debug: false,
        provider: _provider,
      },
      maxNumSentences: 2,
    })
  }

  /**
   * Synthesize and play a single piece of text. Resolves when playback ends.
   *
   * @param {string} text
   * @returns {Promise<void>}
   * @throws {Error} (as a rejection) if init() has not been called, or if
   *   playback fails (see _play()).
   */
  async speak(text) {
    if (!this._tts) {
      throw new Error('Tts.speak() called before init()')
    }
    // Own-property check so names such as 'constructor' cannot resolve to
    // something inherited from Object.prototype; unknown voices use speaker 0.
    const sid = Object.hasOwn(VOICES, this._voice) ? VOICES[this._voice] : 0
    const audio = this._tts.generate({ text, sid, speed: this._speed })
    await this._play(audio.samples, audio.sampleRate)
  }

  /**
   * Split text on sentence boundaries and synthesize + play each sentence
   * in sequence. Playback of the first sentence starts without waiting for
   * the whole text to be synthesized; whitespace-only pieces are skipped.
   *
   * @param {string} text
   * @returns {Promise<void>}
   */
  async speak_streaming(text) {
    for (const sentence of split_sentences(text)) {
      const trimmed = sentence.trim()
      if (trimmed) {
        await this.speak(trimmed)
      }
    }
  }

  /**
   * Pipe samples to `pacat` as mono float32le at the given rate.
   *
   * @param {Float32Array} samples
   * @param {number} sample_rate
   * @returns {Promise<number|null>} Resolves with pacat's exit code once it
   *   has closed. Rejects if pacat cannot be spawned, if writing to its stdin
   *   fails, or if it exits with a non-zero code.
   */
  _play(samples, sample_rate) {
    return new Promise((resolve, reject) => {
      const proc = spawn('pacat', [
        '--format=float32le',
        `--rate=${sample_rate}`,
        '--channels=1',
      ], { stdio: ['pipe', 'ignore', 'inherit'] })

      proc.on('error', reject)
      proc.on('close', (code, signal) => {
        // Termination by signal still resolves (as before); only an explicit
        // non-zero exit status is reported as a playback failure.
        if (code === 0 || signal !== null) {
          resolve(code)
        } else {
          reject(new Error(`pacat exited with code ${code}`))
        }
      })
      // Without a listener, a write failure (e.g. EPIPE when pacat dies
      // early) is an unhandled 'error' event and takes the process down.
      proc.stdin.on('error', reject)
      proc.stdin.end(Tts._pcm_bytes(samples))
    })
  }

  /**
   * View a typed array's bytes as a Buffer without copying. Honors
   * byteOffset/byteLength so a view into a larger ArrayBuffer yields only
   * its own samples. Bytes are in host byte order.
   *
   * @param {Float32Array} samples
   * @returns {Buffer}
   */
  static _pcm_bytes(samples) {
    return Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength)
  }
}
|
|
|
|
/**
 * Break text into sentences.
 *
 * A break happens at any run of whitespace that directly follows `.`, `!`
 * or `?`. The whitespace run is dropped and the punctuation stays attached
 * to the sentence it ends.
 *
 * @param {string} text
 * @returns {string[]} The pieces in order; may contain empty strings (e.g.
 *   for empty input or trailing whitespace after punctuation).
 */
function split_sentences(text) {
  const sentence_gap = /(?<=[.!?])\s+/
  const pieces = text.split(sentence_gap)
  return pieces
}
|