/**
 * Text-to-Speech using sherpa-onnx-node with the Kokoro model.
 *
 * Model layout expected under models_dir:
 *   kokoro-en-v0_19/
 *     model.onnx
 *     voices.bin
 *     tokens.txt
 *     lexicon-us-en.txt
 *     espeak-ng-data/
 *
 * Audio output goes to pacat (PulseAudio) as float32le, mono, at the sample
 * rate reported by the synthesizer (24kHz for Kokoro).
 * speak_streaming() splits on sentence boundaries and synthesizes + plays one
 * sentence at a time, so playback starts once the first sentence has been
 * synthesized instead of after the whole text. Synthesis of the next sentence
 * starts only after the previous one has finished playing.
 */

import { spawn } from 'node:child_process'
import { createRequire } from 'node:module'
import * as path from 'node:path'
import { fileURLToPath } from 'node:url'

const require = createRequire(import.meta.url)

// Same directory as import.meta.dirname, but also available on Node versions
// that do not define import.meta.dirname.
const MODULE_DIR = path.dirname(fileURLToPath(import.meta.url))
const DEFAULT_MODELS_DIR = path.join(MODULE_DIR, '..', 'models')

// Whitespace that directly follows . ! or ? — the lookbehind keeps the
// punctuation attached to the preceding sentence.
const SENTENCE_BOUNDARY = /(?<=[.!?])\s+/

// Speaker IDs in kokoro-en-v0_19
export const VOICES = {
  af_heart: 0,
  af_bella: 1,
  af_nicole: 2,
  af_sarah: 3,
  af_sky: 4,
  am_adam: 5,
  am_michael: 6,
}

export class Tts {
  /**
   * @param {object} [options]
   * @param {string} [options.models_dir] Directory containing `kokoro-en-v0_19/`.
   * @param {string} [options.voice] Key of VOICES; unknown names fall back to speaker 0.
   * @param {number} [options.speed] Speed value forwarded to the synthesizer.
   * @param {string} [options.provider] Execution provider forwarded to sherpa-onnx.
   */
  constructor({
    models_dir = DEFAULT_MODELS_DIR,
    voice = 'af_heart',
    speed = 1.0,
    provider = 'cpu', // Kokoro synthesis is fast enough on CPU
  } = {}) {
    this._kokoro_dir = path.join(models_dir, 'kokoro-en-v0_19')
    this._voice = voice
    this._speed = speed
    this._provider = provider
    this._tts = null
  }

  /**
   * Load the Kokoro model. Must be called before speak() / speak_streaming().
   *
   * sherpa-onnx-node is required here rather than at module load, so merely
   * importing this module does not load the addon.
   */
  init() {
    const sherpa = require('sherpa-onnx-node')
    const { _kokoro_dir: dir, _provider } = this

    this._tts = new sherpa.OfflineTts({
      model: {
        kokoro: {
          model: path.join(dir, 'model.onnx'),
          voices: path.join(dir, 'voices.bin'),
          tokens: path.join(dir, 'tokens.txt'),
          dataDir: path.join(dir, 'espeak-ng-data'),
        },
      },
      maxNumSentences: 2,
      numThreads: 2,
      debug: false,
      provider: _provider,
    })
  }

  /**
   * Synthesize and play a single piece of text. Resolves when playback ends.
   *
   * @param {string} text Text to speak.
   * @returns {Promise<void>}
   * @throws {Error} If init() has not been called yet.
   */
  async speak(text) {
    if (this._tts === null) {
      throw new Error('Tts.init() must be called before speak()')
    }

    const sid = VOICES[this._voice] ?? 0
    const audio = this._tts.generate({ text, sid, speed: this._speed })
    await this._play(audio.samples, audio.sampleRate)
  }

  /**
   * Split text on sentence boundaries and synthesize + play each sentence
   * in sequence. Lower latency than waiting for full synthesis first.
   * Sentences that are empty after trimming are skipped.
   *
   * @param {string} text Text to speak.
   * @returns {Promise<void>}
   */
  async speak_streaming(text) {
    for (const sentence of split_sentences(text)) {
      const trimmed = sentence.trim()
      if (trimmed) {
        await this.speak(trimmed)
      }
    }
  }

  /**
   * Pipe samples into a `pacat` child process as mono float32le.
   *
   * @param {Float32Array} samples Samples to play.
   * @param {number} sample_rate Value passed to pacat's --rate option.
   * @returns {Promise<number|null>} Resolves with pacat's exit code when the
   *   process closes; rejects if pacat cannot be spawned or the write to its
   *   stdin fails.
   */
  _play(samples, sample_rate) {
    return new Promise((resolve, reject) => {
      const proc = spawn('pacat', [
        '--format=float32le',
        `--rate=${sample_rate}`,
        '--channels=1',
      ], { stdio: ['pipe', 'ignore', 'inherit'] })

      proc.on('error', reject)
      // Without a listener, a failed write (e.g. EPIPE when pacat exits
      // early) would surface as an unhandled 'error' event.
      proc.stdin.on('error', reject)
      proc.on('close', resolve)

      proc.stdin.write(samples_to_buffer(samples))
      proc.stdin.end()
    })
  }
}

/**
 * Wrap exactly the bytes covered by a typed array in a Buffer (no copy).
 *
 * Buffer.from(samples.buffer) alone would expose the whole backing
 * ArrayBuffer, which is wrong when `samples` is a view onto a larger buffer.
 *
 * @param {Float32Array} samples
 * @returns {Buffer}
 */
function samples_to_buffer(samples) {
  return Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength)
}

/**
 * Split text into sentences.
 *
 * @param {string} text
 * @returns {string[]} Pieces of `text`, each keeping its trailing . ! or ?
 */
function split_sentences(text) {
  // Split after . ! ? — keep the punctuation with the preceding sentence
  return text.split(SENTENCE_BOUNDARY)
}