// claude-voice-experiment/lib/tts.mjs

/**
 * Text-to-Speech using sherpa-onnx-node with the Kokoro model.
 *
 * Model layout expected under models_dir:
 *   kokoro-en-v0_19/
 *     model.onnx
 *     voices.bin
 *     tokens.txt
 *     lexicon-us-en.txt
 *     espeak-ng-data/
 *
 * Audio output goes to pacat (PulseAudio) as float32le @ 24kHz.
 * speak_streaming() splits on sentence boundaries and synthesizes + plays one
 * sentence at a time, so the first sentence starts playing before the rest
 * have been synthesized.
 */

import { spawn } from 'node:child_process'
import { createRequire } from 'node:module'
import * as path from 'node:path'

// This file is an ES module, so build a CommonJS-style require() for it;
// init() uses it to load sherpa-onnx-node.
const require = createRequire(import.meta.url)
// Default model root: the `models` directory one level above this file's directory.
const DEFAULT_MODELS_DIR = path.join(import.meta.dirname, '..', 'models')

// Voice name -> speaker ID for kokoro-en-v0_19.
// Each name's ID is its position in the list below, so order matters.
export const VOICES = Object.fromEntries(
	[
		'af_heart',
		'af_bella',
		'af_nicole',
		'af_sarah',
		'af_sky',
		'am_adam',
		'am_michael',
	].map((name, sid) => [name, sid])
)

/**
 * Kokoro text-to-speech engine that plays its output through `pacat`.
 *
 * Usage: construct, call init() once to load the model, then call speak()
 * or speak_streaming().
 */
export class Tts {
	/**
	 * @param {object} [options]
	 * @param {string} [options.models_dir] Directory containing `kokoro-en-v0_19/`.
	 * @param {string} [options.voice] Key of VOICES; unknown names fall back to speaker 0.
	 * @param {number} [options.speed] Speed value passed to the synthesizer.
	 * @param {string} [options.provider] Provider value passed to sherpa.OfflineTts.
	 */
	constructor({
		models_dir = DEFAULT_MODELS_DIR,
		voice      = 'af_heart',
		speed      = 1.0,
		provider   = 'cpu',   // Kokoro synthesis is fast enough on CPU
	} = {}) {
		this._kokoro_dir = path.join(models_dir, 'kokoro-en-v0_19')
		this._voice      = voice
		this._speed      = speed
		this._provider   = provider
		this._tts        = null   // set by init()
	}

	/**
	 * Load sherpa-onnx-node and create the OfflineTts instance from the files
	 * in the Kokoro model directory. Must be called before speak().
	 */
	init() {
		const sherpa = require('sherpa-onnx-node')
		const { _kokoro_dir: dir, _provider } = this

		this._tts = new sherpa.OfflineTts({
			model: {
				kokoro: {
					model:   path.join(dir, 'model.onnx'),
					voices:  path.join(dir, 'voices.bin'),
					tokens:  path.join(dir, 'tokens.txt'),
					dataDir: path.join(dir, 'espeak-ng-data'),
				},
			},
			maxNumSentences: 2,
			numThreads:      2,
			debug:           false,
			provider:        _provider,
		})
	}

	/**
	 * Synthesize and play a single piece of text. Resolves when playback ends.
	 *
	 * @param {string} text Text to speak.
	 * @throws {Error} If init() has not been called, or if playback fails.
	 */
	async speak(text) {
		if (this._tts === null) {
			throw new Error('Tts.speak() called before init()')
		}
		// Unknown voice names fall back to speaker 0.
		const sid   = VOICES[this._voice] ?? 0
		const audio = this._tts.generate({ text, sid, speed: this._speed })
		await this._play(audio.samples, audio.sampleRate)
	}

	/**
	 * Split text on sentence boundaries and synthesize + play each sentence
	 * in sequence. Lower latency than waiting for full synthesis first.
	 *
	 * @param {string} text Text to speak; whitespace-only pieces are skipped.
	 */
	async speak_streaming(text) {
		for (const sentence of split_sentences(text)) {
			const trimmed = sentence.trim()
			if (trimmed) {
				await this.speak(trimmed)
			}
		}
	}

	/**
	 * Expose a typed array of samples as raw bytes without copying.
	 *
	 * Honors byteOffset/byteLength so that a view into a larger ArrayBuffer
	 * yields only its own samples rather than the whole backing buffer.
	 *
	 * @param {Float32Array} samples
	 * @returns {Buffer}
	 */
	_sample_bytes(samples) {
		return Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength)
	}

	/**
	 * Pipe samples to a `pacat` child process as mono float32le audio.
	 *
	 * @param {Float32Array} samples
	 * @param {number} sample_rate Passed to pacat as `--rate`.
	 * @returns {Promise<void>} Resolves when pacat exits with code 0; rejects
	 *   if it cannot be spawned, exits non-zero, is killed by a signal, or the
	 *   write to its stdin fails.
	 */
	_play(samples, sample_rate) {
		return new Promise((resolve, reject) => {
			const proc = spawn('pacat', [
				'--format=float32le',
				`--rate=${sample_rate}`,
				'--channels=1',
			], { stdio: ['pipe', 'ignore', 'inherit'] })

			// Without a listener, a failed write (e.g. EPIPE when pacat exits
			// early) is an unhandled 'error' event and takes the process down.
			// Remember it and let the 'close' handler report the failure.
			let pipe_error = null
			proc.stdin.on('error', (err) => { pipe_error = err })

			proc.on('error', reject)
			proc.on('close', (code, signal) => {
				if (code === 0 && pipe_error === null) {
					resolve()
					return
				}
				const how = signal ? `was killed by ${signal}` : `exited with code ${code}`
				reject(new Error(`pacat ${how}`, { cause: pipe_error ?? undefined }))
			})

			proc.stdin.write(this._sample_bytes(samples))
			proc.stdin.end()
		})
	}
}

/**
 * Break text into sentences at runs of whitespace that follow `.`, `!` or `?`.
 * The punctuation stays with the sentence before it; the whitespace is dropped.
 */
function split_sentences(text) {
	const boundary = /(?<=[.!?])\s+/g
	const pieces = []
	let start = 0
	for (const gap of text.matchAll(boundary)) {
		pieces.push(text.slice(start, gap.index))
		start = gap.index + gap[0].length
	}
	// Whatever follows the last boundary (possibly the empty string).
	pieces.push(text.slice(start))
	return pieces
}