// claude-voice-experiment/lib/chatterbox-tts.mjs

/**
 * Chatterbox TTS — Node.js wrapper around chatterbox-server.py.
 *
 * Usage:
 *   const tts = new Chatterbox_Tts()
 *   await tts.init()
 *   await tts.speak('Hello! [chuckle] That is funny.')
 *   await tts.speak('Something intense.', { exaggeration: 0.8 })  // full model only
 *   tts.stop()
 *
 * Paralinguistic tags (embed in text):
 *   [laugh] [chuckle] [cough] [clear throat] [sigh] [shush] [groan] [sniff] [gasp]
 *
 * Generation options (passed as second arg to speak()):
 *   temperature         0.05–2.0   default 0.8
 *   top_p               0–1        default 0.95
 *   top_k               0–1000     default 1000
 *   repetition_penalty  ≥1.0       default 1.2
 *   min_p               0–1        default 0.0
 *   audio_prompt        string     path to reference WAV for voice cloning
 *   exaggeration        0–1        emotion intensity (full model only)
 *   cfg_weight          0–1        classifier-free guidance (full model only)
 */

import { spawn }     from 'node:child_process'
import * as path     from 'node:path'
import * as readline from 'node:readline'
import { markdown_to_bark as markdown_to_speech, split_sentences } from './markdown.mjs'

const SERVER = path.join(import.meta.dirname, '..', 'chatterbox-server.py')

/**
 * Client for a long-lived chatterbox-server.py child process.
 *
 * Protocol, as used by this class: each request is one JSON object written as
 * a single line to the server's stdin. The server prints `ready` on stdout
 * once it can accept requests, and one `ok` line per handled request.
 *
 * Requests are matched to `ok` lines first-in-first-out, so several speak()
 * calls may be outstanding at once.
 * NOTE(review): this assumes the server answers requests in the order it
 * received them — confirm against chatterbox-server.py.
 */
export class Chatterbox_Tts {
    /**
     * @param {object} [options]
     * @param {string} [options.variant='turbo']  Model variant, 'turbo' or 'full';
     *     passed to the server as its only command-line argument.
     */
    constructor({
        variant = 'turbo',  // 'turbo' or 'full'
    } = {}) {
        this._variant = variant
        this._proc    = null  // ChildProcess while a server is attached
        this._rl      = null  // readline interface over the server's stdout
        this._pending = []    // FIFO of { resolve, reject } waiting for an 'ok' line
    }

    /**
     * Start the server and wait until it reports `ready`.
     *
     * The script is spawned directly (no interpreter prefix), so it has to be
     * executable on its own. Its stderr is inherited by this process.
     *
     * @returns {Promise<void>} Resolves on the `ready` line. Rejects if the
     *     process cannot be spawned or exits before becoming ready.
     */
    init() {
        // Re-initialising must not orphan a server that is already running.
        if (this._proc) this.stop()

        return new Promise((resolve, reject) => {
            const proc = spawn(SERVER, [this._variant], {
                stdio: ['pipe', 'pipe', 'inherit'],
            })
            this._proc = proc

            // Every handler below checks `this._proc === proc` so that late
            // events from a stopped process cannot disturb a newer one.
            const fail = (err) => {
                if (this._proc === proc) {
                    this._rl?.close()
                    this._rl   = null
                    this._proc = null
                    this._settle_pending(err)
                }
                // No-op once init() has already resolved.
                reject(err)
            }

            proc.on('error', fail)
            proc.on('close', (code, signal) => {
                if (code !== null && code !== 0) {
                    process.stderr.write(`[chatterbox] server exited with code ${code}\n`)
                }
                fail(new Error(`chatterbox server exited (code ${code}, signal ${signal})`))
            })

            // Without a listener, a write to a dead server (EPIPE) would be
            // thrown as an uncaught exception. Teardown itself is left to the
            // 'close' handler above.
            proc.stdin.on('error', (err) => {
                if (this._proc === proc) this._settle_pending(err)
            })

            this._rl = readline.createInterface({ input: proc.stdout })
            this._rl.on('line', (line) => {
                if (this._proc !== proc) return
                if (line === 'ready') {
                    resolve()
                    return
                }
                this._handle_line(line)
            })
        })
    }

    /**
     * Send one piece of text to the server as a single request.
     *
     * Markdown preprocessing is opt-in here: it only runs when
     * `opts.preprocess === true` (speak_streaming() has the opposite default).
     *
     * @param {string} text
     * @param {object} [opts]  Generation options forwarded to the server;
     *     `preprocess` is consumed locally and not forwarded.
     * @returns {Promise<void>} Resolves when the server answers `ok` or when
     *     stop() is called; rejects if not initialized or if the server fails.
     */
    async speak(text, opts = {}) {
        const clean = opts.preprocess === true ? markdown_to_speech(text) : text
        return this._send(clean, opts)
    }

    /**
     * Split text into sentences and send them one at a time, waiting for each
     * to be acknowledged before sending the next.
     *
     * Markdown preprocessing is on by default here; pass `preprocess: false`
     * to skip it.
     *
     * @param {string} text
     * @param {object} [opts]  Same options as speak().
     * @returns {Promise<void>} Resolves after the last sentence, or early if
     *     stop() is called while streaming.
     */
    async speak_streaming(text, opts = {}) {
        const clean     = opts.preprocess !== false ? markdown_to_speech(text) : text
        const sentences = split_sentences(clean)
        const proc      = this._proc
        for (const s of sentences) {
            await this._send(s, opts)
            // stop() (or a restart) happened while this sentence was pending:
            // abandon the remaining sentences instead of failing on the next send.
            if (this._proc !== proc) return
        }
    }

    /**
     * Write one JSON request line and queue a waiter for its `ok`.
     *
     * @param {string} text
     * @param {object} [opts]
     * @returns {Promise<void>}
     */
    _send(text, opts = {}) {
        return new Promise((resolve, reject) => {
            if (!this._proc) {
                return reject(new Error('Chatterbox_Tts not initialized — call init() first'))
            }
            const { preprocess: _, ...gen_opts } = opts
            // Serialize before queueing so a stringify failure cannot leave a
            // waiter behind that would swallow a later request's 'ok'.
            const payload = JSON.stringify({ text, ...gen_opts }) + '\n'
            const waiter  = { resolve, reject }
            this._pending.push(waiter)
            try {
                this._proc.stdin.write(payload)
            } catch (err) {
                this._pending = this._pending.filter((w) => w !== waiter)
                reject(err)
            }
        })
    }

    /**
     * Handle one stdout line from the server after startup: an `ok` line
     * completes the oldest outstanding request; anything else is ignored.
     *
     * @param {string} line
     */
    _handle_line(line) {
        if (line !== 'ok') return
        this._pending.shift()?.resolve()
    }

    /**
     * Settle every outstanding request and empty the queue.
     *
     * @param {Error|null} err  Reject with this error, or resolve when null.
     */
    _settle_pending(err) {
        const waiting = this._pending
        this._pending = []
        for (const { resolve, reject } of waiting) {
            if (err) reject(err)
            else resolve()
        }
    }

    /**
     * Kill the server and detach from it. Outstanding speak() promises are
     * resolved (a deliberate stop is not treated as a failure). Safe to call
     * when not initialized.
     */
    stop() {
        const proc = this._proc
        const rl   = this._rl
        // Detach first so the upcoming 'close' event is recognised as expected.
        this._proc = null
        this._rl   = null
        rl?.close()
        proc?.kill()
        this._settle_pending(null)
    }
}