// claude-voice-experiment/lib/bark-tts.mjs

/**
 * Bark TTS — Node.js wrapper around bark-server.py.
 *
 * Spawns the Python server once, keeps it alive, sends requests as JSON lines.
 * Markdown is preprocessed before sending: **bold** → UPPERCASE, etc.
 *
 * Usage:
 *   const tts = new Bark_Tts()
 *   await tts.init()          // spawns server, waits for model load
 *   await tts.speak('Hello')
 *   await tts.speak('**Very** important point.')   // emphasis via CAPS
 *   tts.stop()
 *
 * Environment variables:
 *   BARK_MODEL   HuggingFace model id (default: suno/bark)
 *   BARK_VOICE   voice preset        (default: v2/en_speaker_6)
 *
 * Bark voice presets (English):
 *   v2/en_speaker_0  calm female
 *   v2/en_speaker_1  calm male
 *   v2/en_speaker_3  deep male
 *   v2/en_speaker_6  neutral/warm (default)
 *   v2/en_speaker_9  expressive
 */

import { spawn }        from 'node:child_process'
import * as path        from 'node:path'
import * as readline    from 'node:readline'
import { markdown_to_bark, split_sentences } from './markdown.mjs'

const BARK_MODEL = process.env.BARK_MODEL || 'suno/bark'
const BARK_VOICE = process.env.BARK_VOICE || 'v2/en_speaker_6'
const SERVER     = path.join(import.meta.dirname, '..', 'bark-server.py')

/**
 * Client for a long-lived bark-server.py child process.
 *
 * Requests are written to the server's stdin as one JSON object per line
 * (`{ text, voice }`); the server acknowledges each with a line reading "ok".
 * Several requests may be outstanding at once: they are tracked in a FIFO and
 * each "ok" settles the oldest one.
 * NOTE(review): this assumes bark-server.py answers requests in the order it
 * received them — confirm against the server implementation.
 */
export class Bark_Tts {
    /**
     * @param {object} [options]
     * @param {string} [options.model]  model id, passed to the server as its first argument
     * @param {string} [options.voice]  default voice preset, passed to the server as its
     *                                  second argument and used when a call names no voice
     */
    constructor({
        model = BARK_MODEL,
        voice = BARK_VOICE,
    } = {}) {
        this._model   = model
        this._voice   = voice
        this._proc    = null
        this._rl      = null
        this._pending = []    // FIFO of { resolve, reject }, one per request awaiting "ok"
    }

    /**
     * Spawn bark-server.py and wait until it signals "ready".
     *
     * A server left over from an earlier init() is stopped first.
     *
     * @returns {Promise<void>} resolves on the "ready" line; rejects if the
     *   process cannot be spawned or goes away before becoming ready.
     */
    init() {
        return new Promise((resolve, reject) => {
            // Replace, rather than leak, a server that is already running.
            this.stop()

            const proc = spawn(SERVER, [this._model, this._voice], {
                stdio: ['pipe', 'pipe', 'inherit'],
            })
            const rl = readline.createInterface({ input: proc.stdout })
            this._proc = proc
            this._rl   = rl

            const fail = (err) => {
                reject(err)              // no-op once this init() has settled
                this._fail(proc, err)
            }

            proc.on('error', fail)
            // Without a listener, an EPIPE from writing to a dead server would
            // surface as an uncaught exception.
            proc.stdin.on('error', fail)
            proc.on('close', (code, signal) => {
                if (code !== 0 && code !== null) {
                    process.stderr.write(`[bark] server exited with code ${code}\n`)
                }
                fail(new Error(`bark server exited (code ${code}, signal ${signal})`))
            })

            rl.on('line', (line) => {
                // Ignore output from a server that has been stopped or replaced.
                if (this._proc !== proc) return
                if (line === 'ready') {
                    resolve()
                    return
                }
                this._on_line(line)
            })
        })
    }

    /**
     * Preprocess markdown and speak as a single request.
     *
     * @param {string} text
     * @param {object} [options]
     * @param {string}  [options.voice]       voice preset (defaults to the instance voice)
     * @param {boolean} [options.preprocess]  run markdown_to_bark first (default true)
     * @returns {Promise<void>} resolves when the server acknowledges the request
     */
    async speak(text, { voice = this._voice, preprocess = true } = {}) {
        const clean = preprocess ? markdown_to_bark(text) : text
        return this._send(clean, voice)
    }

    /**
     * Preprocess markdown and speak sentence by sentence.
     *
     * Each sentence is its own request, sent once the previous one has been
     * acknowledged, so the first sentence reaches the server without waiting
     * on the rest of the text. Returns quietly if the instance is stopped
     * (or re-initialised) part-way through.
     *
     * @param {string} text
     * @param {object} [opts]
     * @param {string}  [opts.voice]       voice preset (defaults to the instance voice)
     * @param {boolean} [opts.preprocess]  pass false to skip markdown_to_bark
     * @returns {Promise<void>}
     */
    async speak_streaming(text, opts = {}) {
        const clean = opts.preprocess !== false ? markdown_to_bark(text) : text
        const voice = opts.voice ?? this._voice
        const proc  = this._proc
        for (const s of split_sentences(clean)) {
            // stop() during the previous sentence: drop the remainder.
            if (this._proc !== proc) return
            await this._send(s, voice)
        }
    }

    /**
     * Write one request line to the server and queue its resolver.
     *
     * @returns {Promise<void>} resolves on the matching "ok"; rejects if the
     *   instance is not initialized or the server fails before replying.
     */
    _send(text, voice) {
        return new Promise((resolve, reject) => {
            const proc = this._proc
            if (!proc) {
                return reject(new Error('Bark_Tts not initialized — call init() first'))
            }
            this._pending.push({ resolve, reject })
            proc.stdin.write(`${JSON.stringify({ text, voice })}\n`)
        })
    }

    /** Handle one line of server output: "ok" settles the oldest pending request. */
    _on_line(line) {
        if (line === 'ok') {
            this._pending.shift()?.resolve()
        }
    }

    /**
     * React to the server failing or exiting: drop it and reject every pending
     * request with `err`. Does nothing when `proc` is no longer the current
     * server, which is the case after stop() or a later init().
     */
    _fail(proc, err) {
        if (this._proc !== proc) return
        const rl = this._rl
        this._proc = null
        this._rl   = null
        rl?.close()
        for (const { reject } of this._pending.splice(0)) reject(err)
        proc.kill()    // harmless if the process has already exited
    }

    /**
     * Kill the server and release the line reader.
     *
     * Requests still awaiting "ok" are resolved (not rejected): stopping is a
     * deliberate cancellation, so awaiting callers simply continue.
     */
    stop() {
        const proc = this._proc
        const rl   = this._rl
        // Clear state first so the server's own close/error events count as stale.
        this._proc = null
        this._rl   = null
        rl?.close()
        for (const { resolve } of this._pending.splice(0)) resolve()
        proc?.kill()
    }
}