STT (Silero VAD + Whisper via sherpa-onnx), Chatterbox TTS HTTP server, query completeness classifier (Ollama), multi-voice demo scripts, and planning docs. Kept as reference; clean rewrite planned in separate repos. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
113 lines
3.6 KiB
JavaScript
113 lines
3.6 KiB
JavaScript
/**
 * Bark TTS — Node.js wrapper around bark-server.py.
 *
 * Spawns the Python server once, keeps it alive, and sends requests as JSON lines.
 * Markdown is preprocessed before sending: **bold** → UPPERCASE, etc.
 *
 * Usage:
 *   const tts = new Bark_Tts()
 *   await tts.init()                            // spawns server, waits for model load
 *   await tts.speak('Hello')
 *   await tts.speak('**Very** important point.') // emphasis via CAPS
 *   tts.stop()
 *
 * Environment variables:
 *   BARK_MODEL   HuggingFace model id  (default: suno/bark)
 *   BARK_VOICE   voice preset          (default: v2/en_speaker_6)
 *
 * Bark voice presets (English):
 *   v2/en_speaker_0   calm female
 *   v2/en_speaker_1   calm male
 *   v2/en_speaker_3   deep male
 *   v2/en_speaker_6   neutral/warm (default)
 *   v2/en_speaker_9   expressive
 */
|
|
|
|
import { spawn } from 'node:child_process'
|
|
import * as path from 'node:path'
|
|
import * as readline from 'node:readline'
|
|
import { markdown_to_bark, split_sentences } from './markdown.mjs'
|
|
|
|
const BARK_MODEL = process.env.BARK_MODEL || 'suno/bark'
|
|
const BARK_VOICE = process.env.BARK_VOICE || 'v2/en_speaker_6'
|
|
const SERVER = path.join(import.meta.dirname, '..', 'bark-server.py')
|
|
|
|
/**
 * Client for a long-lived bark-server.py child process.
 *
 * Each request is written to the child's stdin as one JSON line
 * (`{ text, voice }`). The child is expected to print `ready` once after
 * start-up and `ok` after every request it has finished. Replies are matched
 * to requests in FIFO order, so overlapping calls each settle on their own
 * `ok` instead of overwriting one another.
 *
 * Promises returned by init(), speak() and speak_streaming() reject when the
 * server exits, when its stdin fails, or when stop() is called — they never
 * stay pending forever.
 */
export class Bark_Tts {
  /**
   * @param {object} [options]
   * @param {string} [options.model] model id, passed to the server as its first argument
   * @param {string} [options.voice] default voice preset, passed as the second argument
   */
  constructor({
    model = BARK_MODEL,
    voice = BARK_VOICE,
  } = {}) {
    this._model = model
    this._voice = voice
    this._proc = null
    this._rl = null
    this._ready = null // { resolve, reject } of init() until it settles
    this._pending = [] // FIFO of { resolve, reject } for requests awaiting "ok"
  }

  /**
   * Spawn bark-server.py and wait until it signals "ready".
   *
   * Calling init() while a server is still attached stops that server first,
   * so a repeated init() does not leak a child process.
   *
   * @returns {Promise<void>} resolves on the first `ready` line; rejects if
   *   the process cannot be spawned, exits first, or stop() is called.
   */
  init() {
    return new Promise((resolve, reject) => {
      if (this._proc) this.stop()

      const proc = spawn(SERVER, [this._model, this._voice], {
        stdio: ['pipe', 'pipe', 'inherit'],
      })
      const rl = readline.createInterface({ input: proc.stdout })
      this._proc = proc
      this._rl = rl
      this._ready = { resolve, reject }

      // Events from a server that stop() or a later init() has already
      // detached must not disturb the current state.
      const is_current = () => this._proc === proc

      proc.on('error', (err) => {
        if (is_current()) this._reject_all(err)
      })
      // Without a listener, an EPIPE on stdin (server died mid-write) would
      // surface as an uncaught 'error' event and take down the whole process.
      proc.stdin.on('error', (err) => {
        if (is_current()) this._reject_all(err)
      })
      proc.on('close', (code) => {
        if (code !== 0 && code !== null) {
          process.stderr.write(`[bark] server exited with code ${code}\n`)
        }
        if (is_current()) this._on_exit(code)
      })
      rl.on('line', (line) => {
        if (is_current()) this._on_line(line)
      })
    })
  }

  /**
   * Preprocess markdown and speak as a single request.
   *
   * @param {string} text text to speak
   * @param {object} [options]
   * @param {string} [options.voice] voice preset for this request (default: instance voice)
   * @param {boolean} [options.preprocess=true] run markdown_to_bark() on the text first
   * @returns {Promise<void>} resolves when the server answers `ok`
   */
  async speak(text, { voice = this._voice, preprocess = true } = {}) {
    const clean = preprocess ? markdown_to_bark(text) : text
    return this._send(clean, voice)
  }

  /**
   * Preprocess markdown and speak sentence by sentence.
   * Lower latency — first sentence starts playing while rest are queued.
   *
   * @param {string} text text to speak
   * @param {object} [opts]
   * @param {string} [opts.voice] voice preset (default: instance voice)
   * @param {boolean} [opts.preprocess] pass `false` to skip markdown_to_bark()
   * @returns {Promise<void>} resolves after the last sentence is acknowledged
   */
  async speak_streaming(text, opts = {}) {
    const clean = opts.preprocess !== false ? markdown_to_bark(text) : text
    const sentences = split_sentences(clean)
    for (const s of sentences) {
      // Sequential on purpose: each sentence waits for the previous "ok".
      await this._send(s, opts.voice ?? this._voice)
    }
  }

  /**
   * Write one request line to the server and wait for its `ok`.
   *
   * @param {string} text
   * @param {string} voice
   * @returns {Promise<void>} rejects if no server is attached, or if the
   *   server goes away before replying.
   */
  _send(text, voice) {
    return new Promise((resolve, reject) => {
      if (!this._proc) {
        return reject(new Error('Bark_Tts not initialized — call init() first'))
      }
      // Serialise before queueing so a failure here cannot leave a stale
      // entry that would shift every later reply by one.
      const payload = JSON.stringify({ text, voice }) + '\n'
      this._pending.push({ resolve, reject })
      this._proc.stdin.write(payload)
    })
  }

  /** Handle one line of server output: `ready` settles init(), `ok` settles the oldest request. */
  _on_line(line) {
    if (line === 'ready') {
      const ready = this._ready
      this._ready = null
      ready?.resolve()
      return
    }
    if (line === 'ok') {
      this._pending.shift()?.resolve()
    }
  }

  /** Detach from a server that has exited and fail everything still waiting on it. */
  _on_exit(code) {
    this._rl?.close()
    this._proc = null
    this._rl = null
    this._reject_all(new Error(`bark server exited (code ${code}) before replying`))
  }

  /** Reject a still-pending init() and every in-flight request with `err`. */
  _reject_all(err) {
    const ready = this._ready
    const pending = this._pending
    // Clear state first so a rejection handler that re-enters sees a clean slate.
    this._ready = null
    this._pending = []
    ready?.reject(err)
    for (const request of pending) request.reject(err)
  }

  /**
   * Kill the server and release the line reader.
   * A pending init() and any in-flight requests are rejected.
   */
  stop() {
    this._rl?.close()
    this._proc?.kill()
    this._proc = null
    this._rl = null
    this._reject_all(new Error('Bark_Tts stopped'))
  }
}
|