STT (Silero VAD + Whisper via sherpa-onnx), Chatterbox TTS HTTP server, query completeness classifier (Ollama), multi-voice demo scripts, and planning docs. Kept as reference; clean rewrite planned in separate repos. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
164 lines
6.1 KiB
JavaScript
164 lines
6.1 KiB
JavaScript
/**
 * Demonstrates prosody and acting techniques with Kokoro TTS.
 *
 * Run:  node acting-demo.mjs [start-section]
 * E.g.: node acting-demo.mjs 5
 *
 * Kokoro's expressive range comes from four levers:
 *
 *   1. Voice choice — each voice has a baked-in emotional character
 *   2. Speed — slower = deliberate/grave, faster = excited/happy
 *   3. silenceScale — controls pause length between sentences
 *   4. Punctuation & markup — espeak-ng honours some SSML and typographic cues
 *
 * SSML tags that pass through espeak-ng:
 *   <break time="500ms"/> — explicit pause
 *   <emphasis level="strong">x</emphasis> — louder/longer (model-dependent)
 *   <prosody rate="slow">x</prosody> — local rate change
 *   <phoneme alphabet="ipa" ph="..."/> — force pronunciation
 *
 * NOTE: the comment on section 5 further down says SSML tags are not
 * supported by this backend, so treat the tag list above as unverified here.
 *
 * Typographic cues that affect prosody without SSML:
 *   UPPERCASE words — some stress
 *   Ellipsis ... — trailing pause / trailing off
 *   Em-dash — — abrupt break
 *   Comma placement — micro-pauses
 *   Exclamation ! — raised energy
 */
|
|
|
|
import { Tts } from './lib/tts.mjs'
|
|
import { spawn } from 'node:child_process'
|
|
|
|
// Shared synthesizer instance used by say() and say_concat().
const tts = new Tts()
// init() is defined in ./lib/tts.mjs and is not visible here. Awaiting it is a
// no-op if it is synchronous, and if it returns a promise this makes sure setup
// has finished (and any rejection surfaces) before the first utterance.
await tts.init()
|
|
|
|
// The helpers below bypass the class's internal _play and call the wrapped
// engine (tts._tts) directly; that is the simplest way to control the
// per-utterance GenerationConfig without reworking the class.
|
|
|
|
/**
 * Synthesize one utterance with the shared `tts` engine and play it.
 *
 * Prints the chosen settings and the text to stdout before synthesis, then
 * waits for play() to settle.
 *
 * @param {string} text - Text to speak.
 * @param {object} [options]
 * @param {string} [options.voice='af_heart'] - Voice name, looked up in the
 *   local VOICES table. Unknown names use speaker id 0 and trigger a warning
 *   on stderr.
 * @param {number} [options.speed=1.0] - Forwarded as GenerationConfig `speed`.
 * @param {number} [options.silence_scale=1.0] - Forwarded as GenerationConfig
 *   `silenceScale`.
 * @returns {Promise<void>}
 */
async function say(text, { voice = 'af_heart', speed = 1.0, silence_scale = 1.0 } = {}) {
  // sherpa-onnx-node is loaded with require(), so build one for this ES module.
  const { createRequire } = await import('node:module')
  const require = createRequire(import.meta.url)
  const sherpa = require('sherpa-onnx-node')

  // Voice name -> speaker id passed to the engine as `sid`.
  const VOICES = { af_heart: 0, af_bella: 1, af_nicole: 2, af_sarah: 3, af_sky: 4, am_adam: 5, am_michael: 6 }

  // Own-key check so an unknown name (or an inherited key such as "toString")
  // can never be used as a speaker id; say so instead of silently substituting.
  const is_known_voice = Object.hasOwn(VOICES, voice)
  if (!is_known_voice) {
    process.stderr.write(`warning: unknown voice "${voice}", using speaker id 0\n`)
  }
  const sid = is_known_voice ? VOICES[voice] : 0

  process.stdout.write(` voice=${voice} speed=${speed} silence=${silence_scale}\n "${text}"\n\n`)

  const audio = tts._tts.generate({
    text,
    generationConfig: new sherpa.GenerationConfig({ sid, speed, silenceScale: silence_scale }),
  })

  await play(audio.samples, audio.sampleRate)
}
|
|
|
|
/**
 * Synthesize several phrases, each with its own options, and play them back
 * to back as one continuous clip.
 *
 * Each phrase is logged to stdout (speed and text) after it is synthesized;
 * a blank line is written once playback has finished.
 *
 * @param {Array<[string, {voice?: string, speed?: number, silence_scale?: number}?]>} parts
 *   `[text, options]` pairs. Missing options default to voice 'af_heart',
 *   speed 1.0 and silence_scale 1.0. Unknown voice names use speaker id 0 and
 *   trigger a warning on stderr.
 * @returns {Promise<void>}
 */
async function say_concat(parts) {
  // sherpa-onnx-node is loaded with require(), so build one for this ES module.
  const { createRequire } = await import('node:module')
  const require = createRequire(import.meta.url)
  const sherpa = require('sherpa-onnx-node')

  // Voice name -> speaker id passed to the engine as `sid`.
  const VOICES = { af_heart: 0, af_bella: 1, af_nicole: 2, af_sarah: 3, af_sky: 4, am_adam: 5, am_michael: 6 }

  let total_len = 0
  const chunks = []

  for (const [text, opts = {}] of parts) {
    const { voice = 'af_heart', speed = 1.0, silence_scale = 1.0 } = opts

    // Own-key check so an unknown name can never be used as a speaker id;
    // report the substitution instead of hiding it.
    const is_known_voice = Object.hasOwn(VOICES, voice)
    if (!is_known_voice) {
      process.stderr.write(`warning: unknown voice "${voice}", using speaker id 0\n`)
    }
    const sid = is_known_voice ? VOICES[voice] : 0

    const audio = tts._tts.generate({
      text,
      generationConfig: new sherpa.GenerationConfig({ sid, speed, silenceScale: silence_scale }),
    })
    process.stdout.write(` [${speed}x] "${text}"\n`)

    chunks.push(audio.samples)
    total_len += audio.samples.length
  }

  // Copy every chunk into one contiguous buffer so playback has no gaps
  // between separately spawned players.
  const combined = new Float32Array(total_len)
  let offset = 0
  for (const chunk of chunks) {
    combined.set(chunk, offset)
    offset += chunk.length
  }

  await play(combined, tts._tts.sampleRate)
  process.stdout.write('\n')
}
|
|
|
|
/**
 * Pipe mono float32 samples to a `pacat` child process and wait for it to
 * finish.
 *
 * @param {Float32Array} samples - Audio samples to write to pacat's stdin.
 * @param {number} sample_rate - Value passed to pacat as `--rate`.
 * @returns {Promise<number|null>} Resolves with the first argument of the
 *   child's 'close' event. Rejects if the process cannot be spawned or if
 *   writing to its stdin fails.
 */
function play(samples, sample_rate) {
  return new Promise((resolve, reject) => {
    const proc = spawn('pacat', [
      '--format=float32le', `--rate=${sample_rate}`, '--channels=1',
    ], { stdio: ['pipe', 'ignore', 'inherit'] })

    proc.on('error', reject)
    proc.on('close', resolve)

    // Without a listener, a stdin failure (e.g. the player exiting early)
    // would be thrown as an uncaught 'error' event instead of rejecting.
    proc.stdin.on('error', reject)

    // Wrap only the bytes this view covers: `samples` may be a window into a
    // larger ArrayBuffer, and Buffer.from(samples.buffer) alone would send the
    // whole backing store.
    const pcm = Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength)
    proc.stdin.end(pcm)
  })
}
|
|
|
|
/**
 * Parse the optional start-section command-line argument.
 *
 * @param {string|undefined} arg - Raw argument text, if any.
 * @returns {number} The parsed integer, or 1 when the argument is absent or
 *   does not start with a number.
 */
function parse_start_section(arg) {
  const parsed = Number.parseInt(arg ?? '1', 10)
  // NaN would make every `START <= n` check false and silently skip the
  // whole demo, so fall back to the first section instead.
  return Number.isNaN(parsed) ? 1 : parsed
}

// First section to play; sections with a lower number are skipped.
const START = parse_start_section(process.argv[2])
|
|
|
|
/**
 * Print a section banner to stdout: a blank line, then the title framed by
 * two 50-character horizontal rules.
 *
 * @param {string} title - Heading text shown between the rules.
 */
function section(title) {
  const rule = '─'.repeat(50)
  process.stdout.write(`\n${rule}\n${title}\n${rule}\n`)
}
|
|
|
|
// Demo script. A section runs when START is at or below its number, so
// `node acting-demo.mjs 5` plays sections 5 and 6 only. Each banner is printed
// inside its guard so skipped sections leave no empty headings in the output.

// ─── 1. Baseline ───────────────────────────────────────
if (START <= 1) {
  section('1. Baseline — neutral af_heart')
  await say('The package has arrived. Please sign here.')
}

// ─── 2. Speed as emotion ───────────────────────────────
if (START <= 2) {
  section('2. Speed — slow (grave) vs fast (excited)')
  await say('I have some terrible news. The project has been cancelled.', { speed: 0.8 })
  await say("Oh my goodness, I can't believe you're actually here! This is amazing!", { speed: 1.25 })
}

// ─── 3. Voice character ────────────────────────────────
if (START <= 3) {
  section('3. Voice character — same line, different voices')
  const line = "Well, that's certainly one way to look at it."
  // NOTE(review): 'bm_george' has no entry in say()'s VOICES table, so say()
  // resolves it to speaker id 0. Add its id there or pick a listed voice.
  for (const voice of ['af_heart', 'af_sky', 'am_adam', 'bm_george']) {
    await say(line, { voice })
  }
}

// ─── 4. Punctuation-driven prosody ─────────────────────
if (START <= 4) {
  section('4. Punctuation and typographic cues')
  await say('I told you. I told you this would happen. But did anyone listen? No.')
  await say('It was... quiet. Too quiet.')
  await say('He said he was fine — but his eyes told a different story.')
  await say('This is ABSOLUTELY unacceptable.')
}

// ─── 5. Per-phrase speed — manual emphasis ─────────────
// SSML tags are not supported by this backend; emphasis is achieved by
// synthesizing key phrases at a different speed and concatenating the audio.
if (START <= 5) {
  section('5. Per-phrase speed (manual emphasis)')
  // "stop" spoken slower and then the rest faster — sounds like emphasis
  await say_concat([
    ['You need to ', { speed: 1.0 }],
    ['stop.', { speed: 0.7 }],
    ['Right now.', { speed: 1.1 }],
  ])
  await say_concat([
    ['Listen very carefully.', { speed: 0.75 }],
    ['I will only say this once.', { speed: 0.9 }],
  ])
}

// ─── 6. Combining levers ───────────────────────────────
if (START <= 6) {
  section('6. Combined — a tense scene')
  await say('Something is wrong.', { speed: 0.85, silence_scale: 1.5 })
  await say('I can feel it.', { speed: 0.8, silence_scale: 2.0 })
  // NOTE(review): the section 5 comment says SSML is unsupported by this
  // backend, so this <break> tag may not produce the intended pause — confirm.
  await say('<break time="600ms"/> Run!', { speed: 1.3, voice: 'af_sky' })
}

// Closing banner and tip are always printed, whatever START is.
section('Done')
process.stdout.write('Tip: adjust speed/silence_scale and swap voices to build a character.\n')
|