/**
* Demonstrates prosody and acting techniques with Kokoro TTS.
*
* Run: node acting-demo.mjs [start-section]
* E.g.: node acting-demo.mjs 5
*
* Kokoro's expressive range comes from four levers:
*
* 1. Voice choice — each voice has a baked-in emotional character
* 2. Speed — slower = deliberate/grave, faster = excited/happy
* 3. silenceScale — controls pause length between sentences
* 4. Punctuation & markup — espeak-ng honours some SSML and typographic cues
*
* SSML tags that pass through espeak-ng:
* <break time="500ms"/> — explicit pause
* <emphasis>x</emphasis> — louder/longer (model-dependent)
* <prosody rate="slow">x</prosody> — local rate change
* <phoneme ph="..."/> — force pronunciation
*
* Typographic cues that affect prosody without SSML:
* UPPERCASE words — some stress
* Ellipsis ... — trailing pause / trailing off
* Em-dash — — abrupt break
* Comma placement — micro-pauses
* Exclamation ! — raised energy
*/
import { Tts } from './lib/tts.mjs'
import { spawn } from 'node:child_process'
// Shared Tts instance; say() and say_concat() reach into it via `tts._tts`.
// NOTE(review): init() is called without `await`. If Tts.init() is asynchronous,
// synthesis could start before the engine is ready — confirm against ./lib/tts.mjs.
const tts = new Tts()
tts.init()
// Speaker ids for the voices this demo can select, keyed by voice name.
const VOICES = Object.freeze({
  af_heart: 0, af_bella: 1, af_nicole: 2, af_sarah: 3, af_sky: 4, am_adam: 5, am_michael: 6,
})

// Voice used when none is requested or the requested name is not in VOICES.
const DEFAULT_VOICE = 'af_heart'

// sherpa-onnx-node is loaded through createRequire(); keep the module after the
// first load so every utterance does not repeat the import/require dance.
let sherpa_module = null

/**
 * Loads `sherpa-onnx-node` on first use and returns the cached module afterwards.
 *
 * @returns {Promise<object>} The sherpa-onnx-node module object.
 */
async function load_sherpa() {
  if (sherpa_module === null) {
    const { createRequire } = await import('node:module')
    const require = createRequire(import.meta.url)
    sherpa_module = require('sherpa-onnx-node')
  }
  return sherpa_module
}

/**
 * Maps a voice name to its speaker id.
 *
 * Unknown names still fall back to the default voice (as before), but the
 * fallback is now reported on stderr and reflected in the returned name, so
 * the console output never claims a voice that was not actually used.
 *
 * @param {string} voice - Requested voice name.
 * @returns {{ name: string, sid: number }} The voice that will be used and its speaker id.
 */
function resolve_voice(voice) {
  // Object.hasOwn keeps inherited keys such as "toString" from being treated as voices.
  if (Object.hasOwn(VOICES, voice)) {
    return { name: voice, sid: VOICES[voice] }
  }
  process.stderr.write(` warning: unknown voice "${voice}" — using ${DEFAULT_VOICE}\n`)
  return { name: DEFAULT_VOICE, sid: VOICES[DEFAULT_VOICE] }
}

/**
 * Synthesizes one utterance by calling the underlying engine (`tts._tts`)
 * directly, which lets each utterance carry its own GenerationConfig.
 *
 * @param {string} text - Text to synthesize.
 * @param {{ sid: number, speed: number, silence_scale: number }} config - Per-utterance settings.
 * @returns {Promise<object>} The engine's result; callers read `samples` and `sampleRate` from it.
 */
async function synthesize(text, { sid, speed, silence_scale }) {
  const sherpa = await load_sherpa()
  return tts._tts.generate({
    text,
    generationConfig: new sherpa.GenerationConfig({ sid, speed, silenceScale: silence_scale }),
  })
}

/**
 * Speaks a single utterance and waits until playback has finished.
 *
 * @param {string} text - Text to speak.
 * @param {{ voice?: string, speed?: number, silence_scale?: number }} [options]
 *   `voice` is a key of VOICES, `speed` and `silence_scale` are forwarded to
 *   the engine's GenerationConfig.
 * @returns {Promise<void>}
 */
async function say(text, { voice = DEFAULT_VOICE, speed = 1.0, silence_scale = 1.0 } = {}) {
  const { name, sid } = resolve_voice(voice)
  process.stdout.write(` voice=${name} speed=${speed} silence=${silence_scale}\n "${text}"\n\n`)
  const audio = await synthesize(text, { sid, speed, silence_scale })
  await play(audio.samples, audio.sampleRate)
}

/**
 * Synthesizes several phrases, each with its own options, and plays them
 * back-to-back as one continuous buffer.
 *
 * @param {Array<[string, { voice?: string, speed?: number, silence_scale?: number }?]>} parts
 *   `[text, options]` pairs; `options` may be omitted.
 * @returns {Promise<void>}
 */
async function say_concat(parts) {
  const chunks = []
  let total_len = 0
  for (const [text, opts] of parts) {
    const { voice = DEFAULT_VOICE, speed = 1.0, silence_scale = 1.0 } = opts ?? {}
    const { sid } = resolve_voice(voice)
    const audio = await synthesize(text, { sid, speed, silence_scale })
    process.stdout.write(` [${speed}x] "${text}"\n`)
    chunks.push(audio.samples)
    total_len += audio.samples.length
  }

  // Stitch the per-phrase buffers into one so playback has no gaps between processes.
  const combined = new Float32Array(total_len)
  let offset = 0
  for (const chunk of chunks) {
    combined.set(chunk, offset)
    offset += chunk.length
  }

  // Nothing to play for an empty list; skip launching the player.
  if (total_len > 0) {
    await play(combined, tts._tts.sampleRate)
  }
  process.stdout.write('\n')
}
/**
 * Pipes mono float32 samples into `pacat` and waits for the player to exit.
 *
 * @param {Float32Array} samples - Audio samples; only the bytes covered by this
 *   view are sent, even when it is a window onto a larger ArrayBuffer.
 * @param {number} sample_rate - Sample rate, forwarded to the player as `--rate`.
 * @param {{ spawn_fn?: Function }} [options] - `spawn_fn` replaces the process
 *   launcher (defaults to child_process.spawn); intended for tests.
 * @returns {Promise<number>} Resolves with exit code 0 once the player has
 *   closed. Rejects if the process cannot be spawned, the stdin pipe errors,
 *   or the player ends with a non-zero code or a signal.
 */
function play(samples, sample_rate, { spawn_fn = spawn } = {}) {
  return new Promise((resolve, reject) => {
    const proc = spawn_fn('pacat', [
      '--format=float32le', `--rate=${sample_rate}`, '--channels=1',
    ], { stdio: ['pipe', 'ignore', 'inherit'] })

    proc.on('error', reject)
    proc.on('close', (code, signal) => {
      if (code === 0) {
        resolve(code)
        return
      }
      const reason = signal ? `signal ${signal}` : `exit code ${code}`
      reject(new Error(`pacat terminated with ${reason}`))
    })

    // If the player dies early the pipe breaks; without a listener that
    // surfaces as an unhandled 'error' event instead of a rejection.
    proc.stdin.on('error', reject)

    // Wrap exactly the view's window: `samples.buffer` on its own would send
    // the whole backing ArrayBuffer when `samples` is a subarray.
    proc.stdin.end(Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength))
  })
}
/**
 * Parses the optional start-section command-line argument.
 *
 * @param {string | undefined} arg - Raw argument; `undefined` means "start at 1".
 * @returns {number} The section number to start from.
 * @throws {TypeError} When the argument does not begin with a number. Without
 *   this check the result would be NaN, every `START <= n` comparison would be
 *   false, and the demo would silently play nothing.
 */
function parse_start_section(arg) {
  const value = Number.parseInt(arg ?? '1', 10)
  if (Number.isNaN(value)) {
    throw new TypeError(
      `start-section must be a number, got "${arg}" (usage: node acting-demo.mjs [start-section])`,
    )
  }
  return value
}

// First section to play; sections numbered below this are skipped.
const START = parse_start_section(process.argv[2])
/**
 * Writes a section banner to stdout: a blank line, then the title framed by
 * two 50-character horizontal rules.
 *
 * @param {string} title - Heading text shown between the rules.
 */
function section(title) {
  const rule = '─'.repeat(50)
  const banner = ['', rule, `${title}`, rule, ''].join('\n')
  process.stdout.write(banner)
}
// Each section prints its header only when it actually runs, so starting at
// section N no longer emits empty headers for the sections that were skipped.

// ─── 1. Baseline ───────────────────────────────────────
if (START <= 1) {
  section('1. Baseline — neutral af_heart')
  await say('The package has arrived. Please sign here.')
}

// ─── 2. Speed as emotion ───────────────────────────────
if (START <= 2) {
  section('2. Speed — slow (grave) vs fast (excited)')
  await say('I have some terrible news. The project has been cancelled.', { speed: 0.8 })
  await say("Oh my goodness, I can't believe you're actually here! This is amazing!", { speed: 1.25 })
}

// ─── 3. Voice character ────────────────────────────────
if (START <= 3) {
  section('3. Voice character — same line, different voices')
  const line = "Well, that's certainly one way to look at it."
  // Every name here must exist in say()'s voice table; a name that is missing
  // there resolves to speaker 0 (af_heart) and would just repeat the first voice.
  for (const voice of ['af_heart', 'af_sky', 'am_adam', 'am_michael']) {
    await say(line, { voice })
  }
}

// ─── 4. Punctuation-driven prosody ─────────────────────
if (START <= 4) {
  section('4. Punctuation and typographic cues')
  await say('I told you. I told you this would happen. But did anyone listen? No.')
  await say('It was... quiet. Too quiet.')
  await say('He said he was fine — but his eyes told a different story.')
  await say('This is ABSOLUTELY unacceptable.')
}

// ─── 5. Per-phrase speed — manual emphasis ─────────────
// SSML tags are not supported by this backend; emphasis is achieved by
// synthesizing key phrases at a different speed and concatenating the audio.
if (START <= 5) {
  section('5. Per-phrase speed (manual emphasis)')
  // "stop" spoken slower and then the rest faster — sounds like emphasis
  await say_concat([
    ['You need to ', { speed: 1.0 }],
    ['stop.', { speed: 0.7 }],
    ['Right now.', { speed: 1.1 }],
  ])
  await say_concat([
    ['Listen very carefully.', { speed: 0.75 }],
    ['I will only say this once.', { speed: 0.9 }],
  ])
}

// ─── 6. Combining levers ───────────────────────────────
if (START <= 6) {
  section('6. Combined — a tense scene')
  await say('Something is wrong.', { speed: 0.85, silence_scale: 1.5 })
  await say('I can feel it.', { speed: 0.8, silence_scale: 2.0 })
  await say(' Run!', { speed: 1.3, voice: 'af_sky' })
}

section('Done')
process.stdout.write('Tip: adjust speed/silence_scale and swap voices to build a character.\n')