/**
 * Demonstrates prosody and acting techniques with Kokoro TTS.
 *
 * Run:   node acting-demo.mjs [start-section]
 * E.g.:  node acting-demo.mjs 5
 *
 * Kokoro's expressive range comes from four levers:
 *
 *   1. Voice choice — each voice has a baked-in emotional character
 *   2. Speed — slower = deliberate/grave, faster = excited/happy
 *   3. silenceScale — controls pause length between sentences
 *   4. Punctuation & markup — espeak-ng honours some SSML and typographic cues
 *
 * SSML elements that espeak-ng itself is said to understand.
 * NOTE(review): section 5 below records that SSML tags are NOT supported by
 * this backend, so treat this list as unverified for Kokoro via sherpa-onnx.
 * The exact tag spellings were lost from an earlier revision of this comment;
 * the element names below are a best-effort reconstruction — confirm.
 *   break element               — explicit pause
 *   emphasis element around x   — louder/longer (model-dependent)
 *   prosody rate element on x   — local rate change
 *   phoneme/substitution element — force pronunciation
 *
 * Typographic cues that affect prosody without SSML:
 *   UPPERCASE words  — some stress
 *   Ellipsis ...     — trailing pause / trailing off
 *   Em-dash —        — abrupt break
 *   Comma placement  — micro-pauses
 *   Exclamation !    — raised energy
 */

import { Tts } from './lib/tts.mjs'
import { spawn } from 'node:child_process'
import { createRequire } from 'node:module'

// sherpa-onnx-node is loaded through createRequire (not a static import),
// once for the whole script instead of once per utterance.
const require = createRequire(import.meta.url)
const sherpa = require('sherpa-onnx-node')

// Voice name -> speaker id handed to the model.
// NOTE(review): ids must match the speaker order of the loaded Kokoro model —
// confirm against the model's documentation before adding entries.
const DEFAULT_VOICE = 'af_heart'
const VOICES = Object.freeze({
  af_heart: 0,
  af_bella: 1,
  af_nicole: 2,
  af_sarah: 3,
  af_sky: 4,
  am_adam: 5,
  am_michael: 6,
})

const tts = new Tts()
// Awaiting is harmless if init() is synchronous and required if it is async;
// either way the engine is ready before the first generate() call.
await tts.init()

/**
 * Map a voice name to its speaker id.
 * Unknown names fall back to the default voice, but loudly: a silent fallback
 * makes the log claim one voice while another one is heard.
 *
 * @param {string} voice - key of VOICES
 * @returns {number} speaker id
 */
function resolve_sid(voice) {
  const sid = VOICES[voice]
  if (Number.isInteger(sid)) return sid
  process.stderr.write(` warning: unknown voice "${voice}", using ${DEFAULT_VOICE}\n`)
  return VOICES[DEFAULT_VOICE]
}

/**
 * Synthesize one utterance with its own GenerationConfig.
 * Calls the engine wrapped by Tts (tts._tts) directly, which is the simplest
 * way to control per-utterance settings without reworking the class.
 *
 * @param {string} text
 * @param {{voice?: string, speed?: number, silence_scale?: number}} [opts]
 * @returns {{samples: Float32Array, sampleRate: number}} generated audio
 */
function synthesize(text, { voice = DEFAULT_VOICE, speed = 1.0, silence_scale = 1.0 } = {}) {
  return tts._tts.generate({
    text,
    generationConfig: new sherpa.GenerationConfig({
      sid: resolve_sid(voice),
      speed,
      silenceScale: silence_scale,
    }),
  })
}

/**
 * Log, synthesize and play a single utterance.
 *
 * @param {string} text
 * @param {{voice?: string, speed?: number, silence_scale?: number}} [opts]
 * @returns {Promise<void>} settles when playback has finished
 */
async function say(text, { voice = 'af_heart', speed = 1.0, silence_scale = 1.0 } = {}) {
  process.stdout.write(` voice=${voice} speed=${speed} silence=${silence_scale}\n "${text}"\n\n`)
  const audio = synthesize(text, { voice, speed, silence_scale })
  await play(audio.samples, audio.sampleRate)
}

/**
 * Synthesize multiple phrase/option pairs and play them as one continuous
 * audio stream.
 *
 * @param {Array<[string, {voice?: string, speed?: number, silence_scale?: number}?]>} parts
 * @returns {Promise<void>} settles when playback has finished
 */
async function say_concat(parts) {
  const chunks = []
  let total_len = 0
  // Prefer the rate reported with the generated audio (as say() does); the
  // engine-level value is only the fallback for an empty `parts` list.
  let sample_rate = tts._tts.sampleRate

  for (const [text, opts = {}] of parts) {
    const { speed = 1.0 } = opts
    const audio = synthesize(text, opts)
    process.stdout.write(` [${speed}x] "${text}"\n`)
    chunks.push(audio.samples)
    total_len += audio.samples.length
    sample_rate = audio.sampleRate ?? sample_rate
  }

  const combined = new Float32Array(total_len)
  let offset = 0
  for (const chunk of chunks) {
    combined.set(chunk, offset)
    offset += chunk.length
  }

  await play(combined, sample_rate)
  process.stdout.write('\n')
}

/**
 * Pipe mono float32 samples to `pacat` and wait for it to exit.
 *
 * @param {Float32Array} samples
 * @param {number} sample_rate - passed to pacat as --rate
 * @returns {Promise<number|null>} pacat's exit code (null if ended by a signal)
 * @throws rejects if pacat cannot be spawned, its stdin breaks, or it exits
 *         with a non-zero code
 */
function play(samples, sample_rate) {
  return new Promise((resolve, reject) => {
    const proc = spawn('pacat', [
      '--format=float32le',
      `--rate=${sample_rate}`,
      '--channels=1',
    ], { stdio: ['pipe', 'ignore', 'inherit'] })

    proc.on('error', reject)
    // Without a listener, a broken pipe on stdin would be an uncaught exception.
    proc.stdin.on('error', reject)
    proc.on('close', (code) => {
      if (code === 0 || code === null) {
        resolve(code)
      } else {
        reject(new Error(`pacat exited with code ${code}`))
      }
    })

    // Respect the view's offset/length: `samples` may be a window into a
    // larger ArrayBuffer, and wrapping the whole buffer would play garbage.
    proc.stdin.end(Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength))
  })
}

// First section to play; earlier sections are skipped entirely.
const requested_start = Number.parseInt(process.argv[2] ?? '1', 10)
if (Number.isNaN(requested_start)) {
  process.stderr.write(`warning: invalid start section "${process.argv[2]}", starting at 1\n`)
}
const START = Number.isNaN(requested_start) ? 1 : requested_start

/**
 * Print a section banner.
 * @param {string} title
 */
function section(title) {
  process.stdout.write(`\n${'─'.repeat(50)}\n${title}\n${'─'.repeat(50)}\n`)
}

// ─── 1. Baseline ───────────────────────────────────────
if (START <= 1) {
  section('1. Baseline — neutral af_heart')
  await say('The package has arrived. Please sign here.')
}

// ─── 2. Speed as emotion ───────────────────────────────
if (START <= 2) {
  section('2. Speed — slow (grave) vs fast (excited)')
  await say('I have some terrible news. The project has been cancelled.', { speed: 0.8 })
  await say("Oh my goodness, I can't believe you're actually here! This is amazing!", { speed: 1.25 })
}

// ─── 3. Voice character ────────────────────────────────
if (START <= 3) {
  section('3. Voice character — same line, different voices')
  const line = "Well, that's certainly one way to look at it."
  // Only voices present in VOICES are listed; a name missing from the table
  // (previously 'bm_george') would just replay the default voice.
  for (const voice of ['af_heart', 'af_sky', 'am_adam', 'am_michael']) {
    await say(line, { voice })
  }
}

// ─── 4. Punctuation-driven prosody ─────────────────────
if (START <= 4) {
  section('4. Punctuation and typographic cues')
  await say('I told you. I told you this would happen. But did anyone listen? No.')
  await say('It was... quiet. Too quiet.')
  await say('He said he was fine — but his eyes told a different story.')
  await say('This is ABSOLUTELY unacceptable.')
}

// ─── 5. Per-phrase speed — manual emphasis ─────────────
// SSML tags are not supported by this backend; emphasis is achieved by
// synthesizing key phrases at a different speed and concatenating the audio.
if (START <= 5) {
  section('5. Per-phrase speed (manual emphasis)')
  // "stop" spoken slower and then the rest faster — sounds like emphasis
  await say_concat([
    ['You need to ', { speed: 1.0 }],
    ['stop.', { speed: 0.7 }],
    ['Right now.', { speed: 1.1 }],
  ])
  await say_concat([
    ['Listen very carefully.', { speed: 0.75 }],
    ['I will only say this once.', { speed: 0.9 }],
  ])
}

// ─── 6. Combining levers ───────────────────────────────
if (START <= 6) {
  section('6. Combined — a tense scene')
  await say('Something is wrong.', { speed: 0.85, silence_scale: 1.5 })
  await say('I can feel it.', { speed: 0.8, silence_scale: 2.0 })
  await say(' Run!', { speed: 1.3, voice: 'af_sky' })
}

section('Done')
process.stdout.write('Tip: adjust speed/silence_scale and swap voices to build a character.\n')