claude-voice-experiment/acting-demo.mjs

/**
 * Demonstrates prosody and acting techniques with Kokoro TTS.
 *
 * Run: node acting-demo.mjs [start-section]
 * E.g.: node acting-demo.mjs 5
 *
 * Kokoro's expressive range comes from four levers:
 *
 * 1. Voice choice — each voice has a baked-in emotional character
 * 2. Speed — slower = deliberate/grave, faster = excited/happy
 * 3. silenceScale — controls pause length between sentences
 * 4. Punctuation & markup — espeak-ng honours some SSML and typographic cues
 *
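 * SSML tags that espeak-ng itself understands (but see the section 5 comment
 * further down, which notes that SSML is not supported by this backend):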
 *   <break time="500ms"/>          — explicit pause
 *   <emphasis level="strong">x</emphasis>  — louder/longer (model-dependent)
 *   <prosody rate="slow">x</prosody>       — local rate change
 *   <phoneme alphabet="ipa" ph="..."/>     — force pronunciation
 *
 * Typographic cues that affect prosody without SSML:
 *   UPPERCASE words            — some stress
 *   Ellipsis ...               — trailing pause / trailing off
 *   Em-dash —                  — abrupt break
 *   Comma placement            — micro-pauses
 *   Exclamation !              — raised energy
 */

import { Tts } from './lib/tts.mjs'
import { spawn } from 'node:child_process'

// Shared TTS wrapper used by every helper below.
// init() is awaited so that, if it returns a promise, the engine is ready (and
// any initialisation failure surfaces here) before the first utterance is
// generated. Awaiting a non-promise return value is a harmless no-op.
// NOTE(review): Tts lives in ./lib/tts.mjs — confirm whether init() is async.
const tts = new Tts()
await tts.init()

// Reaching into the wrapper's internal `_tts` engine directly is the simplest
// way to control per-utterance GenerationConfig without reworking the class.

// Speaker ids for the voice names this demo knows about; shared by say() and
// say_concat() so the two cannot drift apart.
const VOICES = Object.freeze({
	af_heart: 0, af_bella: 1, af_nicole: 2, af_sarah: 3, af_sky: 4, am_adam: 5, am_michael: 6,
})

/**
 * Maps a voice name to its speaker id.
 *
 * Unknown names still fall back to speaker 0 (as before), but the fallback is
 * reported on stderr so a mistyped or unmapped voice does not silently play
 * as a different speaker.
 *
 * @param {string} voice - key of VOICES
 * @returns {number} speaker id passed to the engine as `sid`
 */
function speaker_id(voice) {
	if (Object.hasOwn(VOICES, voice)) return VOICES[voice]
	process.stderr.write(`  warning: unknown voice "${voice}", using speaker 0 instead\n`)
	return 0
}

/**
 * Generates audio for one piece of text with its own GenerationConfig.
 *
 * @param {string} text - text to synthesize
 * @param {object} [opts]
 * @param {string} [opts.voice='af_heart'] - voice name, resolved via speaker_id()
 * @param {number} [opts.speed=1.0] - passed through as `speed`
 * @param {number} [opts.silence_scale=1.0] - passed through as `silenceScale`
 * @returns {Promise<object>} the value returned by `tts._tts.generate`; callers
 *   in this file read its `samples` and `sampleRate` properties
 */
async function synthesize(text, { voice = 'af_heart', speed = 1.0, silence_scale = 1.0 } = {}) {
	// sherpa-onnx-node is loaded through require(), so build one for this ES module.
	const { createRequire } = await import('node:module')
	const require = createRequire(import.meta.url)
	const sherpa  = require('sherpa-onnx-node')

	const sid = speaker_id(voice)
	return tts._tts.generate({
		text,
		generationConfig: new sherpa.GenerationConfig({ sid, speed, silenceScale: silence_scale }),
	})
}

/**
 * Prints the settings and text of one utterance, synthesizes it, and plays it.
 * Resolves once playback has finished.
 *
 * @param {string} text - text to speak
 * @param {object} [opts]
 * @param {string} [opts.voice='af_heart']
 * @param {number} [opts.speed=1.0]
 * @param {number} [opts.silence_scale=1.0]
 * @returns {Promise<void>}
 */
async function say(text, { voice = 'af_heart', speed = 1.0, silence_scale = 1.0 } = {}) {
	process.stdout.write(`  voice=${voice} speed=${speed} silence=${silence_scale}\n  "${text}"\n\n`)

	const audio = await synthesize(text, { voice, speed, silence_scale })

	await play(audio.samples, audio.sampleRate)
}

/**
 * Synthesizes several [text, options] pairs and plays them back-to-back as one
 * continuous audio stream, logging each phrase with its speed as it is generated.
 *
 * @param {Array<[string, {voice?: string, speed?: number, silence_scale?: number}?]>} parts
 *   phrases in playback order; options default as in say()
 * @returns {Promise<void>}
 */
async function say_concat(parts) {
	const chunks = []

	for (const [text, opts = {}] of parts) {
		const { speed = 1.0 } = opts
		const audio = await synthesize(text, opts)
		process.stdout.write(`  [${speed}x] "${text}"\n`)
		chunks.push(audio.samples)
	}

	// Join the per-phrase sample arrays into a single buffer so the player
	// is started only once and there is no gap between phrases.
	const total_len = chunks.reduce((sum, chunk) => sum + chunk.length, 0)
	const combined  = new Float32Array(total_len)
	let offset = 0
	for (const chunk of chunks) {
		combined.set(chunk, offset)
		offset += chunk.length
	}

	await play(combined, tts._tts.sampleRate)
	process.stdout.write('\n')
}

/**
 * Plays mono float32 PCM samples by piping them to the `pacat` command.
 *
 * @param {Float32Array} samples - audio samples to play
 * @param {number} sample_rate - passed to pacat as `--rate`
 * @returns {Promise<*>} resolves with the arguments of the child's 'close'
 *   event once pacat has exited; rejects if the process cannot be spawned or
 *   its stdin pipe fails
 */
function play(samples, sample_rate) {
	return new Promise((resolve, reject) => {
		const proc = spawn('pacat', [
			'--format=float32le', `--rate=${sample_rate}`, '--channels=1',
		], { stdio: ['pipe', 'ignore', 'inherit'] })
		proc.on('error', reject)
		proc.on('close', resolve)
		// If pacat goes away before all data is written, the pipe emits
		// 'error' (e.g. EPIPE); without a listener that would be thrown as an
		// uncaught exception instead of rejecting this promise.
		proc.stdin.on('error', reject)
		// Honour the typed array's own window: `samples` may be a view into a
		// larger ArrayBuffer, in which case wrapping `samples.buffer` alone
		// would stream bytes that are not part of the audio.
		proc.stdin.end(Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength))
	})
}

/**
 * Parses the optional start-section command-line argument.
 *
 * @param {string|undefined} arg - raw argument, or undefined when omitted
 * @returns {number} the parsed base-10 integer, or 1 when the argument is
 *   missing or not a number (NaN would make every `START <= n` guard false
 *   and the demo would silently play nothing)
 */
function parse_start_section(arg) {
	const parsed = Number.parseInt(arg ?? '1', 10)
	return Number.isNaN(parsed) ? 1 : parsed
}

// Number of the first section to play; earlier sections are skipped.
const START = parse_start_section(process.argv[2])

/**
 * Writes a section banner to stdout: a blank line, then the title framed
 * above and below by a 50-character horizontal rule.
 *
 * @param {string} title - heading text shown between the two rules
 */
function section(title) {
	const rule   = '─'.repeat(50)
	const banner = ['', rule, `${title}`, rule, ''].join('\n')
	process.stdout.write(banner)
}

// Each section prints its banner inside its START guard, so sections skipped
// via the start-section argument produce no output at all (previously their
// banners were still printed with nothing underneath).

// ─── 1. Baseline ───────────────────────────────────────
if (START <= 1) {
	section('1. Baseline — neutral af_heart')
	await say('The package has arrived. Please sign here.')
}

// ─── 2. Speed as emotion ───────────────────────────────
if (START <= 2) {
	section('2. Speed — slow (grave) vs fast (excited)')
	await say('I have some terrible news. The project has been cancelled.', { speed: 0.8 })
	await say("Oh my goodness, I can't believe you're actually here! This is amazing!", { speed: 1.25 })
}

// ─── 3. Voice character ────────────────────────────────
if (START <= 3) {
	section('3. Voice character — same line, different voices')
	const line = "Well, that's certainly one way to look at it."
	// NOTE(review): 'bm_george' has no entry in the VOICES table used by say(),
	// so it is synthesized with speaker id 0 — add its id there to actually
	// hear a different voice.
	for (const voice of ['af_heart', 'af_sky', 'am_adam', 'bm_george']) {
		await say(line, { voice })
	}
}

// ─── 4. Punctuation-driven prosody ─────────────────────
if (START <= 4) {
	section('4. Punctuation and typographic cues')
	await say('I told you. I told you this would happen. But did anyone listen? No.')
	await say('It was... quiet. Too quiet.')
	await say('He said he was fine — but his eyes told a different story.')
	await say('This is ABSOLUTELY unacceptable.')
}

// ─── 5. Per-phrase speed — manual emphasis ─────────────
// SSML tags are not supported by this backend; emphasis is achieved by
// synthesizing key phrases at a different speed and concatenating the audio.
if (START <= 5) {
	section('5. Per-phrase speed (manual emphasis)')
	// "stop" spoken slower and then the rest faster — sounds like emphasis
	await say_concat([
		['You need to ',        { speed: 1.0 }],
		['stop.',               { speed: 0.7 }],
		['Right now.',          { speed: 1.1 }],
	])
	await say_concat([
		['Listen very carefully.', { speed: 0.75 }],
		['I will only say this once.', { speed: 0.9 }],
	])
}

// ─── 6. Combining levers ───────────────────────────────
if (START <= 6) {
	section('6. Combined — a tense scene')
	await say('Something is wrong.', { speed: 0.85, silence_scale: 1.5 })
	await say('I can feel it.', { speed: 0.8, silence_scale: 2.0 })
	// NOTE(review): this line still embeds an SSML <break> tag although the
	// section 5 comment says SSML is unsupported by this backend — confirm
	// whether the tag is ignored, or replace it with a silence_scale pause.
	await say('<break time="600ms"/> Run!', { speed: 1.3, voice: 'af_sky' })
}

section('Done')
process.stdout.write('Tip: adjust speed/silence_scale and swap voices to build a character.\n')