claude-voice-experiment/lib/stt.mjs

/**
 * Speech-to-Text using sherpa-onnx-node.
 *
 * Uses Whisper (offline/batch) as the recognizer, gated by Silero VAD so
 * transcription only runs when a complete utterance has been detected.
 * Audio is captured from the microphone via parec (PulseAudio) or arecord (ALSA).
 *
 * Model layout expected under models_dir:
 *   silero_vad.onnx
 *   sherpa-onnx-whisper-<name>/
 *     <name>-encoder.int8.onnx
 *     <name>-decoder.int8.onnx
 *     <name>-tokens.txt
 *
 * Available Whisper model names (pass as whisper_name):
 *   tiny.en  base.en  small.en  medium.en  large-v3  large-v3-turbo
 * Download with: bash download-models.sh [model-name]
 */

import { spawn, execSync } from 'node:child_process'
import { createRequire } from 'node:module'
import * as path from 'node:path'

// ES modules have no built-in require(); create one so Stt#init can load
// sherpa-onnx-node with it.
const require = createRequire(import.meta.url)
// Default model location: the `models` directory one level above this file's directory.
const DEFAULT_MODELS_DIR = path.join(import.meta.dirname, '..', 'models')

// Maximum number of samples of lead-in audio that Stt#_with_preroll prepends to a segment.
const PRE_ROLL_SAMPLES  = 3200    // 200ms at 16kHz
// Length of the Stt history ring buffer that the pre-roll samples are read back from.
const HISTORY_SAMPLES   = 960000  // 60s ring buffer — matches VAD internal size

/**
 * Pick the first available microphone capture command.
 *
 * Candidates are probed in order (parec first, then arecord) by running
 * `which <command>`; a non-zero exit makes execSync throw, which is treated
 * as "not installed". Both commands are configured for raw signed 16-bit
 * little-endian, 16 kHz, mono output.
 *
 * @returns {[string, string[]]} The command name and its argument list.
 * @throws {Error} When none of the candidate commands is found.
 */
function find_mic() {
	// True when `which` can resolve the command.
	const is_installed = (name) => {
		try {
			execSync(`which ${name}`, { stdio: 'ignore' })
		} catch {
			return false
		}
		return true
	}

	const recorders = [
		['parec',   ['--format=s16le', '--rate=16000', '--channels=1', '--latency-msec=50']],
		['arecord', ['-f', 'S16_LE', '-r', '16000', '-c', '1', '-t', 'raw', '-q']],
	]

	const match = recorders.find(([name]) => is_installed(name))
	if (match === undefined) {
		throw new Error('No mic capture command found — need parec (PulseAudio) or arecord (ALSA)')
	}

	const [name, flags] = match
	return [name, flags]
}

/**
 * Convert signed 16-bit little-endian PCM bytes to float samples.
 *
 * Each sample is divided by 32768, so the output lies in [-1, 1).
 * A trailing odd byte, if any, is not converted.
 *
 * @param {Buffer} buf Raw s16le PCM bytes.
 * @returns {Float32Array} One float per complete 16-bit sample.
 */
function s16le_to_f32(buf) {
	const sample_count = Math.floor(buf.length / 2)
	return Float32Array.from(
		{ length: sample_count },
		(_, idx) => buf.readInt16LE(idx * 2) / 32768.0,
	)
}

/**
 * Microphone speech-to-text pipeline.
 *
 * Raw mic audio is cut into fixed-size windows and fed to the Silero VAD;
 * every speech segment the VAD emits is passed to the offline Whisper
 * recognizer and the trimmed transcript is handed to the caller.
 *
 * Usage: construct, call init() to load the models, then listen().
 */
export class Stt {
	/**
	 * Stores configuration only; no models are loaded until init().
	 *
	 * @param {object}  [options]
	 * @param {string}  [options.models_dir]   Directory containing silero_vad.onnx and the
	 *                                         sherpa-onnx-whisper-<name>/ directory.
	 * @param {string}  [options.whisper_name] Whisper model name; selects the model directory
	 *                                         and the file-name prefix inside it.
	 * @param {string}  [options.provider]     Provider string passed to the recognizer's modelConfig.
	 * @param {?string} [options.debug_url]    When set, each segment's audio and metadata are
	 *                                         POSTed to this URL (see _post_debug).
	 */
	constructor({
		models_dir   = DEFAULT_MODELS_DIR,
		whisper_name = 'base.en',
		provider     = 'cuda',
		debug_url    = null,
	} = {}) {
		this._whisper_dir  = path.join(models_dir, `sherpa-onnx-whisper-${whisper_name}`)
		this._whisper_name = whisper_name
		this._vad_model    = path.join(models_dir, 'silero_vad.onnx')
		this._provider     = provider
		this._debug_url    = debug_url
		this._recognizer   = null
		this._vad          = null
		this._history      = new Float32Array(HISTORY_SAMPLES)
		this._history_pos  = 0  // total samples fed, monotonically increasing
		// One-shot guard for the "result keys" diagnostic printed on the first segment.
		this._result_keys_logged = false
	}

	/**
	 * Fire-and-forget POST of one segment to the debug endpoint.
	 *
	 * Body layout: 4-byte little-endian length of the JSON, the JSON-encoded
	 * `meta` (UTF-8), then the raw bytes backing `samples`.
	 * Rejections of the fetch promise are deliberately ignored so that a
	 * missing debug server never disturbs recognition.
	 *
	 * @param {object}       meta    JSON-serialisable metadata.
	 * @param {Float32Array} samples Audio that was sent to the recognizer.
	 */
	_post_debug(meta, samples) {
		const json_buf  = Buffer.from(JSON.stringify(meta), 'utf8')
		const len_buf   = Buffer.allocUnsafe(4)
		len_buf.writeUInt32LE(json_buf.byteLength, 0)
		// View over the existing sample memory — no copy until Buffer.concat below.
		const samp_buf  = Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength)
		const body      = Buffer.concat([len_buf, json_buf, samp_buf])
		fetch(this._debug_url, {
			method:  'POST',
			body,
			headers: { 'Content-Type': 'application/octet-stream' },
		}).catch(() => {})
	}

	/**
	 * Load sherpa-onnx-node and create the Whisper recognizer and the VAD.
	 *
	 * The audio handlers installed by listen() use both objects, so init()
	 * has to run before the first audio chunk is processed.
	 */
	init() {
		const sherpa = require('sherpa-onnx-node')
		const { _whisper_dir: wdir, _whisper_name: wname, _vad_model, _provider } = this

		this._recognizer = new sherpa.OfflineRecognizer({
			featConfig: { sampleRate: 16000, featureDim: 80 },
			modelConfig: {
				whisper: {
					encoder:               path.join(wdir, `${wname}-encoder.int8.onnx`),
					decoder:               path.join(wdir, `${wname}-decoder.int8.onnx`),
					enableTokenTimestamps: 1,
				},
				tokens:     path.join(wdir, `${wname}-tokens.txt`),
				numThreads: 2,
				provider:   _provider,
				debug:      false,
			},
		})

		// VAD runs on CPU — silero_vad.onnx is tiny
		this._vad = new sherpa.Vad({
			sileroVad: {
				model:              _vad_model,
				threshold:          0.5,
				minSilenceDuration: 0.5,   // seconds of silence to end an utterance
				minSpeechDuration:  0.1,   // ignore sub-100ms blips
				windowSize:         512,   // 32ms @ 16kHz
				maxSpeechDuration:  30.0,
			},
			sampleRate: 16000,
			numThreads: 1,
			provider:   'cpu',
			debug:      false,
		}, 60)  // 60-second internal ring buffer
	}

	/**
	 * Run the recognizer over one block of 16 kHz samples.
	 *
	 * @param {Float32Array} samples
	 * @returns {object} The recognizer's result object; listen() reads its
	 *                   `text`, `tokens`, `timestamps` and `durations` fields.
	 */
	_get_result(samples) {
		const stream = this._recognizer.createStream()
		// Best-effort: a failure to set the option is ignored.
		// NOTE(review): presumably not every sherpa-onnx-node build supports setOption — confirm.
		try { stream.setOption('enableTokenTimestamps', '1') } catch {}
		stream.acceptWaveform({ samples, sampleRate: 16000 })
		this._recognizer.decode(stream)
		return this._recognizer.getResult(stream)
	}

	/**
	 * Transcribe samples and return the untrimmed text.
	 *
	 * @param {Float32Array} samples
	 * @returns {string} The result's `text`, or '' when the result carries none
	 *                   (the same fallback listen() applies).
	 */
	_transcribe_raw(samples) {
		return this._get_result(samples).text ?? ''
	}

	/**
	 * Transcribe samples and return the text with surrounding whitespace removed.
	 *
	 * @param {Float32Array} samples
	 * @returns {string}
	 */
	_transcribe(samples) {
		return this._transcribe_raw(samples).trim()
	}

	/**
	 * Prepend up to PRE_ROLL_SAMPLES of earlier audio to a VAD segment.
	 *
	 * The lead-in is read from the history ring buffer at the positions just
	 * before `seg.start`. This assumes `seg.start` counts samples on the same
	 * scale as `_history_pos`; the `drift` value logged per segment exists to
	 * check that assumption.
	 *
	 * @param {{ start: number, samples: Float32Array }} seg
	 * @returns {Float32Array} `seg.samples` itself when there is no earlier
	 *                         audio, otherwise a new array (pre-roll + segment).
	 */
	_with_preroll(seg) {
		const pre_start = Math.max(0, seg.start - PRE_ROLL_SAMPLES)
		const pre_len   = seg.start - pre_start
		if (pre_len === 0) return seg.samples
		const out = new Float32Array(pre_len + seg.samples.length)
		for (let i = 0; i < pre_len; i++) {
			out[i] = this._history[(pre_start + i) % HISTORY_SAMPLES]
		}
		out.set(seg.samples, pre_len)
		return out
	}

	/**
	 * Record one window of samples in the history ring buffer and hand it to the VAD.
	 *
	 * @param {Float32Array} f32 One VAD window of float samples.
	 */
	_feed_window(f32) {
		// Write to history ring buffer for pre-roll
		const base = this._history_pos % HISTORY_SAMPLES
		for (let i = 0; i < f32.length; i++) {
			this._history[(base + i) % HISTORY_SAMPLES] = f32[i]
		}
		this._history_pos += f32.length
		this._vad.acceptWaveform(f32)
	}

	/**
	 * Transcribe one VAD segment, log diagnostics to stderr, optionally post
	 * it to the debug endpoint, and report non-empty text to the caller.
	 *
	 * @param {{ start: number, samples: Float32Array }} seg Segment taken from the VAD.
	 * @param {(text: string) => void} on_text Called with the trimmed transcript when it is non-empty.
	 */
	_handle_segment(seg, on_text) {
		// Diagnostic only: difference between the VAD's segment start and where
		// the segment would start if it ended exactly at the newest history sample.
		const drift    = seg.start - (this._history_pos - seg.samples.length)
		const with_pre = this._with_preroll(seg)
		process.stderr.write(`[stt] segment: start=${seg.start} history_pos=${this._history_pos} samples=${seg.samples.length} drift=${drift}\n`)
		const result = this._get_result(with_pre)
		if (!this._result_keys_logged) {
			this._result_keys_logged = true
			process.stderr.write(`[stt] result keys: ${JSON.stringify(Object.keys(result))}\n`)
			process.stderr.write(`[stt] result sample: ${JSON.stringify({ tokens: result.tokens?.slice(0,3), timestamps: result.timestamps?.slice(0,3), durations: result.durations?.slice(0,3) })}\n`)
		}
		const raw    = result.text ?? ''
		const text   = raw.trim()
		process.stderr.write(`[stt] whisper raw: ${JSON.stringify(raw)}\n`)
		if (this._debug_url) {
			// The try/catch covers synchronous failures (e.g. building the body);
			// asynchronous fetch failures are ignored inside _post_debug.
			try {
				this._post_debug({
					preroll_length: with_pre.length - seg.samples.length,
					transcript:     raw,
					tokens:         result.tokens     ?? [],
					timestamps:     result.timestamps ?? [],
					durations:      result.durations  ?? [],
					timestamp:      Date.now(),
					seg_start:      seg.start,
					history_pos:    this._history_pos,
					drift,
				}, with_pre)
			} catch (err) {
				process.stderr.write(`[stt] debug post error: ${err.message}\n`)
			}
		}
		if (text) {
			on_text(text)
		}
	}

	/**
	 * Start listening. Calls on_text(text) for each detected utterance.
	 * Returns a stop() function.
	 *
	 * Spawns the capture command chosen by find_mic() and processes its
	 * stdout. Recognition runs synchronously inside the stream's 'data'
	 * handler.
	 *
	 * @param {(text: string) => void} on_text Receives the trimmed transcript of each
	 *                                         utterance; empty transcripts are skipped.
	 * @param {object} [options]
	 * @param {(chunk: Buffer) => void} [options.on_audio] Called with every raw chunk read
	 *                                         from the mic process, before any processing.
	 * @returns {() => boolean} stop() — kills the mic process.
	 * @throws {Error} Propagated from find_mic() when no capture command is available.
	 */
	listen(on_text, { on_audio } = {}) {
		const [cmd, args] = find_mic()
		process.stderr.write(`[stt] mic: ${cmd} ${args.join(' ')}\n`)
		const mic = spawn(cmd, args, { stdio: ['ignore', 'pipe', 'inherit'] })

		const VAD_WIN   = 512          // must match windowSize in init()
		const WIN_BYTES = VAD_WIN * 2  // s16le: two bytes per sample
		// Bytes received from the mic that do not yet fill a whole window.
		let pending = Buffer.alloc(0)

		mic.stdout.on('data', (chunk) => {
			if (on_audio) on_audio(chunk)
			pending = Buffer.concat([pending, chunk])

			// Feed complete VAD windows
			while (pending.length >= WIN_BYTES) {
				const win = pending.subarray(0, WIN_BYTES)
				pending   = pending.subarray(WIN_BYTES)
				this._feed_window(s16le_to_f32(win))
			}

			// Drain any complete speech segments
			while (!this._vad.isEmpty()) {
				const seg = this._vad.front()
				this._vad.pop()
				this._handle_segment(seg, on_text)
			}
		})

		mic.on('error', err => process.stderr.write(`[stt] mic error: ${err.message}\n`))
		mic.on('close', code => {
			// A null code (process ended by a signal, e.g. via stop()) is not reported.
			if (code !== null && code !== 0) {
				process.stderr.write(`[stt] mic exited with code ${code}\n`)
			}
		})

		return () => mic.kill()
	}
}