claude-voice-experiment/lib/fw-stt.mjs

/**
 * faster-whisper STT backend — drop-in replacement for stt.mjs.
 *
 * Uses sherpa-onnx Silero VAD for segment detection (same as stt.mjs),
 * but transcribes via a faster-whisper Python subprocess which returns
 * word-level timestamps.
 */

import { spawn, execSync } from 'node:child_process'
import { createRequire }   from 'node:module'
import * as path           from 'node:path'
import { existsSync }      from 'node:fs'

// CommonJS-style `require` for use inside this ES module; init() uses it to
// load 'sherpa-onnx-node'.
const require = createRequire(import.meta.url)

// Default locations, all resolved relative to the parent of this file's directory.
const DEFAULT_MODELS_DIR = path.join(import.meta.dirname, '..', 'models')
const DEFAULT_SERVER     = path.join(import.meta.dirname, '..', 'faster-whisper-server.py')
// Preferred interpreter; init() falls back to plain `python3` when this file does not exist.
const VENV_PYTHON        = path.join(import.meta.dirname, '..', 'venv', 'bin', 'python3')

// Maximum number of samples of earlier audio prepended to a detected segment
// (3200 samples = 0.2 s at the 16 kHz rate used for capture and VAD).
const PRE_ROLL_SAMPLES = 3200
// Length of the ring buffer of recent mic samples (960000 samples = 60 s at 16 kHz).
const HISTORY_SAMPLES  = 960000

/**
 * Pick the microphone capture command to use.
 *
 * Probes for `parec` first and `arecord` second by running `which <cmd>`;
 * the first one that is found wins.
 *
 * @returns {[string, string[]]} The command name and the argument list that
 *   makes it emit raw 16 kHz mono signed 16-bit little-endian PCM.
 * @throws {Error} When neither command is found.
 */
function find_mic() {
	// `which` exits non-zero when the program is missing, which makes execSync throw.
	const is_available = (program) => {
		try {
			execSync(`which ${program}`, { stdio: 'ignore' })
			return true
		} catch {
			return false
		}
	}

	if (is_available('parec')) {
		return ['parec', ['--format=s16le', '--rate=16000', '--channels=1', '--latency-msec=50']]
	}
	if (is_available('arecord')) {
		return ['arecord', ['-f', 'S16_LE', '-r', '16000', '-c', '1', '-t', 'raw', '-q']]
	}
	throw new Error('No mic capture command found — need parec or arecord')
}

/**
 * Convert signed 16-bit little-endian PCM into 32-bit float samples.
 *
 * Each sample is divided by 32768, so the output lies in [-1, 1).
 *
 * @param {Buffer} buf Raw PCM bytes, two bytes per sample.
 * @returns {Float32Array} One float per complete 16-bit sample in `buf`.
 */
function s16le_to_f32(buf) {
	const sample_count = buf.length / 2
	return new Float32Array(sample_count).map(
		(_unused, idx) => buf.readInt16LE(idx * 2) / 32768.0,
	)
}

/**
 * Speech-to-text pipeline: microphone capture → Silero VAD segmentation →
 * transcription by a faster-whisper Python subprocess.
 *
 * The subprocess is driven over stdio with a line protocol: one JSON request
 * per line on its stdin, one JSON response per line on its stdout, matched to
 * requests in FIFO order. The subprocess prints the bare line `ready` once it
 * can accept requests.
 *
 * Typical use: `const stt = new Stt(); stt.init(); const stop = stt.listen(cb)`.
 */
export class Stt {
	/**
	 * @param {object}  [options]
	 * @param {string}  [options.models_dir]   Directory containing `silero_vad.onnx`.
	 * @param {string}  [options.whisper_name] Model name passed to the server as `--model`.
	 * @param {string}  [options.server_path]  Path of the Python server script.
	 * @param {string}  [options.device]       Value passed to the server as `--device`.
	 * @param {?string} [options.debug_url]    When set, every transcribed segment is
	 *   also POSTed to this URL (see `_post_debug`).
	 */
	constructor({
		models_dir   = DEFAULT_MODELS_DIR,
		whisper_name = 'base.en',
		server_path  = DEFAULT_SERVER,
		device       = 'cuda',
		debug_url    = null,
	} = {}) {
		this._models_dir   = models_dir
		this._whisper_name = whisper_name
		this._server_path  = server_path
		this._device       = device
		this._debug_url    = debug_url
		this._vad          = null
		this._server       = null
		// Set once the server process has errored or exited.
		this._server_down  = false
		// Ring buffer of the most recent mic samples; _history_pos is the total
		// number of samples written so far (not wrapped).
		this._history      = new Float32Array(HISTORY_SAMPLES)
		this._history_pos  = 0

		// Response queue: each entry is a { resolve, reject } pair waiting for the next JSON line
		this._response_queue = []
		this._line_buf       = ''

		// Ready promise — resolved when server prints "ready" (or when the
		// server dies, so that waiters can notice and fail instead of hanging)
		let ready_resolve
		this._ready = new Promise(r => { ready_resolve = r })
		this._ready_resolve = ready_resolve
	}

	/**
	 * Create the VAD and spawn the faster-whisper server subprocess.
	 *
	 * Must be called before `listen()`. Returns immediately; transcription
	 * requests wait internally until the server reports `ready`.
	 */
	init() {
		const sherpa    = require('sherpa-onnx-node')
		const vad_model = path.join(this._models_dir, 'silero_vad.onnx')

		// NOTE(review): the trailing `60` is presumably the VAD buffer size in
		// seconds — confirm against the sherpa-onnx-node API.
		this._vad = new sherpa.Vad({
			sileroVad: {
				model:              vad_model,
				threshold:          0.5,
				minSilenceDuration: 0.5,
				minSpeechDuration:  0.1,
				windowSize:         512,
				maxSpeechDuration:  30.0,
			},
			sampleRate: 16000,
			numThreads: 1,
			provider:   'cpu',
			debug:      false,
		}, 60)

		const python = existsSync(VENV_PYTHON) ? VENV_PYTHON : 'python3'
		process.stderr.write(`[fw-stt] python: ${python}\n`)
		process.stderr.write(`[fw-stt] model: ${this._whisper_name}\n`)

		this._server = spawn(python, [
			this._server_path,
			'--model', this._whisper_name,
			'--device', this._device,
		], {
			stdio: ['pipe', 'pipe', 'inherit'],
			env: { ...process.env },
		})

		// Decode at the stream level so a multi-byte UTF-8 character split
		// across two chunks is reassembled instead of being corrupted.
		this._server.stdout.setEncoding('utf8')
		this._server.stdout.on('data', chunk => this._on_server_output(chunk))

		// Writing to a server that has gone away surfaces as an 'error' event
		// on stdin; without a listener that event would crash the process.
		this._server.stdin.on('error', err => {
			process.stderr.write(`[fw-stt] server stdin error: ${err.message}\n`)
		})

		this._server.on('error', err => {
			process.stderr.write(`[fw-stt] server error: ${err.message}\n`)
			this._server_gone(`server error: ${err.message}`)
		})
		this._server.on('close', code => {
			process.stderr.write(`[fw-stt] server exited (${code})\n`)
			this._server_gone(`server exited (${code})`)
		})
	}

	/**
	 * Consume a chunk of server stdout and dispatch every complete line.
	 *
	 * `ready` resolves the ready promise; blank lines are ignored; any other
	 * line is treated as the response to the oldest pending request. A line
	 * that is not a JSON object resolves that request with an empty result.
	 *
	 * @param {string|Buffer} chunk Next piece of the server's stdout.
	 */
	_on_server_output(chunk) {
		this._line_buf += chunk
		let nl
		while ((nl = this._line_buf.indexOf('\n')) !== -1) {
			const line = this._line_buf.slice(0, nl).trim()
			this._line_buf = this._line_buf.slice(nl + 1)

			// A blank line is not a response; consuming a pending request for
			// it would shift every later response onto the wrong request.
			if (line === '') continue
			if (line === 'ready') {
				this._ready_resolve()
				continue
			}

			const pending = this._response_queue.shift()
			if (!pending) continue

			let result = null
			try {
				result = JSON.parse(line)
			} catch (err) {
				process.stderr.write(`[fw-stt] unparseable server response: ${err.message}\n`)
			}
			if (result === null || typeof result !== 'object') {
				result = { text: '', words: [] }
			}
			pending.resolve(result)
		}
	}

	/**
	 * Record that the server process is no longer usable and fail everything
	 * that was waiting on it. Safe to call more than once.
	 *
	 * @param {string} reason Message for the errors handed to pending requests.
	 */
	_server_gone(reason) {
		this._server_down = true
		// Release callers still waiting for "ready"; they re-check _server_down.
		this._ready_resolve()
		const abandoned = this._response_queue.splice(0)
		for (const { reject } of abandoned) {
			reject(new Error(reason))
		}
	}

	/**
	 * Return the segment's samples with up to PRE_ROLL_SAMPLES of preceding
	 * audio from the history ring buffer prepended.
	 *
	 * @param {{start: number, samples: Float32Array}} seg VAD segment; `start`
	 *   is used as an absolute sample index into the history.
	 * @returns {Float32Array} `seg.samples` itself when there is no pre-roll,
	 *   otherwise a new array of pre-roll followed by the segment.
	 */
	_with_preroll(seg) {
		const pre_start = Math.max(0, seg.start - PRE_ROLL_SAMPLES)
		const pre_len   = seg.start - pre_start
		if (pre_len === 0) return seg.samples
		const out = new Float32Array(pre_len + seg.samples.length)
		for (let i = 0; i < pre_len; i++) {
			out[i] = this._history[(pre_start + i) % HISTORY_SAMPLES]
		}
		out.set(seg.samples, pre_len)
		return out
	}

	/**
	 * Send samples to the server and wait for its response line.
	 *
	 * @param {Float32Array} samples Audio at 16 kHz.
	 * @returns {Promise<object>} The parsed response; `listen()` reads its
	 *   `text` and `words` fields.
	 * @throws {Error} (as a rejection) when the server has errored or exited,
	 *   either before the request was sent or while it was pending.
	 */
	async _transcribe(samples) {
		await this._ready
		if (this._server_down) {
			throw new Error('faster-whisper server is not running')
		}
		return new Promise((resolve, reject) => {
			this._response_queue.push({ resolve, reject })
			const bytes   = Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength)
			const request = JSON.stringify({ audio_b64: bytes.toString('base64'), sample_rate: 16000 }) + '\n'
			this._server.stdin.write(request)
		})
	}

	/**
	 * Fire-and-forget POST of a segment to `debug_url`.
	 *
	 * Body layout: 4-byte little-endian length of the JSON metadata, the JSON
	 * metadata as UTF-8, then the raw bytes of `samples`. Network failures are
	 * logged to stderr and otherwise ignored.
	 *
	 * @param {object} meta Metadata to serialize as JSON.
	 * @param {Float32Array} samples Audio that was transcribed.
	 */
	_post_debug(meta, samples) {
		const json_buf = Buffer.from(JSON.stringify(meta), 'utf8')
		const len_buf  = Buffer.allocUnsafe(4)
		len_buf.writeUInt32LE(json_buf.byteLength, 0)
		const samp_buf = Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength)
		const body     = Buffer.concat([len_buf, json_buf, samp_buf])
		fetch(this._debug_url, {
			method:  'POST',
			body,
			headers: { 'Content-Type': 'application/octet-stream' },
		}).catch(err => {
			process.stderr.write(`[fw-stt] debug post failed: ${err.message}\n`)
		})
	}

	/**
	 * Start capturing from the microphone and transcribing detected speech.
	 *
	 * @param {(text: string) => void} on_text Called with each non-empty,
	 *   trimmed transcript.
	 * @param {object} [options]
	 * @param {(chunk: Buffer) => void} [options.on_audio] Called with every raw
	 *   chunk read from the mic process, before any processing.
	 * @returns {() => void} Stop function: kills the mic process and the server.
	 * @throws {Error} When no mic capture command is available.
	 */
	listen(on_text, { on_audio } = {}) {
		const [cmd, args] = find_mic()
		process.stderr.write(`[fw-stt] mic: ${cmd} ${args.join(' ')}\n`)
		const mic = spawn(cmd, args, { stdio: ['ignore', 'pipe', 'inherit'] })

		// Samples per window fed to the VAD (matches windowSize in init()).
		const VAD_WIN = 512
		// Bytes received from the mic that do not yet fill a whole window.
		let pending = Buffer.alloc(0)

		mic.stdout.on('data', (chunk) => {
			if (on_audio) on_audio(chunk)
			pending = Buffer.concat([pending, chunk])

			// Two bytes per sample: feed the VAD one full window at a time and
			// mirror the same samples into the history ring buffer.
			while (pending.length >= VAD_WIN * 2) {
				const win  = pending.subarray(0, VAD_WIN * 2)
				pending    = pending.subarray(VAD_WIN * 2)
				const f32  = s16le_to_f32(win)
				const base = this._history_pos % HISTORY_SAMPLES
				for (let i = 0; i < f32.length; i++) {
					this._history[(base + i) % HISTORY_SAMPLES] = f32[i]
				}
				this._history_pos += f32.length
				this._vad.acceptWaveform(f32)
			}

			// Drain every segment the VAD has completed so far.
			while (!this._vad.isEmpty()) {
				const seg      = this._vad.front()
				this._vad.pop()
				// Diagnostic only: offset between the VAD's reported start and
				// where the segment would start if it ended at the newest sample.
				const drift    = seg.start - (this._history_pos - seg.samples.length)
				const with_pre = this._with_preroll(seg)
				process.stderr.write(`[fw-stt] segment: samples=${seg.samples.length} drift=${drift}\n`)

				this._transcribe(with_pre).then(result => {
					const text  = (result.text ?? '').trim()
					const words = result.words ?? []
					process.stderr.write(`[fw-stt] raw: ${JSON.stringify(result.text)} (${words.length} words)\n`)

					if (this._debug_url) {
						try {
							this._post_debug({
								preroll_length: with_pre.length - seg.samples.length,
								transcript:     result.text ?? '',
								tokens:         words.map(w => w.word),
								timestamps:     words.map(w => w.start),
								durations:      words.map(w => w.end - w.start),
								timestamp:      Date.now(),
								drift,
							}, with_pre)
						} catch (err) {
							process.stderr.write(`[fw-stt] debug post error: ${err.message}\n`)
						}
					}

					if (text) on_text(text)
				}).catch(err => {
					process.stderr.write(`[fw-stt] transcription error: ${err.message}\n`)
				})
			}
		})

		mic.on('error', err => process.stderr.write(`[fw-stt] mic error: ${err.message}\n`))
		mic.on('close', code => {
			if (code !== null && code !== 0) {
				process.stderr.write(`[fw-stt] mic exited (${code})\n`)
			}
		})

		return () => { mic.kill(); this._server?.kill() }
	}
}