/**
 * Speech-to-Text using sherpa-onnx-node.
 *
 * Uses Whisper (offline/batch) as the recognizer, gated by Silero VAD so
 * transcription only runs when a complete utterance has been detected.
 * Audio is captured from the microphone via parec (PulseAudio) or arecord (ALSA).
 *
 * Model layout expected under models_dir:
 *   silero_vad.onnx
 *   sherpa-onnx-whisper-<name>/
 *     <name>-encoder.int8.onnx
 *     <name>-decoder.int8.onnx
 *     <name>-tokens.txt
 *
 * Available Whisper model names (pass as whisper_name):
 *   tiny.en  base.en  small.en  medium.en  large-v3  large-v3-turbo
 * Download with: bash download-models.sh [model-name]
 */

import { spawn, execSync } from 'node:child_process'
import { createRequire } from 'node:module'
import * as path from 'node:path'
import { fileURLToPath } from 'node:url'

const require = createRequire(import.meta.url)

// import.meta.dirname is missing on older Node releases; derive the directory
// from import.meta.url there so importing this module does not throw.
const MODULE_DIR = import.meta.dirname ?? path.dirname(fileURLToPath(import.meta.url))

const DEFAULT_MODELS_DIR = path.join(MODULE_DIR, '..', 'models')
const PRE_ROLL_SAMPLES = 3200 // 200ms at 16kHz
const HISTORY_SAMPLES = 960000 // 60s ring buffer — matches VAD internal size

/**
 * Pick the first available microphone capture command.
 *
 * Tries parec, then arecord, probing each with `which`. Both are configured
 * for raw signed 16-bit little-endian, 16 kHz, mono output.
 *
 * @returns {[string, string[]]} the command name and its argument list
 * @throws {Error} when neither command is found
 */
function find_mic() {
  const candidates = [
    ['parec', ['--format=s16le', '--rate=16000', '--channels=1', '--latency-msec=50']],
    ['arecord', ['-f', 'S16_LE', '-r', '16000', '-c', '1', '-t', 'raw', '-q']],
  ]
  for (const [cmd, args] of candidates) {
    try {
      // `which` exits non-zero (execSync throws) when the command is absent.
      execSync(`which ${cmd}`, { stdio: 'ignore' })
      return [cmd, args]
    } catch { /* try next */ }
  }
  throw new Error('No mic capture command found — need parec (PulseAudio) or arecord (ALSA)')
}

/**
 * Convert signed 16-bit little-endian PCM bytes to float samples in [-1, 1).
 *
 * @param {Buffer} buf - raw PCM bytes, two bytes per sample
 * @returns {Float32Array} one float per 16-bit sample
 */
function s16le_to_f32(buf) {
  const out = new Float32Array(buf.length / 2)
  for (let i = 0; i < out.length; i++) {
    out[i] = buf.readInt16LE(i * 2) / 32768.0
  }
  return out
}

/**
 * Microphone speech recognizer: Silero VAD segments the audio stream and each
 * completed segment is decoded with an offline Whisper recognizer.
 */
export class Stt {
  /**
   * @param {object} [options]
   * @param {string} [options.models_dir] - directory holding the VAD and Whisper models
   * @param {string} [options.whisper_name] - Whisper model name, e.g. 'base.en'
   * @param {string} [options.provider] - provider passed to the Whisper recognizer
   * @param {string|null} [options.debug_url] - when set, every segment is POSTed here
   */
  constructor({
    models_dir = DEFAULT_MODELS_DIR,
    whisper_name = 'base.en',
    provider = 'cuda',
    debug_url = null,
  } = {}) {
    this._whisper_dir = path.join(models_dir, `sherpa-onnx-whisper-${whisper_name}`)
    this._whisper_name = whisper_name
    this._vad_model = path.join(models_dir, 'silero_vad.onnx')
    this._provider = provider
    this._debug_url = debug_url
    this._recognizer = null
    this._vad = null
    this._history = new Float32Array(HISTORY_SAMPLES)
    this._history_pos = 0 // total samples fed, monotonically increasing
    this._result_keys_logged = false // the result-shape diagnostic is logged once
  }

  /**
   * Fire-and-forget POST of one segment to the debug endpoint.
   *
   * Body layout: 4-byte little-endian length of the JSON metadata, the JSON
   * metadata (UTF-8), then the raw bytes of the float sample array.
   *
   * @param {object} meta - JSON-serialisable metadata
   * @param {Float32Array} samples - audio that was sent to the recognizer
   */
  _post_debug(meta, samples) {
    const json_buf = Buffer.from(JSON.stringify(meta), 'utf8')
    const len_buf = Buffer.allocUnsafe(4)
    len_buf.writeUInt32LE(json_buf.byteLength, 0)
    const samp_buf = Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength)
    const body = Buffer.concat([len_buf, json_buf, samp_buf])
    // Debug delivery is best-effort: network failures are deliberately ignored.
    fetch(this._debug_url, {
      method: 'POST',
      body,
      headers: { 'Content-Type': 'application/octet-stream' },
    }).catch(() => {})
  }

  /**
   * Load sherpa-onnx-node and create the Whisper recognizer and the VAD.
   * Must have run before any audio reaches the handler installed by listen().
   */
  init() {
    const sherpa = require('sherpa-onnx-node')
    const { _whisper_dir: wdir, _whisper_name: wname, _vad_model, _provider } = this

    this._recognizer = new sherpa.OfflineRecognizer({
      featConfig: { sampleRate: 16000, featureDim: 80 },
      modelConfig: {
        whisper: {
          encoder: path.join(wdir, `${wname}-encoder.int8.onnx`),
          decoder: path.join(wdir, `${wname}-decoder.int8.onnx`),
          enableTokenTimestamps: 1,
        },
        tokens: path.join(wdir, `${wname}-tokens.txt`),
        numThreads: 2,
        provider: _provider,
        debug: false,
      },
    })

    // VAD runs on CPU — silero_vad.onnx is tiny
    this._vad = new sherpa.Vad({
      sileroVad: {
        model: _vad_model,
        threshold: 0.5,
        minSilenceDuration: 0.5, // seconds of silence to end an utterance
        minSpeechDuration: 0.1, // ignore sub-100ms blips
        windowSize: 512, // 32ms @ 16kHz
        maxSpeechDuration: 30.0,
      },
      sampleRate: 16000,
      numThreads: 1,
      provider: 'cpu',
      debug: false,
    }, 60) // 60-second internal ring buffer
  }

  /**
   * Decode one block of audio and return the recognizer's result object.
   *
   * @param {Float32Array} samples - 16 kHz mono audio
   * @returns {object} recognizer result (listen() reads text, tokens, timestamps, durations)
   */
  _get_result(samples) {
    const stream = this._recognizer.createStream()
    // Best-effort: a failure to set the option is ignored on purpose.
    try { stream.setOption('enableTokenTimestamps', '1') } catch {}
    stream.acceptWaveform({ samples, sampleRate: 16000 })
    this._recognizer.decode(stream)
    return this._recognizer.getResult(stream)
  }

  /**
   * Transcribe audio and return the untrimmed text.
   *
   * @param {Float32Array} samples - 16 kHz mono audio
   * @returns {string} recognizer text, or '' when the result carries no text
   */
  _transcribe_raw(samples) {
    // Same fallback as listen(), so _transcribe() never trims undefined.
    return this._get_result(samples).text ?? ''
  }

  /**
   * Transcribe audio and return the whitespace-trimmed text.
   *
   * @param {Float32Array} samples - 16 kHz mono audio
   * @returns {string} trimmed transcript
   */
  _transcribe(samples) {
    return this._transcribe_raw(samples).trim()
  }

  /**
   * Prepend up to PRE_ROLL_SAMPLES of audio preceding a VAD segment, taken
   * from the history ring buffer.
   *
   * seg.start is used as an index into the same sample stream that listen()
   * mirrors into the history buffer (assumption inherited from the original
   * code — confirm against the VAD's segment semantics).
   *
   * @param {{start: number, samples: Float32Array}} seg - VAD segment
   * @returns {Float32Array} seg.samples itself when no pre-roll is available,
   *   otherwise a new array of pre-roll followed by the segment samples
   */
  _with_preroll(seg) {
    // Anything older than this index has been overwritten in the ring buffer.
    const oldest = Math.max(0, this._history_pos - HISTORY_SAMPLES)
    const wanted = Math.max(oldest, seg.start - PRE_ROLL_SAMPLES)
    // Never start past the segment itself (pre-roll fully overwritten).
    const pre_start = Math.min(seg.start, wanted)
    const pre_len = seg.start - pre_start
    if (pre_len === 0) return seg.samples

    const out = new Float32Array(pre_len + seg.samples.length)
    for (let i = 0; i < pre_len; i++) {
      out[i] = this._history[(pre_start + i) % HISTORY_SAMPLES]
    }
    out.set(seg.samples, pre_len)
    return out
  }

  /**
   * Start listening. Calls on_text(text) for each detected utterance.
   * Returns a stop() function.
   *
   * @param {(text: string) => void} on_text - receives each non-empty trimmed transcript
   * @param {object} [options]
   * @param {(chunk: Buffer) => void} [options.on_audio] - receives every raw mic chunk
   * @returns {() => boolean} stop function that kills the capture process
   * @throws {Error} when no capture command is available
   */
  listen(on_text, { on_audio } = {}) {
    const [cmd, args] = find_mic()
    process.stderr.write(`[stt] mic: ${cmd} ${args.join(' ')}\n`)

    const mic = spawn(cmd, args, { stdio: ['ignore', 'pipe', 'inherit'] })
    const VAD_WIN = 512 // must match windowSize above
    const WIN_BYTES = VAD_WIN * 2 // two bytes per s16le sample
    let pending = Buffer.alloc(0)

    mic.stdout.on('data', (chunk) => {
      if (on_audio) on_audio(chunk)
      pending = Buffer.concat([pending, chunk])

      // Feed complete VAD windows
      while (pending.length >= WIN_BYTES) {
        const win = pending.subarray(0, WIN_BYTES)
        pending = pending.subarray(WIN_BYTES)
        const f32 = s16le_to_f32(win)

        // Write to history ring buffer for pre-roll
        const base = this._history_pos % HISTORY_SAMPLES
        for (let i = 0; i < f32.length; i++) {
          this._history[(base + i) % HISTORY_SAMPLES] = f32[i]
        }
        this._history_pos += f32.length

        this._vad.acceptWaveform(f32)
      }

      // Drain any complete speech segments
      while (!this._vad.isEmpty()) {
        const seg = this._vad.front()
        this._vad.pop()

        // Diagnostic only: offset between the segment start and where the
        // segment would begin if it ended exactly at the newest fed sample.
        const drift = seg.start - (this._history_pos - seg.samples.length)
        const with_pre = this._with_preroll(seg)
        process.stderr.write(`[stt] segment: start=${seg.start} history_pos=${this._history_pos} samples=${seg.samples.length} drift=${drift}\n`)

        const result = this._get_result(with_pre)
        if (!this._result_keys_logged) {
          // Log the shape of the first result once, for debugging.
          this._result_keys_logged = true
          process.stderr.write(`[stt] result keys: ${JSON.stringify(Object.keys(result))}\n`)
          process.stderr.write(`[stt] result sample: ${JSON.stringify({ tokens: result.tokens?.slice(0,3), timestamps: result.timestamps?.slice(0,3), durations: result.durations?.slice(0,3) })}\n`)
        }

        const raw = result.text ?? ''
        const text = raw.trim()
        process.stderr.write(`[stt] whisper raw: ${JSON.stringify(raw)}\n`)

        if (this._debug_url) {
          try {
            this._post_debug({
              preroll_length: with_pre.length - seg.samples.length,
              transcript: raw,
              tokens: result.tokens ?? [],
              timestamps: result.timestamps ?? [],
              durations: result.durations ?? [],
              timestamp: Date.now(),
              seg_start: seg.start,
              history_pos: this._history_pos,
              drift,
            }, with_pre)
          } catch (err) {
            process.stderr.write(`[stt] debug post error: ${err.message}\n`)
          }
        }

        if (text) {
          on_text(text)
        }
      }
    })

    mic.on('error', err => process.stderr.write(`[stt] mic error: ${err.message}\n`))
    mic.on('close', code => {
      // code is null when the process was terminated by a signal (e.g. stop()).
      if (code !== null && code !== 0) {
        process.stderr.write(`[stt] mic exited with code ${code}\n`)
      }
    })

    return () => mic.kill()
  }
}