Files
claude-voice-experiment/lib/stt.mjs
mikael-lovqvists-claude-agent 20873be786 Add README, faster-whisper backend, and session fixes
- README explaining experimental/transparency purpose
- faster-whisper STT backend (fw-stt.mjs, faster-whisper-server.py, install-faster-whisper.sh)
- Bug fixes: Buffer alignment in on_audio, --debug-waveform URL parsing, silent fetch errors, instant dispatch timer leak
- Global uncaughtException/unhandledRejection handlers in query-demo.mjs
- Design docs: CHANGELOG, COMMAND-DISPATCH, INTERFACE-THEORY, VOICE-POLICY
- Systemd service unit templates

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-07 06:39:14 +00:00

226 lines
7.3 KiB
JavaScript

/**
* Speech-to-Text using sherpa-onnx-node.
*
* Uses Whisper (offline/batch) as the recognizer, gated by Silero VAD so
* transcription only runs when a complete utterance has been detected.
* Audio is captured from the microphone via parec (PulseAudio) or arecord (ALSA).
*
* Model layout expected under models_dir:
*   silero_vad.onnx
*   sherpa-onnx-whisper-<name>/
*     <name>-encoder.int8.onnx
*     <name>-decoder.int8.onnx
*     <name>-tokens.txt
*
* Available Whisper model names (pass as whisper_name):
* tiny.en base.en small.en medium.en large-v3 large-v3-turbo
* Download with: bash download-models.sh [model-name]
*/
import { spawn, execSync } from 'node:child_process'
import { createRequire } from 'node:module'
import * as path from 'node:path'
// ES modules have no built-in require(); create one so init() can load
// the sherpa-onnx-node package with require().
const require = createRequire(import.meta.url)
// Default model directory: "models" one level above this file's directory.
const DEFAULT_MODELS_DIR = path.join(import.meta.dirname, '..', 'models')
// Maximum number of samples taken from the history ring buffer and prepended
// to a VAD segment before it is transcribed.
const PRE_ROLL_SAMPLES = 3200 // 200ms at 16kHz
// Capacity, in samples, of the history ring buffer the pre-roll is read from.
const HISTORY_SAMPLES = 960000 // 60s ring buffer — matches VAD internal size
/**
 * Choose the microphone capture command to spawn.
 *
 * Probes for `parec` first and `arecord` second by running `which <cmd>`;
 * the first one that `which` finds wins. Both argument lists request raw
 * signed 16-bit little-endian, 16 kHz, single-channel audio.
 *
 * @returns {[string, string[]]} `[command, args]` pair for the capture tool.
 * @throws {Error} When neither command is found.
 */
function find_mic() {
  // A failing `which` makes execSync throw; treat that as "not installed".
  const is_on_path = (tool) => {
    try {
      execSync(`which ${tool}`, { stdio: 'ignore' })
      return true
    } catch {
      return false
    }
  }

  const parec = ['parec', ['--format=s16le', '--rate=16000', '--channels=1', '--latency-msec=50']]
  const arecord = ['arecord', ['-f', 'S16_LE', '-r', '16000', '-c', '1', '-t', 'raw', '-q']]

  // Array.find stops at the first hit, so arecord is only probed when parec is missing.
  const chosen = [parec, arecord].find(([tool]) => is_on_path(tool))
  if (chosen === undefined) {
    throw new Error('No mic capture command found — need parec (PulseAudio) or arecord (ALSA)')
  }

  const [tool, tool_args] = chosen
  return [tool, tool_args]
}
/**
 * Convert raw signed 16-bit little-endian PCM into float samples.
 *
 * Each 2-byte sample is divided by 32768, so the output lies in [-1, 1).
 * A trailing odd byte is ignored.
 *
 * @param {Buffer} buf - Raw s16le bytes.
 * @returns {Float32Array} One float per complete 16-bit sample.
 */
function s16le_to_f32(buf) {
  const sample_count = buf.length >> 1 // whole 16-bit samples only
  return Float32Array.from(
    { length: sample_count },
    (_unused, index) => buf.readInt16LE(index * 2) / 32768.0,
  )
}
/**
 * Microphone speech-to-text pipeline.
 *
 * Raw PCM from a capture subprocess is fed to a Silero VAD in fixed-size
 * windows; every speech segment the VAD emits is transcribed by an offline
 * Whisper recognizer and passed to a caller-supplied callback.
 *
 * Call init() once to load the models before the first audio arrives.
 */
export class Stt {
  /**
   * @param {object} [options]
   * @param {string} [options.models_dir] - Directory holding silero_vad.onnx
   *   and the sherpa-onnx-whisper-<name>/ folder.
   * @param {string} [options.whisper_name='base.en'] - Whisper model name; used
   *   for both the folder name and the model file prefixes.
   * @param {string} [options.provider='cuda'] - Provider passed to the Whisper
   *   recognizer config (the VAD always uses 'cpu').
   * @param {?string} [options.debug_url=null] - When set, every segment is
   *   also POSTed to this URL together with its transcription metadata.
   */
  constructor({
    models_dir = DEFAULT_MODELS_DIR,
    whisper_name = 'base.en',
    provider = 'cuda',
    debug_url = null,
  } = {}) {
    this._whisper_dir = path.join(models_dir, `sherpa-onnx-whisper-${whisper_name}`)
    this._whisper_name = whisper_name
    this._vad_model = path.join(models_dir, 'silero_vad.onnx')
    this._provider = provider
    this._debug_url = debug_url
    this._recognizer = null
    this._vad = null
    this._history = new Float32Array(HISTORY_SAMPLES)
    this._history_pos = 0 // total samples fed, monotonically increasing
    this._result_keys_logged = false // one-shot flag for the result-shape diagnostics
  }

  /**
   * Fire-and-forget POST of one segment to the debug endpoint.
   *
   * Body layout: 4-byte little-endian length of the JSON, the UTF-8 JSON
   * metadata, then the raw bytes of the float sample array.
   *
   * @param {object} meta - JSON-serialisable metadata.
   * @param {Float32Array} samples - Audio that was sent to the recognizer.
   */
  _post_debug(meta, samples) {
    const json_buf = Buffer.from(JSON.stringify(meta), 'utf8')
    const len_buf = Buffer.allocUnsafe(4)
    len_buf.writeUInt32LE(json_buf.byteLength, 0)
    // View over the typed array's bytes; Buffer.concat below copies them.
    const samp_buf = Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength)
    const body = Buffer.concat([len_buf, json_buf, samp_buf])
    fetch(this._debug_url, {
      method: 'POST',
      body,
      headers: { 'Content-Type': 'application/octet-stream' },
    }).catch((err) => {
      // Debug posting is best-effort, but report the failure instead of hiding it.
      process.stderr.write(`[stt] debug post failed: ${err?.message ?? err}\n`)
    })
  }

  /**
   * Load sherpa-onnx-node and construct the Whisper recognizer and the VAD.
   */
  init() {
    const sherpa = require('sherpa-onnx-node')
    const { _whisper_dir: wdir, _whisper_name: wname, _vad_model, _provider } = this
    this._recognizer = new sherpa.OfflineRecognizer({
      featConfig: { sampleRate: 16000, featureDim: 80 },
      modelConfig: {
        whisper: {
          encoder: path.join(wdir, `${wname}-encoder.int8.onnx`),
          decoder: path.join(wdir, `${wname}-decoder.int8.onnx`),
          enableTokenTimestamps: 1,
        },
        tokens: path.join(wdir, `${wname}-tokens.txt`),
        numThreads: 2,
        provider: _provider,
        debug: false,
      },
    })
    // VAD runs on CPU — silero_vad.onnx is tiny
    this._vad = new sherpa.Vad({
      sileroVad: {
        model: _vad_model,
        threshold: 0.5,
        minSilenceDuration: 0.5, // seconds of silence to end an utterance
        minSpeechDuration: 0.1, // ignore sub-100ms blips
        windowSize: 512, // 32ms @ 16kHz
        maxSpeechDuration: 30.0,
      },
      sampleRate: 16000,
      numThreads: 1,
      provider: 'cpu',
      debug: false,
    }, 60) // 60-second internal ring buffer
  }

  /**
   * Run the recognizer over one block of audio.
   *
   * @param {Float32Array} samples - 16 kHz mono float samples.
   * @returns {object} The recognizer's result object for this audio.
   */
  _get_result(samples) {
    const stream = this._recognizer.createStream()
    // Best-effort: a failure to set this option is deliberately ignored.
    try { stream.setOption('enableTokenTimestamps', '1') } catch {}
    stream.acceptWaveform({ samples, sampleRate: 16000 })
    this._recognizer.decode(stream)
    return this._recognizer.getResult(stream)
  }

  /**
   * Transcribe audio and return the text exactly as the recognizer produced it.
   *
   * @param {Float32Array} samples
   * @returns {string} Untrimmed transcript; '' when the result has no text.
   */
  _transcribe_raw(samples) {
    // Same fallback listen() applies, so _transcribe() can always trim.
    return this._get_result(samples).text ?? ''
  }

  /**
   * Transcribe audio and return the whitespace-trimmed text.
   *
   * @param {Float32Array} samples
   * @returns {string}
   */
  _transcribe(samples) {
    return this._transcribe_raw(samples).trim()
  }

  /**
   * Prepend up to PRE_ROLL_SAMPLES of earlier audio to a VAD segment.
   *
   * The earlier audio is read from the history ring buffer at the absolute
   * sample positions just before seg.start.
   *
   * @param {{start: number, samples: Float32Array}} seg - VAD segment.
   * @returns {Float32Array} seg.samples itself when there is no pre-roll
   *   (segment starts at 0), otherwise a new array of pre-roll + segment.
   */
  _with_preroll(seg) {
    const pre_start = Math.max(0, seg.start - PRE_ROLL_SAMPLES)
    const pre_len = seg.start - pre_start
    if (pre_len === 0) return seg.samples
    const out = new Float32Array(pre_len + seg.samples.length)
    for (let i = 0; i < pre_len; i++) {
      out[i] = this._history[(pre_start + i) % HISTORY_SAMPLES]
    }
    out.set(seg.samples, pre_len)
    return out
  }

  /**
   * Append one window of samples to the history ring buffer (used for pre-roll).
   *
   * @param {Float32Array} f32 - Samples about to be fed to the VAD.
   */
  _remember(f32) {
    const base = this._history_pos % HISTORY_SAMPLES
    for (let i = 0; i < f32.length; i++) {
      this._history[(base + i) % HISTORY_SAMPLES] = f32[i]
    }
    this._history_pos += f32.length
  }

  /**
   * Transcribe one VAD segment, emit diagnostics, and report non-empty text.
   *
   * @param {{start: number, samples: Float32Array}} seg - VAD segment.
   * @param {(text: string) => void} on_text - Called with the trimmed transcript.
   */
  _handle_segment(seg, on_text) {
    // Diagnostic only: gap between the VAD's reported start and the start the
    // segment would have if it ended at the newest sample fed so far.
    const drift = seg.start - (this._history_pos - seg.samples.length)
    const with_pre = this._with_preroll(seg)
    process.stderr.write(`[stt] segment: start=${seg.start} history_pos=${this._history_pos} samples=${seg.samples.length} drift=${drift}\n`)
    const result = this._get_result(with_pre)
    // Log the shape of the result object once, on the first segment.
    if (!this._result_keys_logged) {
      this._result_keys_logged = true
      process.stderr.write(`[stt] result keys: ${JSON.stringify(Object.keys(result))}\n`)
      process.stderr.write(`[stt] result sample: ${JSON.stringify({ tokens: result.tokens?.slice(0,3), timestamps: result.timestamps?.slice(0,3), durations: result.durations?.slice(0,3) })}\n`)
    }
    const raw = result.text ?? ''
    const text = raw.trim()
    process.stderr.write(`[stt] whisper raw: ${JSON.stringify(raw)}\n`)
    if (this._debug_url) {
      try {
        this._post_debug({
          preroll_length: with_pre.length - seg.samples.length,
          transcript: raw,
          tokens: result.tokens ?? [],
          timestamps: result.timestamps ?? [],
          durations: result.durations ?? [],
          timestamp: Date.now(),
          seg_start: seg.start,
          history_pos: this._history_pos,
          drift,
        }, with_pre)
      } catch (err) {
        // Covers synchronous failures while building the request body.
        process.stderr.write(`[stt] debug post error: ${err.message}\n`)
      }
    }
    if (text) {
      on_text(text)
    }
  }

  /**
   * Start listening. Calls on_text(text) for each detected utterance.
   * Returns a stop() function.
   *
   * @param {(text: string) => void} on_text - Receives each non-empty,
   *   trimmed transcript.
   * @param {object} [options]
   * @param {(chunk: Buffer) => void} [options.on_audio] - Receives every raw
   *   chunk read from the capture process, before any other processing.
   * @returns {() => boolean} Function that kills the capture process.
   * @throws {Error} When no capture command is available (from find_mic).
   */
  listen(on_text, { on_audio } = {}) {
    const [cmd, args] = find_mic()
    process.stderr.write(`[stt] mic: ${cmd} ${args.join(' ')}\n`)
    const mic = spawn(cmd, args, { stdio: ['ignore', 'pipe', 'inherit'] })
    const VAD_WIN = 512 // must match windowSize above
    const WIN_BYTES = VAD_WIN * 2 // s16le: two bytes per sample
    // Bytes received but not yet forming a complete VAD window.
    let pending = Buffer.alloc(0)
    mic.stdout.on('data', (chunk) => {
      if (on_audio) on_audio(chunk)
      pending = Buffer.concat([pending, chunk])
      // Feed complete VAD windows
      while (pending.length >= WIN_BYTES) {
        const win = pending.subarray(0, WIN_BYTES)
        pending = pending.subarray(WIN_BYTES)
        const f32 = s16le_to_f32(win)
        // Write to history ring buffer for pre-roll
        this._remember(f32)
        this._vad.acceptWaveform(f32)
      }
      // Drain any complete speech segments
      while (!this._vad.isEmpty()) {
        const seg = this._vad.front()
        this._vad.pop()
        this._handle_segment(seg, on_text)
      }
    })
    mic.on('error', err => process.stderr.write(`[stt] mic error: ${err.message}\n`))
    mic.on('close', code => {
      if (code !== null && code !== 0) {
        process.stderr.write(`[stt] mic exited with code ${code}\n`)
      }
    })
    return () => mic.kill()
  }
}