- README explaining experimental/transparency purpose - faster-whisper STT backend (fw-stt.mjs, faster-whisper-server.py, install-faster-whisper.sh) - Bug fixes: Buffer alignment in on_audio, --debug-waveform URL parsing, silent fetch errors, instant dispatch timer leak - Global uncaughtException/unhandledRejection handlers in query-demo.mjs - Design docs: CHANGELOG, COMMAND-DISPATCH, INTERFACE-THEORY, VOICE-POLICY - Systemd service unit templates Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
236 lines
7.2 KiB
JavaScript
236 lines
7.2 KiB
JavaScript
/**
|
|
* faster-whisper STT backend — drop-in replacement for stt.mjs.
|
|
*
|
|
* Uses sherpa-onnx Silero VAD for segment detection (same as stt.mjs),
|
|
* but transcribes via a faster-whisper Python subprocess which returns
|
|
* word-level timestamps.
|
|
*/
|
|
|
|
import { spawn, execSync } from 'node:child_process'
|
|
import { createRequire } from 'node:module'
|
|
import * as path from 'node:path'
|
|
import { existsSync } from 'node:fs'
|
|
|
|
const require = createRequire(import.meta.url)
|
|
|
|
const DEFAULT_MODELS_DIR = path.join(import.meta.dirname, '..', 'models')
|
|
const DEFAULT_SERVER = path.join(import.meta.dirname, '..', 'faster-whisper-server.py')
|
|
const VENV_PYTHON = path.join(import.meta.dirname, '..', 'venv', 'bin', 'python3')
|
|
|
|
const PRE_ROLL_SAMPLES = 3200
|
|
const HISTORY_SAMPLES = 960000
|
|
|
|
/**
 * Selects the command used to capture raw microphone audio.
 *
 * Candidates are probed in order of preference (`parec`, then `arecord`) by
 * running `which <cmd>`; the first one that is found wins. Both candidates
 * are configured for 16 kHz, mono, signed 16-bit little-endian output.
 *
 * @returns {[string, string[]]} The command name and its argument list.
 * @throws {Error} When neither command is available.
 */
function find_mic() {
  const capture_tools = [
    { cmd: 'parec', args: ['--format=s16le', '--rate=16000', '--channels=1', '--latency-msec=50'] },
    { cmd: 'arecord', args: ['-f', 'S16_LE', '-r', '16000', '-c', '1', '-t', 'raw', '-q'] },
  ]

  // `which` exits non-zero when the command is missing, which makes
  // execSync throw; that is treated as "not installed".
  const is_installed = (cmd) => {
    try {
      execSync(`which ${cmd}`, { stdio: 'ignore' })
      return true
    } catch {
      return false
    }
  }

  const chosen = capture_tools.find(tool => is_installed(tool.cmd))
  if (chosen === undefined) {
    throw new Error('No mic capture command found — need parec or arecord')
  }
  return [chosen.cmd, chosen.args]
}
|
|
|
|
/**
 * Converts signed 16-bit little-endian PCM bytes to float samples.
 *
 * Each 2-byte sample is divided by 32768, so the result lies in [-1, 1).
 * A trailing odd byte, if any, is ignored.
 *
 * @param {Buffer} buf Raw PCM bytes.
 * @returns {Float32Array} One float per 16-bit input sample.
 */
function s16le_to_f32(buf) {
  const samples = new Float32Array(buf.length / 2)
  let byte_offset = 0
  for (const idx of samples.keys()) {
    samples[idx] = buf.readInt16LE(byte_offset) / 32768.0
    byte_offset += 2
  }
  return samples
}
|
|
|
|
/**
 * Microphone speech-to-text pipeline.
 *
 * Audio captured from the microphone is fed to a sherpa-onnx Silero VAD in
 * 512-sample windows. Every speech segment the VAD emits is sent, together
 * with a short pre-roll taken from a circular sample history, to a Python
 * subprocess over a line-delimited JSON protocol on stdin/stdout. The
 * subprocess prints the line `ready` once, then one JSON line per request.
 *
 * Typical use: `const stt = new Stt(); stt.init(); const stop = stt.listen(cb)`.
 */
export class Stt {
  /**
   * @param {object} [options]
   * @param {string} [options.models_dir] Directory containing `silero_vad.onnx`.
   * @param {string} [options.whisper_name] Model name passed to the server as `--model`.
   * @param {string} [options.server_path] Path of the Python server script.
   * @param {string} [options.device] Value passed to the server as `--device`.
   * @param {?string} [options.debug_url] When set, every transcribed segment is
   *   also POSTed to this URL (see `_post_debug`).
   */
  constructor({
    models_dir = DEFAULT_MODELS_DIR,
    whisper_name = 'base.en',
    server_path = DEFAULT_SERVER,
    device = 'cuda',
    debug_url = null,
  } = {}) {
    this._models_dir = models_dir
    this._whisper_name = whisper_name
    this._server_path = server_path
    this._device = device
    this._debug_url = debug_url
    this._vad = null
    this._server = null

    // Circular buffer of the most recent samples; _history_pos counts every
    // sample ever written and is reduced modulo HISTORY_SAMPLES on access.
    this._history = new Float32Array(HISTORY_SAMPLES)
    this._history_pos = 0

    // Response queue: each entry is a resolve function waiting for the next JSON line
    this._response_queue = []
    this._line_buf = ''

    // Set once the server process has failed to start or has exited.
    this._server_gone = false

    // Ready promise — resolved when server prints "ready"
    let ready_resolve
    this._ready = new Promise(r => { ready_resolve = r })
    this._ready_resolve = ready_resolve
  }

  /**
   * Creates the VAD and starts the transcription server subprocess.
   *
   * Uses the virtualenv interpreter when it exists, otherwise `python3` from
   * PATH. The server's stderr is inherited; its stdout carries the protocol.
   */
  init() {
    const sherpa = require('sherpa-onnx-node')
    const vad_model = path.join(this._models_dir, 'silero_vad.onnx')

    // NOTE(review): the second argument (60) is presumably the VAD's internal
    // buffer size in seconds — confirm against the sherpa-onnx-node API.
    this._vad = new sherpa.Vad({
      sileroVad: {
        model: vad_model,
        threshold: 0.5,
        minSilenceDuration: 0.5,
        minSpeechDuration: 0.1,
        windowSize: 512,
        maxSpeechDuration: 30.0,
      },
      sampleRate: 16000,
      numThreads: 1,
      provider: 'cpu',
      debug: false,
    }, 60)

    const python = existsSync(VENV_PYTHON) ? VENV_PYTHON : 'python3'
    process.stderr.write(`[fw-stt] python: ${python}\n`)
    process.stderr.write(`[fw-stt] model: ${this._whisper_name}\n`)

    const server = spawn(python, [
      this._server_path,
      '--model', this._whisper_name,
      '--device', this._device,
    ], {
      stdio: ['pipe', 'pipe', 'inherit'],
      env: { ...process.env },
    })
    this._server = server
    this._server_gone = false

    // Decode stdout as a stream: converting each chunk on its own would
    // corrupt a multi-byte UTF-8 character that straddles two chunks.
    server.stdout.setEncoding('utf8')
    server.stdout.on('data', chunk => this._on_server_data(chunk))

    // A write to a server that has already exited surfaces as an 'error'
    // event on stdin; without a listener it would be an uncaught exception.
    server.stdin.on('error', err => {
      process.stderr.write(`[fw-stt] server stdin error: ${err.message}\n`)
    })

    // Events from a server replaced by a later init() call are only logged.
    server.on('error', err => {
      process.stderr.write(`[fw-stt] server error: ${err.message}\n`)
      if (this._server === server) this._on_server_gone()
    })
    server.on('close', code => {
      process.stderr.write(`[fw-stt] server exited (${code})\n`)
      if (this._server === server) this._on_server_gone()
    })
  }

  /**
   * Consumes a chunk of server stdout and dispatches every complete line.
   *
   * `ready` resolves the ready promise; blank lines are ignored; any other
   * line answers the oldest pending request. A line that is not valid JSON
   * is logged and answers that request with an empty result.
   *
   * @param {string} chunk Decoded text from the server's stdout.
   */
  _on_server_data(chunk) {
    this._line_buf += chunk
    let nl
    while ((nl = this._line_buf.indexOf('\n')) !== -1) {
      const line = this._line_buf.slice(0, nl).trim()
      this._line_buf = this._line_buf.slice(nl + 1)

      // A blank line is not a response; it must not use up a pending request.
      if (line === '') continue

      if (line === 'ready') {
        this._ready_resolve()
        continue
      }

      const resolver = this._response_queue.shift()
      if (!resolver) continue

      let result
      try {
        result = JSON.parse(line)
      } catch (err) {
        process.stderr.write(`[fw-stt] bad server response: ${err.message}\n`)
        result = { text: '', words: [] }
      }
      resolver(result)
    }
  }

  /**
   * Marks the server as unavailable and releases everything waiting on it:
   * callers blocked on the ready promise are unblocked, and every queued
   * request is answered with an empty result. Safe to call more than once.
   */
  _on_server_gone() {
    this._server_gone = true
    this._ready_resolve()
    for (const resolver of this._response_queue.splice(0)) {
      resolver({ text: '', words: [] })
    }
  }

  /**
   * Returns the segment's samples with up to PRE_ROLL_SAMPLES of preceding
   * audio from the history buffer prepended.
   *
   * @param {{start: number, samples: Float32Array}} seg VAD segment; `start`
   *   is used as an absolute sample index into the history.
   * @returns {Float32Array} `seg.samples` itself when no pre-roll is
   *   available (segment starts at sample 0), otherwise a new array.
   */
  _with_preroll(seg) {
    const pre_start = Math.max(0, seg.start - PRE_ROLL_SAMPLES)
    const pre_len = seg.start - pre_start
    if (pre_len === 0) return seg.samples
    const out = new Float32Array(pre_len + seg.samples.length)
    for (let i = 0; i < pre_len; i++) {
      out[i] = this._history[(pre_start + i) % HISTORY_SAMPLES]
    }
    out.set(seg.samples, pre_len)
    return out
  }

  /**
   * Sends samples to the server and resolves with its JSON response.
   *
   * Waits for the server to report ready first. The request is one JSON line
   * carrying the raw float32 bytes base64-encoded and a 16000 Hz sample rate.
   *
   * @param {Float32Array} samples Audio to transcribe.
   * @returns {Promise<object>} The parsed response line; `listen` reads its
   *   `text` and `words` fields.
   * @throws {Error} (as a rejection) When the server failed to start or exited.
   */
  async _transcribe(samples) {
    await this._ready
    if (this._server_gone) {
      throw new Error('faster-whisper server is not running')
    }
    return new Promise((resolve) => {
      this._response_queue.push(resolve)
      const bytes = Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength)
      const request = JSON.stringify({ audio_b64: bytes.toString('base64'), sample_rate: 16000 }) + '\n'
      this._server.stdin.write(request)
    })
  }

  /**
   * Fire-and-forget POST of a segment to the debug URL.
   *
   * Body layout: 4-byte little-endian length of the JSON metadata, the JSON
   * metadata as UTF-8, then the raw float32 sample bytes. Failures are logged.
   *
   * @param {object} meta JSON-serialisable metadata.
   * @param {Float32Array} samples Audio samples for the segment.
   */
  _post_debug(meta, samples) {
    const json_buf = Buffer.from(JSON.stringify(meta), 'utf8')
    const len_buf = Buffer.allocUnsafe(4)
    len_buf.writeUInt32LE(json_buf.byteLength, 0)
    const samp_buf = Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength)
    const body = Buffer.concat([len_buf, json_buf, samp_buf])
    fetch(this._debug_url, {
      method: 'POST',
      body,
      headers: { 'Content-Type': 'application/octet-stream' },
    }).catch(err => {
      process.stderr.write(`[fw-stt] debug post failed: ${err.message}\n`)
    })
  }

  /**
   * Transcribes one VAD segment and reports its text.
   *
   * @param {{start: number, samples: Float32Array}} seg Segment from the VAD.
   * @param {(text: string) => void} on_text Called with non-empty transcripts.
   */
  _handle_segment(seg, on_text) {
    // Offset between the VAD's reported start and where the segment would
    // start if it ended at the newest sample written to the history.
    const drift = seg.start - (this._history_pos - seg.samples.length)
    const with_pre = this._with_preroll(seg)
    process.stderr.write(`[fw-stt] segment: samples=${seg.samples.length} drift=${drift}\n`)

    this._transcribe(with_pre).then(result => {
      const text = (result.text ?? '').trim()
      const words = result.words ?? []
      process.stderr.write(`[fw-stt] raw: ${JSON.stringify(result.text)} (${words.length} words)\n`)

      if (this._debug_url) {
        try {
          this._post_debug({
            preroll_length: with_pre.length - seg.samples.length,
            transcript: result.text ?? '',
            tokens: words.map(w => w.word),
            timestamps: words.map(w => w.start),
            durations: words.map(w => w.end - w.start),
            timestamp: Date.now(),
            drift,
          }, with_pre)
        } catch (err) {
          process.stderr.write(`[fw-stt] debug post error: ${err.message}\n`)
        }
      }

      if (text) on_text(text)
    }).catch(err => {
      process.stderr.write(`[fw-stt] transcription error: ${err.message}\n`)
    })
  }

  /**
   * Starts microphone capture and transcribes speech until stopped.
   *
   * @param {(text: string) => void} on_text Called with each non-empty transcript.
   * @param {object} [options]
   * @param {(chunk: Buffer) => void} [options.on_audio] Called with every raw
   *   chunk read from the microphone, before any processing.
   * @returns {() => void} Stop function: kills the mic process and the server.
   * @throws {Error} When `init()` has not been called, or no capture command exists.
   */
  listen(on_text, { on_audio } = {}) {
    // Fail before spawning the mic: without a VAD the first audio chunk would
    // throw inside the data handler and leave the capture process running.
    if (!this._vad) {
      throw new Error('Stt.init() must be called before listen()')
    }

    const [cmd, args] = find_mic()
    process.stderr.write(`[fw-stt] mic: ${cmd} ${args.join(' ')}\n`)
    const mic = spawn(cmd, args, { stdio: ['ignore', 'pipe', 'inherit'] })

    const VAD_WIN = 512
    const WIN_BYTES = VAD_WIN * 2 // two bytes per s16le sample
    let pending = Buffer.alloc(0)

    mic.stdout.on('data', (chunk) => {
      if (on_audio) on_audio(chunk)
      pending = Buffer.concat([pending, chunk])

      // Feed the VAD in whole windows; leftover bytes wait for the next chunk.
      while (pending.length >= WIN_BYTES) {
        const win = pending.subarray(0, WIN_BYTES)
        pending = pending.subarray(WIN_BYTES)
        const f32 = s16le_to_f32(win)
        const base = this._history_pos % HISTORY_SAMPLES
        for (let i = 0; i < f32.length; i++) {
          this._history[(base + i) % HISTORY_SAMPLES] = f32[i]
        }
        this._history_pos += f32.length
        this._vad.acceptWaveform(f32)
      }

      while (!this._vad.isEmpty()) {
        const seg = this._vad.front()
        this._vad.pop()
        this._handle_segment(seg, on_text)
      }
    })

    mic.on('error', err => process.stderr.write(`[fw-stt] mic error: ${err.message}\n`))
    mic.on('close', code => {
      if (code !== null && code !== 0) {
        process.stderr.write(`[fw-stt] mic exited (${code})\n`)
      }
    })

    return () => { mic.kill(); this._server?.kill() }
  }
}
|