STT (Silero VAD + Whisper via sherpa-onnx), Chatterbox TTS HTTP server, query completeness classifier (Ollama), multi-voice demo scripts, and planning docs. Kept as reference; clean rewrite planned in separate repos. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
171 lines
5.2 KiB
JavaScript
171 lines
5.2 KiB
JavaScript
/**
 * Speech-to-Text using sherpa-onnx-node.
 *
 * Uses Whisper (offline/batch) as the recognizer, gated by Silero VAD so
 * transcription only runs when a complete utterance has been detected.
 * Audio is captured from the microphone via parec (PulseAudio) or arecord (ALSA).
 *
 * Model layout expected under models_dir:
 *   silero_vad.onnx
 *   sherpa-onnx-whisper-<name>/
 *     <name>-encoder.int8.onnx
 *     <name>-decoder.int8.onnx
 *     <name>-tokens.txt
 *
 * Available Whisper model names (pass as whisper_name):
 *   tiny.en  base.en  small.en  medium.en  large-v3  large-v3-turbo
 *
 * Download with: bash download-models.sh [model-name]
 */
|
|
|
|
import { spawn, execSync } from 'node:child_process'
import { createRequire } from 'node:module'
import * as path from 'node:path'
import { fileURLToPath } from 'node:url'
|
|
|
|
// An ES module has no built-in require(); create one anchored at this file so
// the native sherpa-onnx-node package can be loaded with require().
const require = createRequire(import.meta.url)

// Default model directory: ../models relative to this file.
// import.meta.dirname only exists on newer Node releases, so fall back to
// deriving the directory from import.meta.url when it is undefined.
const DEFAULT_MODELS_DIR = path.join(
  import.meta.dirname ?? path.dirname(fileURLToPath(import.meta.url)),
  '..',
  'models',
)

// Number of samples of audio prepended ahead of each detected speech segment.
const PRE_ROLL_SAMPLES = 3200 // 200ms at 16kHz
// Capacity of the history ring buffer the pre-roll is read from.
const HISTORY_SAMPLES = 960000 // 60s ring buffer — matches VAD internal size
|
|
|
|
/**
 * Pick the first available microphone capture command.
 *
 * Candidates are probed in order: parec (PulseAudio), then arecord (ALSA).
 * Both argument lists request raw signed 16-bit little-endian mono audio at
 * 16 kHz.
 *
 * @returns {[string, string[]]} The command name and its argument list.
 * @throws {Error} If neither command can be found.
 */
function find_mic() {
  const candidates = [
    ['parec', ['--format=s16le', '--rate=16000', '--channels=1', '--latency-msec=50']],
    ['arecord', ['-f', 'S16_LE', '-r', '16000', '-c', '1', '-t', 'raw', '-q']],
  ]
  for (const [cmd, args] of candidates) {
    try {
      // `command -v` is a POSIX shell builtin, so the probe also works on
      // minimal systems that do not ship a standalone `which` binary.
      execSync(`command -v ${cmd}`, { stdio: 'ignore' })
      return [cmd, args]
    } catch { /* not available — try the next candidate */ }
  }
  throw new Error('No mic capture command found — need parec (PulseAudio) or arecord (ALSA)')
}
|
|
|
|
/**
 * Convert raw signed 16-bit little-endian PCM bytes to float samples.
 *
 * Each sample is divided by 32768, so output values lie in [-1, 1).
 * A trailing odd byte (half a sample) is ignored.
 *
 * @param {Buffer|Uint8Array} buf - Raw s16le bytes.
 * @returns {Float32Array} One float per complete 16-bit sample.
 */
function s16le_to_f32(buf) {
  const count = Math.floor(buf.length / 2)
  // A DataView reads little-endian regardless of host byte order and works on
  // any byte view, including slices that start at a non-zero offset.
  const view = new DataView(buf.buffer, buf.byteOffset, buf.byteLength)
  const out = new Float32Array(count)
  for (let i = 0; i < count; i++) {
    out[i] = view.getInt16(i * 2, true) / 32768.0
  }
  return out
}
|
|
|
|
/**
 * Microphone speech-to-text.
 *
 * A Silero VAD splits the microphone stream into speech segments and each
 * finished segment is transcribed with an offline Whisper recognizer.
 * Construct, call init() once to load the models, then call listen().
 */
export class Stt {
  /**
   * Record model locations and allocate the pre-roll history buffer.
   * No models are loaded here; that happens in init().
   *
   * @param {object} [options]
   * @param {string} [options.models_dir] - Directory containing silero_vad.onnx and
   *   the sherpa-onnx-whisper-<name>/ directory. Defaults to DEFAULT_MODELS_DIR.
   * @param {string} [options.whisper_name='base.en'] - Whisper model name; selects
   *   the model directory and the file name prefix inside it.
   * @param {string} [options.provider='cuda'] - Provider passed to the Whisper
   *   recognizer's model config.
   */
  constructor({
    models_dir = DEFAULT_MODELS_DIR,
    whisper_name = 'base.en',
    provider = 'cuda',
  } = {}) {
    this._whisper_dir = path.join(models_dir, `sherpa-onnx-whisper-${whisper_name}`)
    this._whisper_name = whisper_name
    this._vad_model = path.join(models_dir, 'silero_vad.onnx')
    this._provider = provider
    this._recognizer = null
    this._vad = null
    this._history = new Float32Array(HISTORY_SAMPLES)
    this._history_pos = 0 // total samples fed, monotonically increasing
  }

  /**
   * Load sherpa-onnx-node and create the Whisper recognizer and the VAD.
   * Must be called before listen().
   */
  init() {
    const sherpa = require('sherpa-onnx-node')
    const { _whisper_dir: wdir, _whisper_name: wname, _vad_model, _provider } = this

    this._recognizer = new sherpa.OfflineRecognizer({
      featConfig: { sampleRate: 16000, featureDim: 80 },
      modelConfig: {
        whisper: {
          encoder: path.join(wdir, `${wname}-encoder.int8.onnx`),
          decoder: path.join(wdir, `${wname}-decoder.int8.onnx`),
        },
        tokens: path.join(wdir, `${wname}-tokens.txt`),
        numThreads: 2,
        provider: _provider,
        debug: false,
      },
    })

    // VAD runs on CPU — silero_vad.onnx is tiny
    this._vad = new sherpa.Vad({
      sileroVad: {
        model: _vad_model,
        threshold: 0.5,
        minSilenceDuration: 0.5, // seconds of silence to end an utterance
        minSpeechDuration: 0.1, // ignore sub-100ms blips
        windowSize: 512, // 32ms @ 16kHz
        maxSpeechDuration: 30.0,
      },
      sampleRate: 16000,
      numThreads: 1,
      provider: 'cpu',
      debug: false,
    }, 60) // 60-second internal ring buffer
  }

  /**
   * Run the recognizer over one block of audio.
   *
   * @param {Float32Array} samples - Mono float samples at 16 kHz.
   * @returns {string} The recognized text with surrounding whitespace trimmed.
   */
  _transcribe(samples) {
    const stream = this._recognizer.createStream()
    stream.acceptWaveform({ samples, sampleRate: 16000 })
    this._recognizer.decode(stream)
    return this._recognizer.getResult(stream).text.trim()
  }

  /**
   * Prepend up to PRE_ROLL_SAMPLES of earlier audio to a VAD segment.
   *
   * The extra audio is read from the history ring buffer at the positions
   * just before seg.start.
   * NOTE(review): assumes seg.start is a sample index on the same timeline as
   * _history_pos (samples fed to the VAD so far) — confirm against sherpa-onnx.
   *
   * @param {{start: number, samples: Float32Array}} seg - Segment from the VAD.
   * @returns {Float32Array} seg.samples itself when seg.start is 0, otherwise a
   *   new array holding the pre-roll followed by the segment samples.
   */
  _with_preroll(seg) {
    // Clamp at 0 so a segment near the start of the stream gets a shorter pre-roll.
    const pre_start = Math.max(0, seg.start - PRE_ROLL_SAMPLES)
    const pre_len = seg.start - pre_start
    if (pre_len === 0) return seg.samples
    const out = new Float32Array(pre_len + seg.samples.length)
    for (let i = 0; i < pre_len; i++) {
      out[i] = this._history[(pre_start + i) % HISTORY_SAMPLES]
    }
    out.set(seg.samples, pre_len)
    return out
  }

  /**
   * Start listening. Calls on_text(text) for each detected utterance.
   * Returns a stop() function.
   *
   * Transcription runs inline in the microphone data handler, and utterances
   * whose transcription is empty are not reported.
   *
   * @param {(text: string) => void} on_text - Receives each non-empty transcription.
   * @param {object} [options]
   * @param {() => void} [options.on_audio] - Called with no arguments for every
   *   chunk of data received from the microphone process.
   * @returns {() => boolean} Function that kills the microphone process.
   * @throws {Error} If init() has not been called, or if no capture command is found.
   */
  listen(on_text, { on_audio } = {}) {
    // Fail fast: without this check the mic process would be spawned and the
    // first data chunk would crash with a TypeError on the missing VAD.
    if (!this._recognizer || !this._vad) {
      throw new Error('Stt.listen() called before init()')
    }

    const [cmd, args] = find_mic()
    process.stderr.write(`[stt] mic: ${cmd} ${args.join(' ')}\n`)
    const mic = spawn(cmd, args, { stdio: ['ignore', 'pipe', 'inherit'] })

    const VAD_WIN = 512 // must match windowSize above
    // Bytes received from the mic that do not yet fill a whole VAD window.
    let pending = Buffer.alloc(0)

    mic.stdout.on('data', (chunk) => {
      if (on_audio) on_audio()
      pending = Buffer.concat([pending, chunk])

      // Feed complete VAD windows (2 bytes per s16le sample)
      while (pending.length >= VAD_WIN * 2) {
        const win = pending.subarray(0, VAD_WIN * 2)
        pending = pending.subarray(VAD_WIN * 2)
        const f32 = s16le_to_f32(win)
        // Write to history ring buffer for pre-roll
        const base = this._history_pos % HISTORY_SAMPLES
        for (let i = 0; i < f32.length; i++) {
          this._history[(base + i) % HISTORY_SAMPLES] = f32[i]
        }
        this._history_pos += f32.length
        this._vad.acceptWaveform(f32)
      }

      // Drain any complete speech segments
      while (!this._vad.isEmpty()) {
        const seg = this._vad.front()
        this._vad.pop()
        const text = this._transcribe(this._with_preroll(seg))
        if (text) on_text(text)
      }
    })

    mic.on('error', err => process.stderr.write(`[stt] mic error: ${err.message}\n`))
    mic.on('close', code => {
      // code is null when the process was ended by a signal (e.g. via stop()).
      if (code !== null && code !== 0) {
        process.stderr.write(`[stt] mic exited with code ${code}\n`)
      }
    })

    return () => mic.kill()
  }
}
|