#!/usr/bin/env -S bash -c 'exec "$(dirname "$0")/venv/bin/python3" "$0" "$@"' """ STT process: records audio, runs Silero VAD, transcribes with faster-whisper. Events (JSON lines on stdout): {"event": "ready"} {"event": "vad_start"} {"event": "vad_end", "duration": 1.23} {"event": "transcript", "text": "...", "words": [...], "duration": 1.23} {"event": "error", "message": "..."} word format: {"word": "hello", "start": 0.12, "end": 0.45, "probability": 0.99} All log/status messages go to stderr. Stdout is machine-readable events only. Usage: ./stt-server.py ./stt-server.py --model large-v3 --device cuda --compute-type int8_float16 """ import sys import json import signal import argparse import threading import queue import subprocess import traceback import numpy as np import torch SAMPLE_RATE = 16000 VAD_WINDOW = 512 # samples per VAD chunk (32ms at 16kHz) PRE_ROLL_SAMPLES = 3200 # 0.2s of audio prepended to each segment HISTORY_SAMPLES = 960000 # 60s ring buffer for pre-roll def log(msg): sys.stderr.write(f'[stt] {msg}\n') sys.stderr.flush() def emit(event): sys.stdout.write(json.dumps(event) + '\n') sys.stdout.flush() def find_mic(): candidates = [ ['parec', ['--format=s16le', '--rate=16000', '--channels=1', '--latency-msec=50']], ['arecord', ['-f', 'S16_LE', '-r', '16000', '-c', '1', '-t', 'raw', '-q']], ] for cmd, args in candidates: try: subprocess.run(['which', cmd], check=True, capture_output=True) return cmd, args except subprocess.CalledProcessError: pass raise RuntimeError('no mic capture command found — need parec or arecord') def s16le_to_f32(data): return np.frombuffer(data, dtype=np.int16).astype(np.float32) / 32768.0 parser = argparse.ArgumentParser() parser.add_argument('--model', default='base.en') parser.add_argument('--device', default='cuda') parser.add_argument('--compute-type', default='int8_float16') args = parser.parse_args() log(f'loading faster-whisper {args.model} ({args.device}, {args.compute_type})...') from faster_whisper import WhisperModel try: model = WhisperModel(args.model, device=args.device, compute_type=args.compute_type) log(f'model ready on {args.device}') except Exception as e: log(f'{args.device} failed ({e}), falling back to cpu') model = WhisperModel(args.model, device='cpu', compute_type='int8') log('model ready on cpu') log('loading silero VAD...') from silero_vad import load_silero_vad, VADIterator vad_model = load_silero_vad() vad = VADIterator(vad_model, sampling_rate=SAMPLE_RATE, threshold=0.5, min_silence_duration_ms=500) log('VAD ready') # Ring buffer for pre-roll context history = np.zeros(HISTORY_SAMPLES, dtype=np.float32) history_pos = 0 def push_history(samples): global history_pos n = len(samples) base = history_pos % HISTORY_SAMPLES # May wrap around — handle both cases space = HISTORY_SAMPLES - base if n <= space: history[base:base + n] = samples else: history[base:] = samples[:space] history[:n - space] = samples[space:] history_pos += n def get_preroll(): start = max(0, history_pos - PRE_ROLL_SAMPLES) count = history_pos - start out = np.empty(count, dtype=np.float32) for i in range(count): out[i] = history[(start + i) % HISTORY_SAMPLES] return out # Transcription runs in a separate thread so VAD is never blocked by GPU transcription_queue = queue.Queue() def transcription_worker(): while True: item = transcription_queue.get() if item is None: break samples, duration = item try: segments, _ = model.transcribe( samples, language='en', word_timestamps=True, vad_filter=False, ) text = '' words = [] for seg in segments: text += seg.text for w in (seg.words or []): words.append({ 'word': w.word, 'start': round(float(w.start), 4), 'end': round(float(w.end), 4), 'probability': round(float(w.probability), 4), }) log(f'transcript: {json.dumps(text.strip())} ({len(words)} words)') if text.strip(): emit({'event': 'transcript', 'text': text.strip(), 'words': words, 'duration': round(duration, 3)}) except Exception: msg = traceback.format_exc() log(f'transcription error:\n{msg}') emit({'event': 'error', 'message': msg}) finally: transcription_queue.task_done() threading.Thread(target=transcription_worker, daemon=True).start() # Main recording + VAD loop cmd, cmd_args = find_mic() log(f'mic: {cmd} {" ".join(cmd_args)}') mic = subprocess.Popen([cmd] + cmd_args, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) def shutdown(sig=None, frame=None): mic.terminate() transcription_queue.put(None) sys.exit(0) signal.signal(signal.SIGTERM, shutdown) signal.signal(signal.SIGINT, shutdown) emit({'event': 'ready'}) speech_samples = [] speech_start = None pending = b'' for chunk in mic.stdout: pending += chunk while len(pending) >= VAD_WINDOW * 2: raw = pending[:VAD_WINDOW * 2] pending = pending[VAD_WINDOW * 2:] f32 = s16le_to_f32(raw) push_history(f32) result = vad(torch.from_numpy(f32), return_seconds=True) if result is not None: if 'start' in result: speech_start = result['start'] speech_samples = [get_preroll()] log(f'VAD start at {speech_start:.2f}s') emit({'event': 'vad_start'}) elif 'end' in result and speech_start is not None: duration = result['end'] - speech_start log(f'VAD end at {result["end"]:.2f}s (duration {duration:.2f}s)') emit({'event': 'vad_end', 'duration': round(duration, 3)}) segment = np.concatenate(speech_samples) transcription_queue.put((segment, duration)) speech_samples = [] speech_start = None vad.reset_states() if speech_start is not None: speech_samples.append(f32)