From 3db7058646b8a4d9cdfc092ba60c762203f91a4a Mon Sep 17 00:00:00 2001 From: mikael-lovqvists-claude-agent Date: Sun, 7 Jun 2026 08:22:40 +0000 Subject: [PATCH 1/6] Add setup-venv.sh, clean up setup-venv-local-build.sh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit setup-venv.sh: simple PyPI install path — just pip install faster-whisper. Use this when the PyPI ctranslate2 wheel matches the system CUDA version. setup-venv-local-build.sh: - PYTHON_ENV env var for venv path override (consistent with tts-server) - HF_TOKEN_FILE env var instead of hardcoded path - HF_HUB_CACHE env var surfaced in output when set - Remove stray chmod on faster-whisper-server.py (not part of this repo) - Remove voice-experiment-specific "run with" message - Add python3 to tool prerequisite check - Arch Linux package suggestions extended to cover CUDA and python - Document why each script exists and when to use which Co-Authored-By: Claude Sonnet 4.6 --- setup-venv-local-build.sh | 42 +++++++++++++++++++++++++++++++-------- setup-venv.sh | 34 +++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 8 deletions(-) create mode 100755 setup-venv.sh diff --git a/setup-venv-local-build.sh b/setup-venv-local-build.sh index d86aa92..1c9f772 100755 --- a/setup-venv-local-build.sh +++ b/setup-venv-local-build.sh @@ -1,11 +1,33 @@ #!/usr/bin/env bash +# +# setup-venv-local-build.sh — builds ctranslate2 from source and installs faster-whisper. +# +# USE THIS SCRIPT when the PyPI ctranslate2 wheel does not match your CUDA version. +# The PyPI wheel targets a specific CUDA major version (e.g. CUDA 12). If your system +# has a newer version (e.g. CUDA 13), the wheel will fail at runtime because it tries +# to dlopen libcublas.so.12 which does not exist. Building from source compiles against +# your actual installed CUDA and links correctly. +# +# For systems where the PyPI wheel works (CUDA version matches), use setup-venv.sh +# instead — it is much faster and simpler. +# +# Environment overrides: +# PYTHON_ENV path to venv (default: ./venv) +# HF_TOKEN_FILE path to HuggingFace token file (default: ~/.secrets/hugging-face.token) +# HF_HUB_CACHE path to HuggingFace hub cache (default: ~/.cache/huggingface/hub) +# CUDA_HOME path to CUDA toolkit (auto-detected if not set) +# +# Arch Linux packages needed before running this script: +# sudo pacman -S cuda cmake git python +# (cuda is in the extra repo; if not found: yay -S cuda or check community/AUR) +# set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -VENV="${SCRIPT_DIR}/venv" +VENV="${PYTHON_ENV:-${SCRIPT_DIR}/venv}" BUILD_DIR="${SCRIPT_DIR}/build/ctranslate2" MODEL="${1:-base.en}" -TOKEN_FILE="${HOME}/.secrets/hugging-face.token" +TOKEN_FILE="${HF_TOKEN_FILE:-${HOME}/.secrets/hugging-face.token}" # Locate CUDA if [ -z "${CUDA_HOME:-}" ]; then @@ -18,14 +40,14 @@ if [ -z "${CUDA_HOME:-}" ]; then fi if [ -z "${CUDA_HOME:-}" ]; then - echo "ERROR: CUDA not found. Set CUDA_HOME manually." >&2 + echo "ERROR: CUDA not found. Set CUDA_HOME manually or install: sudo pacman -S cuda" >&2 exit 1 fi echo "==> CUDA: ${CUDA_HOME}" "${CUDA_HOME}/bin/nvcc" --version | head -1 -for tool in cmake git; do +for tool in cmake git python3; do if ! command -v "${tool}" &>/dev/null; then echo "ERROR: ${tool} not found — install with: sudo pacman -S ${tool}" >&2 exit 1 @@ -79,7 +101,8 @@ ls "${VENV}/lib/libctranslate2"* 2>/dev/null || { echo "ERROR: libctranslate2 no grep "WITH_CUDA" "${BUILD_DIR}/cmake-build/CMakeCache.txt" | grep -v "^#" || true # --- Python bindings --- -# Always reinstall from source to ensure we use our CUDA 13 build, not a PyPI wheel +# Always reinstall from source to ensure we use our locally built library, +# not a PyPI wheel that may have been pulled in as a dependency. echo "==> removing any existing ctranslate2 install..." "${VENV}/bin/pip" uninstall -y ctranslate2 2>/dev/null || true @@ -97,6 +120,7 @@ else echo "==> faster-whisper already installed, skipping" fi +# --- model download --- if [ -f "${TOKEN_FILE}" ]; then export HF_TOKEN="$(cat "${TOKEN_FILE}")" echo "==> HuggingFace token loaded from ${TOKEN_FILE}" @@ -104,6 +128,10 @@ else echo "==> no token found at ${TOKEN_FILE} — unauthenticated download" fi +if [ -n "${HF_HUB_CACHE:-}" ]; then + echo "==> HuggingFace cache: ${HF_HUB_CACHE}" +fi + echo "==> pre-downloading model: ${MODEL}" "${VENV}/bin/python3" - < done. Run with: node query-demo.mjs --stt faster-whisper" +echo "==> done. Venv ready at ${VENV}" diff --git a/setup-venv.sh b/setup-venv.sh new file mode 100755 index 0000000..b041654 --- /dev/null +++ b/setup-venv.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +# +# setup-venv.sh — installs faster-whisper from PyPI into a venv. +# +# USE THIS SCRIPT when the PyPI ctranslate2 wheel matches your CUDA version. +# PyPI wheels target a specific CUDA major version; if your system matches, +# this is the fastest way to get started — no compilation required. +# +# If you see errors like "libcublas.so.12: cannot open shared object file" at +# runtime, your CUDA version does not match the wheel. Use setup-venv-local-build.sh +# instead, which compiles ctranslate2 against your actual CUDA installation. +# +# Environment overrides: +# PYTHON_ENV path to venv (default: ./venv) +# +# Arch Linux packages needed before running this script: +# sudo pacman -S python cuda +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +VENV="${PYTHON_ENV:-${SCRIPT_DIR}/venv}" + +if [ ! -d "${VENV}" ]; then + echo "==> creating venv at ${VENV}" + python3 -m venv "${VENV}" +fi + +echo "==> installing faster-whisper" +"${VENV}/bin/pip" install --upgrade pip --quiet +"${VENV}/bin/pip" install faster-whisper + +echo "" +echo "==> done. Venv ready at ${VENV}" From 346c7c65857583a000a17d7f0273f7e6c5bbd752 Mon Sep 17 00:00:00 2001 From: mikael-lovqvists-claude-agent Date: Sun, 7 Jun 2026 08:24:43 +0000 Subject: [PATCH 2/6] Remove Arch Linux specific package suggestions Co-Authored-By: Claude Sonnet 4.6 --- setup-venv-local-build.sh | 8 ++------ setup-venv.sh | 3 --- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/setup-venv-local-build.sh b/setup-venv-local-build.sh index 1c9f772..84552f2 100755 --- a/setup-venv-local-build.sh +++ b/setup-venv-local-build.sh @@ -17,10 +17,6 @@ # HF_HUB_CACHE path to HuggingFace hub cache (default: ~/.cache/huggingface/hub) # CUDA_HOME path to CUDA toolkit (auto-detected if not set) # -# Arch Linux packages needed before running this script: -# sudo pacman -S cuda cmake git python -# (cuda is in the extra repo; if not found: yay -S cuda or check community/AUR) -# set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -40,7 +36,7 @@ if [ -z "${CUDA_HOME:-}" ]; then fi if [ -z "${CUDA_HOME:-}" ]; then - echo "ERROR: CUDA not found. Set CUDA_HOME manually or install: sudo pacman -S cuda" >&2 + echo "ERROR: CUDA not found. Set CUDA_HOME manually." >&2 exit 1 fi @@ -49,7 +45,7 @@ echo "==> CUDA: ${CUDA_HOME}" for tool in cmake git python3; do if ! command -v "${tool}" &>/dev/null; then - echo "ERROR: ${tool} not found — install with: sudo pacman -S ${tool}" >&2 + echo "ERROR: ${tool} not found" >&2 exit 1 fi done diff --git a/setup-venv.sh b/setup-venv.sh index b041654..6e9affe 100755 --- a/setup-venv.sh +++ b/setup-venv.sh @@ -13,9 +13,6 @@ # Environment overrides: # PYTHON_ENV path to venv (default: ./venv) # -# Arch Linux packages needed before running this script: -# sudo pacman -S python cuda -# set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" From bbde89a2cc29a92e5e076ec0871850af9d43ba64 Mon Sep 17 00:00:00 2001 From: mikael-lovqvists-claude-agent Date: Sun, 7 Jun 2026 08:28:42 +0000 Subject: [PATCH 3/6] Fix missing faster-whisper deps when using local ctranslate2 build --no-deps skipped av and other required packages. Fix by installing faster-whisper normally first (satisfies all deps, pulls PyPI ctranslate2), then immediately overriding ctranslate2 with the source-built version. Co-Authored-By: Claude Sonnet 4.6 --- setup-venv-local-build.sh | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/setup-venv-local-build.sh b/setup-venv-local-build.sh index 84552f2..7354f27 100755 --- a/setup-venv-local-build.sh +++ b/setup-venv-local-build.sh @@ -96,26 +96,29 @@ ls "${VENV}/include/ctranslate2/" | head -3 ls "${VENV}/lib/libctranslate2"* 2>/dev/null || { echo "ERROR: libctranslate2 not found in venv/lib" >&2; exit 1; } grep "WITH_CUDA" "${BUILD_DIR}/cmake-build/CMakeCache.txt" | grep -v "^#" || true -# --- Python bindings --- -# Always reinstall from source to ensure we use our locally built library, -# not a PyPI wheel that may have been pulled in as a dependency. -echo "==> removing any existing ctranslate2 install..." +# --- faster-whisper (with all deps, including PyPI ctranslate2) --- +# Install faster-whisper normally so all its dependencies (av, huggingface_hub, etc.) +# are satisfied. This will pull in the PyPI ctranslate2 wheel, which we override next. +if ! "${VENV}/bin/python3" -c "import faster_whisper" &>/dev/null 2>&1; then + echo "==> installing faster-whisper" + "${VENV}/bin/pip" install faster-whisper +else + echo "==> faster-whisper already installed, skipping" +fi + +# --- Python bindings (always reinstalled from source) --- +# Override the PyPI ctranslate2 wheel pulled in above with our source-built version. +# This is the whole point of this script: the PyPI wheel links against a fixed CUDA +# major version (e.g. libcublas.so.12) while our build links against the system version. +echo "==> removing PyPI ctranslate2..." "${VENV}/bin/pip" uninstall -y ctranslate2 2>/dev/null || true -echo "==> building ctranslate2 Python bindings from source..." +echo "==> installing source-built ctranslate2 Python bindings..." CT2_ROOT="${VENV}" \ LIBRARY_PATH="${VENV}/lib:${VENV}/lib64${LIBRARY_PATH:+:${LIBRARY_PATH}}" \ LDFLAGS="-Wl,-rpath,${VENV}/lib" \ "${VENV}/bin/pip" install "${BUILD_DIR}/src/python" --no-build-isolation -# --- faster-whisper --- -if ! "${VENV}/bin/python3" -c "import faster_whisper" &>/dev/null 2>&1; then - echo "==> installing faster-whisper" - "${VENV}/bin/pip" install faster-whisper --no-deps -else - echo "==> faster-whisper already installed, skipping" -fi - # --- model download --- if [ -f "${TOKEN_FILE}" ]; then export HF_TOKEN="$(cat "${TOKEN_FILE}")" From 2af47373c4270c86aea297bc3fc25d1847d1805f Mon Sep 17 00:00:00 2001 From: mikael-lovqvists-claude-agent Date: Sun, 7 Jun 2026 08:41:45 +0000 Subject: [PATCH 4/6] Add stt-server.py: self-contained recording + VAD + transcription process MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the old stdin/stdout transcription-only server. Now handles the full pipeline in Python: - Launches parec or arecord for mic capture - Runs Silero VAD (via silero-vad, already a faster-whisper dep — no sherpa-onnx needed) - Pre-roll ring buffer (0.2s) prepended to each segment for context - Transcribes with faster-whisper in a separate thread (GPU not blocking VAD) - Emits JSON line events to stdout: ready, vad_start, vad_end, transcript, error Event protocol is designed to map directly to WebSocket subscriptions later. Co-Authored-By: Claude Sonnet 4.6 --- .gitignore | 3 +- stt-server.py | 205 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 207 insertions(+), 1 deletion(-) create mode 100755 stt-server.py diff --git a/.gitignore b/.gitignore index eba74f4..1273e33 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -venv/ \ No newline at end of file +venv/ +build/ \ No newline at end of file diff --git a/stt-server.py b/stt-server.py new file mode 100755 index 0000000..8e7041a --- /dev/null +++ b/stt-server.py @@ -0,0 +1,205 @@ +#!/usr/bin/env -S bash -c 'exec "$(dirname "$0")/venv/bin/python3" "$0" "$@"' +""" +STT process: records audio, runs Silero VAD, transcribes with faster-whisper. + +Events (JSON lines on stdout): + {"event": "ready"} + {"event": "vad_start"} + {"event": "vad_end", "duration": 1.23} + {"event": "transcript", "text": "...", "words": [...], "duration": 1.23} + {"event": "error", "message": "..."} + +word format: {"word": "hello", "start": 0.12, "end": 0.45, "probability": 0.99} + +All log/status messages go to stderr. Stdout is machine-readable events only. + +Usage: + ./stt-server.py + ./stt-server.py --model large-v3 --device cuda --compute-type int8_float16 +""" + +import sys +import json +import signal +import argparse +import threading +import queue +import subprocess +import traceback +import numpy as np +import torch + +SAMPLE_RATE = 16000 +VAD_WINDOW = 512 # samples per VAD chunk (32ms at 16kHz) +PRE_ROLL_SAMPLES = 3200 # 0.2s of audio prepended to each segment +HISTORY_SAMPLES = 960000 # 60s ring buffer for pre-roll + + +def log(msg): + sys.stderr.write(f'[stt] {msg}\n') + sys.stderr.flush() + + +def emit(event): + sys.stdout.write(json.dumps(event) + '\n') + sys.stdout.flush() + + +def find_mic(): + candidates = [ + ['parec', ['--format=s16le', '--rate=16000', '--channels=1', '--latency-msec=50']], + ['arecord', ['-f', 'S16_LE', '-r', '16000', '-c', '1', '-t', 'raw', '-q']], + ] + for cmd, args in candidates: + try: + subprocess.run(['which', cmd], check=True, capture_output=True) + return cmd, args + except subprocess.CalledProcessError: + pass + raise RuntimeError('no mic capture command found — need parec or arecord') + + +def s16le_to_f32(data): + return np.frombuffer(data, dtype=np.int16).astype(np.float32) / 32768.0 + + +parser = argparse.ArgumentParser() +parser.add_argument('--model', default='base.en') +parser.add_argument('--device', default='cuda') +parser.add_argument('--compute-type', default='int8_float16') +args = parser.parse_args() + +log(f'loading faster-whisper {args.model} ({args.device}, {args.compute_type})...') +from faster_whisper import WhisperModel +try: + model = WhisperModel(args.model, device=args.device, compute_type=args.compute_type) + log(f'model ready on {args.device}') +except Exception as e: + log(f'{args.device} failed ({e}), falling back to cpu') + model = WhisperModel(args.model, device='cpu', compute_type='int8') + log('model ready on cpu') + +log('loading silero VAD...') +from silero_vad import load_silero_vad, VADIterator +vad_model = load_silero_vad() +vad = VADIterator(vad_model, sampling_rate=SAMPLE_RATE, + threshold=0.5, min_silence_duration_ms=500) +log('VAD ready') + + +# Ring buffer for pre-roll context +history = np.zeros(HISTORY_SAMPLES, dtype=np.float32) +history_pos = 0 + +def push_history(samples): + global history_pos + n = len(samples) + base = history_pos % HISTORY_SAMPLES + # May wrap around — handle both cases + space = HISTORY_SAMPLES - base + if n <= space: + history[base:base + n] = samples + else: + history[base:] = samples[:space] + history[:n - space] = samples[space:] + history_pos += n + +def get_preroll(): + start = max(0, history_pos - PRE_ROLL_SAMPLES) + count = history_pos - start + out = np.empty(count, dtype=np.float32) + for i in range(count): + out[i] = history[(start + i) % HISTORY_SAMPLES] + return out + + +# Transcription runs in a separate thread so VAD is never blocked by GPU +transcription_queue = queue.Queue() + +def transcription_worker(): + while True: + item = transcription_queue.get() + if item is None: + break + samples, duration = item + try: + segments, _ = model.transcribe( + samples, + language='en', + word_timestamps=True, + vad_filter=False, + ) + text = '' + words = [] + for seg in segments: + text += seg.text + for w in (seg.words or []): + words.append({ + 'word': w.word, + 'start': round(float(w.start), 4), + 'end': round(float(w.end), 4), + 'probability': round(float(w.probability), 4), + }) + log(f'transcript: {json.dumps(text.strip())} ({len(words)} words)') + if text.strip(): + emit({'event': 'transcript', 'text': text.strip(), 'words': words, 'duration': round(duration, 3)}) + except Exception: + msg = traceback.format_exc() + log(f'transcription error:\n{msg}') + emit({'event': 'error', 'message': msg}) + finally: + transcription_queue.task_done() + + +threading.Thread(target=transcription_worker, daemon=True).start() + + +# Main recording + VAD loop +cmd, cmd_args = find_mic() +log(f'mic: {cmd} {" ".join(cmd_args)}') +mic = subprocess.Popen([cmd] + cmd_args, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) + +def shutdown(sig=None, frame=None): + mic.terminate() + transcription_queue.put(None) + sys.exit(0) + +signal.signal(signal.SIGTERM, shutdown) +signal.signal(signal.SIGINT, shutdown) + +emit({'event': 'ready'}) + +speech_samples = [] +speech_start = None +pending = b'' + +for chunk in mic.stdout: + pending += chunk + while len(pending) >= VAD_WINDOW * 2: + raw = pending[:VAD_WINDOW * 2] + pending = pending[VAD_WINDOW * 2:] + + f32 = s16le_to_f32(raw) + push_history(f32) + + result = vad(torch.from_numpy(f32), return_seconds=True) + + if result is not None: + if 'start' in result: + speech_start = result['start'] + speech_samples = [get_preroll()] + log(f'VAD start at {speech_start:.2f}s') + emit({'event': 'vad_start'}) + + elif 'end' in result and speech_start is not None: + duration = result['end'] - speech_start + log(f'VAD end at {result["end"]:.2f}s (duration {duration:.2f}s)') + emit({'event': 'vad_end', 'duration': round(duration, 3)}) + segment = np.concatenate(speech_samples) + transcription_queue.put((segment, duration)) + speech_samples = [] + speech_start = None + vad.reset_states() + + if speech_start is not None: + speech_samples.append(f32) From c0a72679f8cfc886556155d0fae3f52414c139c2 Mon Sep 17 00:00:00 2001 From: mikael-lovqvists-claude-agent Date: Sun, 7 Jun 2026 08:47:03 +0000 Subject: [PATCH 5/6] Add torch to both venv setup scripts Co-Authored-By: Claude Sonnet 4.6 --- setup-venv-local-build.sh | 1 + setup-venv.sh | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/setup-venv-local-build.sh b/setup-venv-local-build.sh index 7354f27..1929759 100755 --- a/setup-venv-local-build.sh +++ b/setup-venv-local-build.sh @@ -57,6 +57,7 @@ fi echo "==> upgrading pip + build tools" "${VENV}/bin/pip" install --upgrade pip wheel setuptools pybind11 --quiet +"${VENV}/bin/pip" install torch # --- clone (skipped if already done) --- if [ ! -d "${BUILD_DIR}/src/.git" ]; then diff --git a/setup-venv.sh b/setup-venv.sh index 6e9affe..19b288c 100755 --- a/setup-venv.sh +++ b/setup-venv.sh @@ -23,9 +23,9 @@ if [ ! -d "${VENV}" ]; then python3 -m venv "${VENV}" fi -echo "==> installing faster-whisper" +echo "==> installing torch and faster-whisper" "${VENV}/bin/pip" install --upgrade pip --quiet -"${VENV}/bin/pip" install faster-whisper +"${VENV}/bin/pip" install torch faster-whisper echo "" echo "==> done. Venv ready at ${VENV}" From 01210e878f45e4dafdb2192b69c7a1767e09c055 Mon Sep 17 00:00:00 2001 From: mikael-lovqvists-claude-agent Date: Sun, 7 Jun 2026 08:48:59 +0000 Subject: [PATCH 6/6] Add silero-vad to both venv setup scripts Co-Authored-By: Claude Sonnet 4.6 --- setup-venv-local-build.sh | 2 +- setup-venv.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup-venv-local-build.sh b/setup-venv-local-build.sh index 1929759..7ccbbc0 100755 --- a/setup-venv-local-build.sh +++ b/setup-venv-local-build.sh @@ -57,7 +57,7 @@ fi echo "==> upgrading pip + build tools" "${VENV}/bin/pip" install --upgrade pip wheel setuptools pybind11 --quiet -"${VENV}/bin/pip" install torch +"${VENV}/bin/pip" install torch silero-vad # --- clone (skipped if already done) --- if [ ! -d "${BUILD_DIR}/src/.git" ]; then diff --git a/setup-venv.sh b/setup-venv.sh index 19b288c..c017c9d 100755 --- a/setup-venv.sh +++ b/setup-venv.sh @@ -25,7 +25,7 @@ fi echo "==> installing torch and faster-whisper" "${VENV}/bin/pip" install --upgrade pip --quiet -"${VENV}/bin/pip" install torch faster-whisper +"${VENV}/bin/pip" install torch faster-whisper silero-vad echo "" echo "==> done. Venv ready at ${VENV}"