From 18404708e3f16a7d092171f8fc2f2d95bb2e95a8 Mon Sep 17 00:00:00 2001 From: mikael-lovqvists-claude-agent Date: Sun, 7 Jun 2026 08:53:54 +0000 Subject: [PATCH 01/13] Add WebSocket broadcast to stt-server.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every connection receives the full event stream (vad_start, vad_end, transcript, error) from the moment it connects — no subscription handshake required. The asyncio WebSocket server runs in a daemon thread alongside the VAD loop and transcription thread. Events still go to stdout unchanged. Port is configurable via STT_PORT env var (default: 11501). Add websockets to both setup scripts. Co-Authored-By: Claude Sonnet 4.6 --- setup-venv-local-build.sh | 2 +- setup-venv.sh | 2 +- stt-server.py | 96 ++++++++++++++++++++++++++++++--------- 3 files changed, 77 insertions(+), 23 deletions(-) diff --git a/setup-venv-local-build.sh b/setup-venv-local-build.sh index 7ccbbc0..e2dcf1d 100755 --- a/setup-venv-local-build.sh +++ b/setup-venv-local-build.sh @@ -57,7 +57,7 @@ fi echo "==> upgrading pip + build tools" "${VENV}/bin/pip" install --upgrade pip wheel setuptools pybind11 --quiet -"${VENV}/bin/pip" install torch silero-vad +"${VENV}/bin/pip" install torch silero-vad websockets # --- clone (skipped if already done) --- if [ ! -d "${BUILD_DIR}/src/.git" ]; then diff --git a/setup-venv.sh b/setup-venv.sh index c017c9d..356b6cc 100755 --- a/setup-venv.sh +++ b/setup-venv.sh @@ -25,7 +25,7 @@ fi echo "==> installing torch and faster-whisper" "${VENV}/bin/pip" install --upgrade pip --quiet -"${VENV}/bin/pip" install torch faster-whisper silero-vad +"${VENV}/bin/pip" install torch faster-whisper silero-vad websockets echo "" echo "==> done. 
Venv ready at ${VENV}" diff --git a/stt-server.py b/stt-server.py index 8e7041a..38b994b 100755 --- a/stt-server.py +++ b/stt-server.py @@ -1,18 +1,25 @@ #!/usr/bin/env -S bash -c 'exec "$(dirname "$0")/venv/bin/python3" "$0" "$@"' """ -STT process: records audio, runs Silero VAD, transcribes with faster-whisper. +STT server: records audio, runs Silero VAD, transcribes with faster-whisper. +Broadcasts JSON events to all connected WebSocket clients and to stdout. -Events (JSON lines on stdout): +Events: {"event": "ready"} {"event": "vad_start"} - {"event": "vad_end", "duration": 1.23} - {"event": "transcript", "text": "...", "words": [...], "duration": 1.23} - {"event": "error", "message": "..."} + {"event": "vad_end", "duration": 1.23} + {"event": "transcript", "text": "...", "words": [...], "duration": 1.23} + {"event": "error", "message": "..."} word format: {"word": "hello", "start": 0.12, "end": 0.45, "probability": 0.99} +Every WebSocket connection receives the full event stream from the moment it +connects — no subscription handshake required. + All log/status messages go to stderr. Stdout is machine-readable events only. 
+Environment: + STT_PORT WebSocket port (default: 11501) + Usage: ./stt-server.py ./stt-server.py --model large-v3 --device cuda --compute-type int8_float16 @@ -26,13 +33,16 @@ import threading import queue import subprocess import traceback +import asyncio +import websockets import numpy as np import torch -SAMPLE_RATE = 16000 -VAD_WINDOW = 512 # samples per VAD chunk (32ms at 16kHz) -PRE_ROLL_SAMPLES = 3200 # 0.2s of audio prepended to each segment -HISTORY_SAMPLES = 960000 # 60s ring buffer for pre-roll +SAMPLE_RATE = 16000 +VAD_WINDOW = 512 # samples per VAD chunk (32ms at 16kHz) +PRE_ROLL_SAMPLES = 3200 # 0.2s prepended to each segment for context +HISTORY_SAMPLES = 960000 # 60s ring buffer for pre-roll +PORT = int(__import__('os').environ.get('STT_PORT', 11501)) def log(msg): @@ -40,10 +50,49 @@ def log(msg): sys.stderr.flush() -def emit(event): - sys.stdout.write(json.dumps(event) + '\n') - sys.stdout.flush() +# --- WebSocket broadcast --- +_ws_loop = None +_ws_clients = set() # set of asyncio.Queue, one per connection + + +def emit(event): + line = json.dumps(event) + sys.stdout.write(line + '\n') + sys.stdout.flush() + if _ws_loop is not None: + for q in list(_ws_clients): + _ws_loop.call_soon_threadsafe(q.put_nowait, line) + + +async def ws_handler(websocket): + q = asyncio.Queue() + _ws_clients.add(q) + log(f'client connected ({len(_ws_clients)} total)') + try: + while True: + msg = await q.get() + await websocket.send(msg) + except websockets.ConnectionClosed: + pass + finally: + _ws_clients.discard(q) + log(f'client disconnected ({len(_ws_clients)} remaining)') + + +async def ws_main(): + global _ws_loop + _ws_loop = asyncio.get_running_loop() + async with websockets.serve(ws_handler, '', PORT): + log(f'WebSocket listening on port {PORT}') + await asyncio.Future() # run forever + + +def start_ws_server(): + asyncio.run(ws_main()) + + +# --- Mic --- def find_mic(): candidates = [ @@ -63,6 +112,8 @@ def s16le_to_f32(data): return np.frombuffer(data, 
dtype=np.int16).astype(np.float32) / 32768.0 +# --- Args + model loading --- + parser = argparse.ArgumentParser() parser.add_argument('--model', default='base.en') parser.add_argument('--device', default='cuda') @@ -82,25 +133,25 @@ except Exception as e: log('loading silero VAD...') from silero_vad import load_silero_vad, VADIterator vad_model = load_silero_vad() -vad = VADIterator(vad_model, sampling_rate=SAMPLE_RATE, - threshold=0.5, min_silence_duration_ms=500) +vad = VADIterator(vad_model, sampling_rate=SAMPLE_RATE, + threshold=0.5, min_silence_duration_ms=500) log('VAD ready') -# Ring buffer for pre-roll context +# --- Pre-roll ring buffer --- + history = np.zeros(HISTORY_SAMPLES, dtype=np.float32) history_pos = 0 def push_history(samples): global history_pos - n = len(samples) - base = history_pos % HISTORY_SAMPLES - # May wrap around — handle both cases + n = len(samples) + base = history_pos % HISTORY_SAMPLES space = HISTORY_SAMPLES - base if n <= space: history[base:base + n] = samples else: - history[base:] = samples[:space] + history[base:] = samples[:space] history[:n - space] = samples[space:] history_pos += n @@ -113,7 +164,8 @@ def get_preroll(): return out -# Transcription runs in a separate thread so VAD is never blocked by GPU +# --- Transcription thread --- + transcription_queue = queue.Queue() def transcription_worker(): @@ -152,9 +204,11 @@ def transcription_worker(): threading.Thread(target=transcription_worker, daemon=True).start() +threading.Thread(target=start_ws_server, daemon=True).start() -# Main recording + VAD loop +# --- Main recording + VAD loop --- + cmd, cmd_args = find_mic() log(f'mic: {cmd} {" ".join(cmd_args)}') mic = subprocess.Popen([cmd] + cmd_args, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) From f18330608d87e4148d5f30c7d283dd92adffdb25 Mon Sep 17 00:00:00 2001 From: mikael-lovqvists-claude-agent Date: Sun, 7 Jun 2026 08:56:58 +0000 Subject: [PATCH 02/13] Add --verbose flag; suppress info logging by default Errors 
always go to stderr. Info logs (startup, VAD events, transcripts) only appear with --verbose / -v, keeping stderr clean when running as a system service. Co-Authored-By: Claude Sonnet 4.6 --- stt-server.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/stt-server.py b/stt-server.py index 38b994b..3c47185 100755 --- a/stt-server.py +++ b/stt-server.py @@ -16,13 +16,15 @@ Every WebSocket connection receives the full event stream from the moment it connects — no subscription handshake required. All log/status messages go to stderr. Stdout is machine-readable events only. +Pass --verbose to enable info logging (startup, VAD events, transcripts). +Errors always go to stderr regardless of verbosity. Environment: STT_PORT WebSocket port (default: 11501) Usage: ./stt-server.py - ./stt-server.py --model large-v3 --device cuda --compute-type int8_float16 + ./stt-server.py --model large-v3 --device cuda --compute-type int8_float16 --verbose """ import sys @@ -45,9 +47,10 @@ HISTORY_SAMPLES = 960000 # 60s ring buffer for pre-roll PORT = int(__import__('os').environ.get('STT_PORT', 11501)) -def log(msg): - sys.stderr.write(f'[stt] {msg}\n') - sys.stderr.flush() +def log(msg, error=False): + if error or verbose: + sys.stderr.write(f'[stt] {msg}\n') + sys.stderr.flush() # --- WebSocket broadcast --- @@ -118,7 +121,9 @@ parser = argparse.ArgumentParser() parser.add_argument('--model', default='base.en') parser.add_argument('--device', default='cuda') parser.add_argument('--compute-type', default='int8_float16') -args = parser.parse_args() +parser.add_argument('--verbose', '-v', action='store_true') +args = parser.parse_args() +verbose = args.verbose log(f'loading faster-whisper {args.model} ({args.device}, {args.compute_type})...') from faster_whisper import WhisperModel @@ -126,7 +131,7 @@ try: model = WhisperModel(args.model, device=args.device, compute_type=args.compute_type) log(f'model ready on {args.device}') except Exception as e: - 
log(f'{args.device} failed ({e}), falling back to cpu') + log(f'{args.device} failed ({e}), falling back to cpu', error=True) model = WhisperModel(args.model, device='cpu', compute_type='int8') log('model ready on cpu') @@ -197,7 +202,7 @@ def transcription_worker(): emit({'event': 'transcript', 'text': text.strip(), 'words': words, 'duration': round(duration, 3)}) except Exception: msg = traceback.format_exc() - log(f'transcription error:\n{msg}') + log(f'transcription error:\n{msg}', error=True) emit({'event': 'error', 'message': msg}) finally: transcription_queue.task_done() From aad1bda3bf17443a487aed353d943dfb1e53aa8c Mon Sep 17 00:00:00 2001 From: mikael-lovqvists-claude-agent Date: Sun, 7 Jun 2026 08:58:29 +0000 Subject: [PATCH 03/13] =?UTF-8?q?Remove=20stdout=20event=20output=20?= =?UTF-8?q?=E2=80=94=20WebSocket=20is=20the=20sole=20event=20channel?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- stt-server.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/stt-server.py b/stt-server.py index 3c47185..d83c885 100755 --- a/stt-server.py +++ b/stt-server.py @@ -15,8 +15,8 @@ word format: {"word": "hello", "start": 0.12, "end": 0.45, "probability": 0.99} Every WebSocket connection receives the full event stream from the moment it connects — no subscription handshake required. -All log/status messages go to stderr. Stdout is machine-readable events only. -Pass --verbose to enable info logging (startup, VAD events, transcripts). +Machine-readable events are sent over WebSocket only. +Pass --verbose to enable logging to stderr (startup, VAD events, transcripts). Errors always go to stderr regardless of verbosity. 
Environment: @@ -61,8 +61,6 @@ _ws_clients = set() # set of asyncio.Queue, one per connection def emit(event): line = json.dumps(event) - sys.stdout.write(line + '\n') - sys.stdout.flush() if _ws_loop is not None: for q in list(_ws_clients): _ws_loop.call_soon_threadsafe(q.put_nowait, line) From 6bbc04dde749ceb6bfa9efe805342ab83a8a3e89 Mon Sep 17 00:00:00 2001 From: mikael-lovqvists-claude-agent Date: Sun, 7 Jun 2026 08:59:38 +0000 Subject: [PATCH 04/13] Add Node.js WebSocket example scripts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit listen.mjs: prints all events as JSON objects. transcripts.mjs: prints transcript text only. Both use Node 21+ built-in WebSocket — no libraries required. Co-Authored-By: Claude Sonnet 4.6 --- examples/listen.mjs | 24 ++++++++++++++++++++++++ examples/transcripts.mjs | 19 +++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 examples/listen.mjs create mode 100644 examples/transcripts.mjs diff --git a/examples/listen.mjs b/examples/listen.mjs new file mode 100644 index 0000000..58cae0f --- /dev/null +++ b/examples/listen.mjs @@ -0,0 +1,24 @@ +// Connect to the STT server and print all events. +// Usage: node listen.mjs + +const PORT = process.env.STT_PORT ?? '11501' +const ws = new WebSocket(`ws://localhost:${PORT}`) + +ws.addEventListener('open', () => { + process.stderr.write(`connected to ws://localhost:${PORT}\n`) +}) + +ws.addEventListener('message', ({ data }) => { + const event = JSON.parse(data) + console.log(event) +}) + +ws.addEventListener('close', () => { + process.stderr.write('disconnected\n') + process.exit(0) +}) + +ws.addEventListener('error', (err) => { + process.stderr.write(`error: ${err.message}\n`) + process.exit(1) +}) diff --git a/examples/transcripts.mjs b/examples/transcripts.mjs new file mode 100644 index 0000000..b5d404c --- /dev/null +++ b/examples/transcripts.mjs @@ -0,0 +1,19 @@ +// Connect to the STT server and print transcript text only. 
+// Usage: node transcripts.mjs + +const PORT = process.env.STT_PORT ?? '11501' +const ws = new WebSocket(`ws://localhost:${PORT}`) + +ws.addEventListener('open', () => { + process.stderr.write(`connected to ws://localhost:${PORT}\n`) +}) + +ws.addEventListener('message', ({ data }) => { + const event = JSON.parse(data) + if (event.event === 'transcript') { + console.log(event.text) + } +}) + +ws.addEventListener('close', () => process.exit(0)) +ws.addEventListener('error', () => process.exit(1)) From 218687b039d64c0a8db28bfa5eb84f2c267e5bdd Mon Sep 17 00:00:00 2001 From: mikael-lovqvists-claude-agent Date: Sun, 7 Jun 2026 09:07:13 +0000 Subject: [PATCH 05/13] Log to stderr when model needs to be downloaded Checks cache first with local_files_only=True; if the model isn't present logs "downloading model ..." to stderr before WhisperModel triggers the actual download. Co-Authored-By: Claude Sonnet 4.6 --- stt-server.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/stt-server.py b/stt-server.py index d83c885..edfa472 100755 --- a/stt-server.py +++ b/stt-server.py @@ -123,8 +123,15 @@ parser.add_argument('--verbose', '-v', action='store_true') args = parser.parse_args() verbose = args.verbose -log(f'loading faster-whisper {args.model} ({args.device}, {args.compute_type})...') from faster_whisper import WhisperModel +from huggingface_hub import snapshot_download + +try: + snapshot_download(f'Systran/faster-whisper-{args.model}', local_files_only=True) +except Exception: + log(f'downloading model {args.model}...', error=True) + +log(f'loading faster-whisper {args.model} ({args.device}, {args.compute_type})...') try: model = WhisperModel(args.model, device=args.device, compute_type=args.compute_type) log(f'model ready on {args.device}') From 7b03deddb5408784b54e91783d2ba9730d7f2564 Mon Sep 17 00:00:00 2001 From: mikael-lovqvists-claude-agent Date: Sun, 7 Jun 2026 09:09:03 +0000 Subject: [PATCH 06/13] Gate download log message behind --verbose 
like everything else Co-Authored-By: Claude Sonnet 4.6 --- stt-server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stt-server.py b/stt-server.py index edfa472..31ca094 100755 --- a/stt-server.py +++ b/stt-server.py @@ -129,7 +129,7 @@ from huggingface_hub import snapshot_download try: snapshot_download(f'Systran/faster-whisper-{args.model}', local_files_only=True) except Exception: - log(f'downloading model {args.model}...', error=True) + log(f'downloading model {args.model}...') log(f'loading faster-whisper {args.model} ({args.device}, {args.compute_type})...') try: From 9030b1315da91412cb4da301d49c1e13faa78b33 Mon Sep 17 00:00:00 2001 From: mikael-lovqvists-claude-agent Date: Sun, 7 Jun 2026 09:10:54 +0000 Subject: [PATCH 07/13] Load HF_TOKEN from token file at startup (consistent with tts-server) Reads ~/.secrets/hugging-face.token by default, overridable via HF_TOKEN_FILE. Co-Authored-By: Claude Sonnet 4.6 --- stt-server.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/stt-server.py b/stt-server.py index 31ca094..e08d226 100755 --- a/stt-server.py +++ b/stt-server.py @@ -27,6 +27,7 @@ Usage: ./stt-server.py --model large-v3 --device cuda --compute-type int8_float16 --verbose """ +import os import sys import json import signal @@ -123,6 +124,13 @@ parser.add_argument('--verbose', '-v', action='store_true') args = parser.parse_args() verbose = args.verbose +token_file = os.environ.get('HF_TOKEN_FILE', os.path.expanduser('~/.secrets/hugging-face.token')) +try: + with open(token_file) as f: + os.environ['HF_TOKEN'] = f.read().strip() +except FileNotFoundError: + pass + from faster_whisper import WhisperModel from huggingface_hub import snapshot_download From be1efd9edb0bfb4015a2b32be12c700a3990e731 Mon Sep 17 00:00:00 2001 From: mikael-lovqvists-claude-agent Date: Sun, 7 Jun 2026 09:11:39 +0000 Subject: [PATCH 08/13] Add model selection and compute type sections to README Co-Authored-By: Claude Sonnet 4.6 --- README.md | 29 
++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c28d7d6..63270fc 100644 --- a/README.md +++ b/README.md @@ -15,4 +15,31 @@ This project started as a [vibe-coded](https://en.wikipedia.org/wiki/Vibe_coding ### Setup [venv](https://docs.python.org/3/library/venv.html) for [python](https://www.python.org/) -We will have two different setups here depending on if you want to build ctranslate2 locally or not. This shall be documented. \ No newline at end of file +We will have two different setups here depending on if you want to build ctranslate2 locally or not. This shall be documented. + + +## Model selection + +Pass `--model <name>` to `stt-server.py`. Models are downloaded automatically from HuggingFace on first use. + +| Model | VRAM | Quality | Notes | +|-------|------|---------|-------| +| `base.en` | ~1 GB | Low | Default. Fast, but struggles with similar-sounding consonants (V/B/D). | +| `small.en` | ~2 GB | Medium | Noticeable improvement over base for most speech. | +| `medium.en` | ~5 GB | Good | Recommended starting point for production use. | +| `large-v3` | ~10 GB | Best | Highest accuracy, use if VRAM allows. | + +English-only models (`.en` suffix) are faster and more accurate than multilingual models for English speech. + + +## Compute type + +Pass `--compute-type <type>` to control the numeric precision used during inference. + +| Type | Notes | +|------|-------| +| `int8_float16` | Default. Good balance of speed and accuracy on modern GPUs. | +| `float16` | Slightly better accuracy, higher VRAM usage. | +| `int8` | CPU-friendly, lower quality. | + +If you see a CUDA error about mismatched library versions at startup, use `setup-venv-local-build.sh` to build ctranslate2 against your system CUDA version rather than using the PyPI wheel. 
\ No newline at end of file From dd6e74a7a895c09d02a43e56447212c7fdd82d95 Mon Sep 17 00:00:00 2001 From: mikael-lovqvists-claude-agent Date: Sun, 7 Jun 2026 09:14:00 +0000 Subject: [PATCH 09/13] =?UTF-8?q?Fix=20large-v3=20VRAM=20estimate=20?= =?UTF-8?q?=E2=80=94=20~5GB=20with=20float16,=20not=20~10GB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 63270fc..3d7b8d4 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ Pass `--model ` to `stt-server.py`. Models are downloaded automatically fr | `base.en` | ~1 GB | Low | Default. Fast, but struggles with similar-sounding consonants (V/B/D). | | `small.en` | ~2 GB | Medium | Noticeable improvement over base for most speech. | | `medium.en` | ~5 GB | Good | Recommended starting point for production use. | -| `large-v3` | ~10 GB | Best | Highest accuracy, use if VRAM allows. | +| `large-v3` | ~5 GB (`float16`) / ~10 GB (`float32`) | Best | Highest accuracy, use if VRAM allows. | English-only models (`.en` suffix) are faster and more accurate than multilingual models for English speech. From f2ba15185e16d858f5bf66ecf6f3280f5faf9c64 Mon Sep 17 00:00:00 2001 From: mikael-lovqvists-claude-agent Date: Sun, 7 Jun 2026 09:14:35 +0000 Subject: [PATCH 10/13] Update VRAM estimates to show float16/float32 for all models Co-Authored-By: Claude Sonnet 4.6 --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3d7b8d4..f243cc9 100644 --- a/README.md +++ b/README.md @@ -24,9 +24,9 @@ Pass `--model ` to `stt-server.py`. Models are downloaded automatically fr | Model | VRAM | Quality | Notes | |-------|------|---------|-------| -| `base.en` | ~1 GB | Low | Default. Fast, but struggles with similar-sounding consonants (V/B/D). 
| -| `small.en` | ~2 GB | Medium | Noticeable improvement over base for most speech. | -| `medium.en` | ~5 GB | Good | Recommended starting point for production use. | +| `base.en` | ~0.5 GB (`float16`) / ~1 GB (`float32`) | Low | Default. Fast, but struggles with similar-sounding consonants (V/B/D). | +| `small.en` | ~1 GB (`float16`) / ~2 GB (`float32`) | Medium | Noticeable improvement over base for most speech. | +| `medium.en` | ~2.5 GB (`float16`) / ~5 GB (`float32`) | Good | Recommended starting point for production use. | | `large-v3` | ~5 GB (`float16`) / ~10 GB (`float32`) | Best | Highest accuracy, use if VRAM allows. | English-only models (`.en` suffix) are faster and more accurate than multilingual models for English speech. From bdb1aac885ac9635abddde2f00782fd9764fe20a Mon Sep 17 00:00:00 2001 From: mikael-lovqvists-claude-agent Date: Sun, 7 Jun 2026 09:16:19 +0000 Subject: [PATCH 11/13] Add --language and --task CLI arguments, document in README --language: force language detection (e.g. en, sv) or leave unset for auto --task: transcribe (default) or translate to English Previously language was hardcoded to 'en' which caused multilingual models to hallucinate translations instead of transcribing the source language. Co-Authored-By: Claude Sonnet 4.6 --- README.md | 19 ++++++++++++++++++- stt-server.py | 5 ++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f243cc9..5306d74 100644 --- a/README.md +++ b/README.md @@ -42,4 +42,21 @@ Pass `--compute-type ` to control the numeric precision used during infere | `float16` | Slightly better accuracy, higher VRAM usage. | | `int8` | CPU-friendly, lower quality. | -If you see a CUDA error about mismatched library versions at startup, use `setup-venv-local-build.sh` to build ctranslate2 against your system CUDA version rather than using the PyPI wheel. 
\ No newline at end of file +If you see a CUDA error about mismatched library versions at startup, use `setup-venv-local-build.sh` to build ctranslate2 against your system CUDA version rather than using the PyPI wheel. + + +## Language and translation + +By default the server auto-detects the spoken language and transcribes it as-is. + +| Argument | Default | Notes | +|----------|---------|-------| +| `--language <lang>` | none (auto-detect) | Force a specific language, e.g. `--language en` or `--language sv`. Speeds up detection and avoids misidentification. | +| `--task transcribe` | default | Output text in the spoken language. | +| `--task translate` | | Translate speech to English regardless of source language. | + +> [!NOTE] +> The `.en` model variants (`base.en`, `small.en` etc.) are English-only and do not support `--task translate` or non-English `--language`. Use a multilingual model (`large-v3`, `medium`) for multilingual or translation use cases. + +> [!WARNING] +> Omitting `--language` with a multilingual model and English-only speech may cause occasional misdetection. Pass `--language en` to avoid this if you only speak English. \ No newline at end of file diff --git a/stt-server.py b/stt-server.py index e08d226..8cba120 100755 --- a/stt-server.py +++ b/stt-server.py @@ -120,6 +120,8 @@ parser = argparse.ArgumentParser() parser.add_argument('--model', default='base.en') parser.add_argument('--device', default='cuda') parser.add_argument('--compute-type', default='int8_float16') +parser.add_argument('--language', default=None, help='language code (e.g. 
en, sv) or None for auto-detect') +parser.add_argument('--task', default='transcribe', choices=['transcribe', 'translate'], help='transcribe keeps the source language; translate converts to English') parser.add_argument('--verbose', '-v', action='store_true') args = parser.parse_args() verbose = args.verbose @@ -195,7 +197,8 @@ def transcription_worker(): try: segments, _ = model.transcribe( samples, - language='en', + language=args.language, + task=args.task, word_timestamps=True, vad_filter=False, ) From 0afe7616251b25ee40a1541648c40b720f9b4a1f Mon Sep 17 00:00:00 2001 From: mikael-lovqvists-claude-agent Date: Sun, 7 Jun 2026 09:21:44 +0000 Subject: [PATCH 12/13] Include detected language and confidence in transcript events Unpacks transcription info instead of discarding it. Adds language and language_probability fields to transcript events, and includes them in verbose log output. Co-Authored-By: Claude Sonnet 4.6 --- stt-server.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/stt-server.py b/stt-server.py index 8cba120..d806510 100755 --- a/stt-server.py +++ b/stt-server.py @@ -195,7 +195,7 @@ def transcription_worker(): break samples, duration = item try: - segments, _ = model.transcribe( + segments, info = model.transcribe( samples, language=args.language, task=args.task, @@ -213,9 +213,11 @@ def transcription_worker(): 'end': round(float(w.end), 4), 'probability': round(float(w.probability), 4), }) - log(f'transcript: {json.dumps(text.strip())} ({len(words)} words)') + language = info.language + lang_prob = round(float(info.language_probability), 3) + log(f'transcript [{language} {lang_prob}]: {json.dumps(text.strip())} ({len(words)} words)') if text.strip(): - emit({'event': 'transcript', 'text': text.strip(), 'words': words, 'duration': round(duration, 3)}) + emit({'event': 'transcript', 'text': text.strip(), 'words': words, 'duration': round(duration, 3), 'language': language, 'language_probability': lang_prob}) except 
Exception: msg = traceback.format_exc() log(f'transcription error:\n{msg}', error=True) From 81e9ea82cf4787b42459e6d6a7e866808dfaefa3 Mon Sep 17 00:00:00 2001 From: mikael-lovqvists-claude-agent Date: Sun, 7 Jun 2026 09:24:53 +0000 Subject: [PATCH 13/13] Add NOTES.md with TranscriptionInfo unused fields Co-Authored-By: Claude Sonnet 4.6 --- NOTES.md | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 NOTES.md diff --git a/NOTES.md b/NOTES.md new file mode 100644 index 0000000..a8bb54c --- /dev/null +++ b/NOTES.md @@ -0,0 +1,9 @@ +# Notes + +## TranscriptionInfo — unused fields + +`model.transcribe()` returns a `TranscriptionInfo` object as its second value. We currently use `language` and `language_probability`. Other available fields: + +- **`all_language_probs`** — full ranked list of `(language, probability)` tuples for the segment. Useful for debugging misdetection — e.g. when the model hallucinates Sinhala on noise, this would show Sinhala at the top with a high probability. Could be included in transcript events or exposed as a diagnostic endpoint. +- **`duration`** — total audio duration fed to the model. +- **`duration_after_vad`** — speech duration according to Whisper's internal VAD (not meaningful since we pass `vad_filter=False`).