From 18404708e3f16a7d092171f8fc2f2d95bb2e95a8 Mon Sep 17 00:00:00 2001
From: mikael-lovqvists-claude-agent <mikaels.claude.agent@efforting.tech>
Date: Sun, 7 Jun 2026 08:53:54 +0000
Subject: [PATCH 01/13] Add WebSocket broadcast to stt-server.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Every connection receives the full event stream (vad_start, vad_end,
transcript, error) from the moment it connects — no subscription
handshake required. The asyncio WebSocket server runs in a daemon thread
alongside the VAD loop and transcription thread. Events still go to
stdout unchanged.

Port is configurable via STT_PORT env var (default: 11501).
Add websockets to both setup scripts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 setup-venv-local-build.sh |  2 +-
 setup-venv.sh             |  2 +-
 stt-server.py             | 96 ++++++++++++++++++++++++++++++---------
 3 files changed, 77 insertions(+), 23 deletions(-)

diff --git a/setup-venv-local-build.sh b/setup-venv-local-build.sh
index 7ccbbc0..e2dcf1d 100755
--- a/setup-venv-local-build.sh
+++ b/setup-venv-local-build.sh
@@ -57,7 +57,7 @@ fi
 
 echo "==> upgrading pip + build tools"
 "${VENV}/bin/pip" install --upgrade pip wheel setuptools pybind11 --quiet
-"${VENV}/bin/pip" install torch silero-vad
+"${VENV}/bin/pip" install torch silero-vad websockets
 
 # --- clone (skipped if already done) ---
 if [ ! -d "${BUILD_DIR}/src/.git" ]; then
diff --git a/setup-venv.sh b/setup-venv.sh
index c017c9d..356b6cc 100755
--- a/setup-venv.sh
+++ b/setup-venv.sh
@@ -25,7 +25,7 @@ fi
 
 echo "==> installing torch and faster-whisper"
 "${VENV}/bin/pip" install --upgrade pip --quiet
-"${VENV}/bin/pip" install torch faster-whisper silero-vad
+"${VENV}/bin/pip" install torch faster-whisper silero-vad websockets
 
 echo ""
 echo "==> done. Venv ready at ${VENV}"
diff --git a/stt-server.py b/stt-server.py
index 8e7041a..38b994b 100755
--- a/stt-server.py
+++ b/stt-server.py
@@ -1,18 +1,25 @@
 #!/usr/bin/env -S bash -c 'exec "$(dirname "$0")/venv/bin/python3" "$0" "$@"'
 """
-STT process: records audio, runs Silero VAD, transcribes with faster-whisper.
+STT server: records audio, runs Silero VAD, transcribes with faster-whisper.
+Broadcasts JSON events to all connected WebSocket clients and to stdout.
 
-Events (JSON lines on stdout):
+Events:
   {"event": "ready"}
   {"event": "vad_start"}
-  {"event": "vad_end",     "duration": 1.23}
-  {"event": "transcript",  "text": "...", "words": [...], "duration": 1.23}
-  {"event": "error",       "message": "..."}
+  {"event": "vad_end",    "duration": 1.23}
+  {"event": "transcript", "text": "...", "words": [...], "duration": 1.23}
+  {"event": "error",      "message": "..."}
 
 word format: {"word": "hello", "start": 0.12, "end": 0.45, "probability": 0.99}
 
+Every WebSocket connection receives the full event stream from the moment it
+connects — no subscription handshake required.
+
 All log/status messages go to stderr. Stdout is machine-readable events only.
 
+Environment:
+  STT_PORT   WebSocket port (default: 11501)
+
 Usage:
   ./stt-server.py
   ./stt-server.py --model large-v3 --device cuda --compute-type int8_float16
@@ -26,13 +33,16 @@ import threading
 import queue
 import subprocess
 import traceback
+import asyncio
+import websockets
 import numpy as np
 import torch
 
-SAMPLE_RATE     = 16000
-VAD_WINDOW      = 512           # samples per VAD chunk (32ms at 16kHz)
-PRE_ROLL_SAMPLES = 3200         # 0.2s of audio prepended to each segment
-HISTORY_SAMPLES  = 960000       # 60s ring buffer for pre-roll
+SAMPLE_RATE      = 16000
+VAD_WINDOW       = 512      # samples per VAD chunk (32ms at 16kHz)
+PRE_ROLL_SAMPLES = 3200     # 0.2s prepended to each segment for context
+HISTORY_SAMPLES  = 960000   # 60s ring buffer for pre-roll
+PORT             = int(__import__('os').environ.get('STT_PORT', 11501))
 
 
 def log(msg):
@@ -40,10 +50,49 @@ def log(msg):
 	sys.stderr.flush()
 
 
-def emit(event):
-	sys.stdout.write(json.dumps(event) + '\n')
-	sys.stdout.flush()
+# --- WebSocket broadcast ---
 
+_ws_loop    = None
+_ws_clients = set()   # set of asyncio.Queue, one per connection
+
+
+def emit(event):
+	line = json.dumps(event)
+	sys.stdout.write(line + '\n')
+	sys.stdout.flush()
+	if _ws_loop is not None:
+		for q in list(_ws_clients):
+			_ws_loop.call_soon_threadsafe(q.put_nowait, line)
+
+
+async def ws_handler(websocket):
+	q = asyncio.Queue()
+	_ws_clients.add(q)
+	log(f'client connected ({len(_ws_clients)} total)')
+	try:
+		while True:
+			msg = await q.get()
+			await websocket.send(msg)
+	except websockets.ConnectionClosed:
+		pass
+	finally:
+		_ws_clients.discard(q)
+		log(f'client disconnected ({len(_ws_clients)} remaining)')
+
+
+async def ws_main():
+	global _ws_loop
+	_ws_loop = asyncio.get_running_loop()
+	async with websockets.serve(ws_handler, '', PORT):
+		log(f'WebSocket listening on port {PORT}')
+		await asyncio.Future()   # run forever
+
+
+def start_ws_server():
+	asyncio.run(ws_main())
+
+
+# --- Mic ---
 
 def find_mic():
 	candidates = [
@@ -63,6 +112,8 @@ def s16le_to_f32(data):
 	return np.frombuffer(data, dtype=np.int16).astype(np.float32) / 32768.0
 
 
+# --- Args + model loading ---
+
 parser = argparse.ArgumentParser()
 parser.add_argument('--model',        default='base.en')
 parser.add_argument('--device',       default='cuda')
@@ -82,25 +133,25 @@ except Exception as e:
 log('loading silero VAD...')
 from silero_vad import load_silero_vad, VADIterator
 vad_model = load_silero_vad()
-vad        = VADIterator(vad_model, sampling_rate=SAMPLE_RATE,
-                         threshold=0.5, min_silence_duration_ms=500)
+vad       = VADIterator(vad_model, sampling_rate=SAMPLE_RATE,
+                        threshold=0.5, min_silence_duration_ms=500)
 log('VAD ready')
 
 
-# Ring buffer for pre-roll context
+# --- Pre-roll ring buffer ---
+
 history     = np.zeros(HISTORY_SAMPLES, dtype=np.float32)
 history_pos = 0
 
 def push_history(samples):
 	global history_pos
-	n    = len(samples)
-	base = history_pos % HISTORY_SAMPLES
-	# May wrap around — handle both cases
+	n     = len(samples)
+	base  = history_pos % HISTORY_SAMPLES
 	space = HISTORY_SAMPLES - base
 	if n <= space:
 		history[base:base + n] = samples
 	else:
-		history[base:]    = samples[:space]
+		history[base:]      = samples[:space]
 		history[:n - space] = samples[space:]
 	history_pos += n
 
@@ -113,7 +164,8 @@ def get_preroll():
 	return out
 
 
-# Transcription runs in a separate thread so VAD is never blocked by GPU
+# --- Transcription thread ---
+
 transcription_queue = queue.Queue()
 
 def transcription_worker():
@@ -152,9 +204,11 @@ def transcription_worker():
 
 
 threading.Thread(target=transcription_worker, daemon=True).start()
+threading.Thread(target=start_ws_server, daemon=True).start()
 
 
-# Main recording + VAD loop
+# --- Main recording + VAD loop ---
+
 cmd, cmd_args = find_mic()
 log(f'mic: {cmd} {" ".join(cmd_args)}')
 mic = subprocess.Popen([cmd] + cmd_args, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)

From f18330608d87e4148d5f30c7d283dd92adffdb25 Mon Sep 17 00:00:00 2001
From: mikael-lovqvists-claude-agent <mikaels.claude.agent@efforting.tech>
Date: Sun, 7 Jun 2026 08:56:58 +0000
Subject: [PATCH 02/13] Add --verbose flag; suppress info logging by default

Errors always go to stderr. Info logs (startup, VAD events, transcripts)
only appear with --verbose / -v, keeping stderr clean when running as a
system service.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 stt-server.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/stt-server.py b/stt-server.py
index 38b994b..3c47185 100755
--- a/stt-server.py
+++ b/stt-server.py
@@ -16,13 +16,15 @@ Every WebSocket connection receives the full event stream from the moment it
 connects — no subscription handshake required.
 
 All log/status messages go to stderr. Stdout is machine-readable events only.
+Pass --verbose to enable info logging (startup, VAD events, transcripts).
+Errors always go to stderr regardless of verbosity.
 
 Environment:
   STT_PORT   WebSocket port (default: 11501)
 
 Usage:
   ./stt-server.py
-  ./stt-server.py --model large-v3 --device cuda --compute-type int8_float16
+  ./stt-server.py --model large-v3 --device cuda --compute-type int8_float16 --verbose
 """
 
 import sys
@@ -45,9 +47,10 @@ HISTORY_SAMPLES  = 960000   # 60s ring buffer for pre-roll
 PORT             = int(__import__('os').environ.get('STT_PORT', 11501))
 
 
-def log(msg):
-	sys.stderr.write(f'[stt] {msg}\n')
-	sys.stderr.flush()
+def log(msg, error=False):
+	if error or verbose:
+		sys.stderr.write(f'[stt] {msg}\n')
+		sys.stderr.flush()
 
 
 # --- WebSocket broadcast ---
@@ -118,7 +121,9 @@ parser = argparse.ArgumentParser()
 parser.add_argument('--model',        default='base.en')
 parser.add_argument('--device',       default='cuda')
 parser.add_argument('--compute-type', default='int8_float16')
-args = parser.parse_args()
+parser.add_argument('--verbose', '-v', action='store_true')
+args    = parser.parse_args()
+verbose = args.verbose
 
 log(f'loading faster-whisper {args.model} ({args.device}, {args.compute_type})...')
 from faster_whisper import WhisperModel
@@ -126,7 +131,7 @@ try:
 	model = WhisperModel(args.model, device=args.device, compute_type=args.compute_type)
 	log(f'model ready on {args.device}')
 except Exception as e:
-	log(f'{args.device} failed ({e}), falling back to cpu')
+	log(f'{args.device} failed ({e}), falling back to cpu', error=True)
 	model = WhisperModel(args.model, device='cpu', compute_type='int8')
 	log('model ready on cpu')
 
@@ -197,7 +202,7 @@ def transcription_worker():
 				emit({'event': 'transcript', 'text': text.strip(), 'words': words, 'duration': round(duration, 3)})
 		except Exception:
 			msg = traceback.format_exc()
-			log(f'transcription error:\n{msg}')
+			log(f'transcription error:\n{msg}', error=True)
 			emit({'event': 'error', 'message': msg})
 		finally:
 			transcription_queue.task_done()

From aad1bda3bf17443a487aed353d943dfb1e53aa8c Mon Sep 17 00:00:00 2001
From: mikael-lovqvists-claude-agent <mikaels.claude.agent@efforting.tech>
Date: Sun, 7 Jun 2026 08:58:29 +0000
Subject: [PATCH 03/13] =?UTF-8?q?Remove=20stdout=20event=20output=20?=
 =?UTF-8?q?=E2=80=94=20WebSocket=20is=20the=20sole=20event=20channel?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 stt-server.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/stt-server.py b/stt-server.py
index 3c47185..d83c885 100755
--- a/stt-server.py
+++ b/stt-server.py
@@ -15,8 +15,8 @@ word format: {"word": "hello", "start": 0.12, "end": 0.45, "probability": 0.99}
 Every WebSocket connection receives the full event stream from the moment it
 connects — no subscription handshake required.
 
-All log/status messages go to stderr. Stdout is machine-readable events only.
-Pass --verbose to enable info logging (startup, VAD events, transcripts).
+Machine-readable events are sent over WebSocket only.
+Pass --verbose to enable logging to stderr (startup, VAD events, transcripts).
 Errors always go to stderr regardless of verbosity.
 
 Environment:
@@ -61,8 +61,6 @@ _ws_clients = set()   # set of asyncio.Queue, one per connection
 
 def emit(event):
 	line = json.dumps(event)
-	sys.stdout.write(line + '\n')
-	sys.stdout.flush()
 	if _ws_loop is not None:
 		for q in list(_ws_clients):
 			_ws_loop.call_soon_threadsafe(q.put_nowait, line)

From 6bbc04dde749ceb6bfa9efe805342ab83a8a3e89 Mon Sep 17 00:00:00 2001
From: mikael-lovqvists-claude-agent <mikaels.claude.agent@efforting.tech>
Date: Sun, 7 Jun 2026 08:59:38 +0000
Subject: [PATCH 04/13] Add Node.js WebSocket example scripts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

listen.mjs: prints all events as JSON objects.
transcripts.mjs: prints transcript text only.
Both use the built-in WebSocket client (Node 22+, or Node 21 with --experimental-websocket) — no libraries required.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 examples/listen.mjs      | 24 ++++++++++++++++++++++++
 examples/transcripts.mjs | 19 +++++++++++++++++++
 2 files changed, 43 insertions(+)
 create mode 100644 examples/listen.mjs
 create mode 100644 examples/transcripts.mjs

diff --git a/examples/listen.mjs b/examples/listen.mjs
new file mode 100644
index 0000000..58cae0f
--- /dev/null
+++ b/examples/listen.mjs
@@ -0,0 +1,24 @@
+// Connect to the STT server and print all events.
+// Usage: node listen.mjs
+
+const PORT = process.env.STT_PORT ?? '11501'
+const ws   = new WebSocket(`ws://localhost:${PORT}`)
+
+ws.addEventListener('open', () => {
+	process.stderr.write(`connected to ws://localhost:${PORT}\n`)
+})
+
+ws.addEventListener('message', ({ data }) => {
+	const event = JSON.parse(data)
+	console.log(event)
+})
+
+ws.addEventListener('close', () => {
+	process.stderr.write('disconnected\n')
+	process.exit(0)
+})
+
+ws.addEventListener('error', (err) => {
+	process.stderr.write(`error: ${err.message}\n`)
+	process.exit(1)
+})
diff --git a/examples/transcripts.mjs b/examples/transcripts.mjs
new file mode 100644
index 0000000..b5d404c
--- /dev/null
+++ b/examples/transcripts.mjs
@@ -0,0 +1,19 @@
+// Connect to the STT server and print transcript text only.
+// Usage: node transcripts.mjs
+
+const PORT = process.env.STT_PORT ?? '11501'
+const ws   = new WebSocket(`ws://localhost:${PORT}`)
+
+ws.addEventListener('open', () => {
+	process.stderr.write(`connected to ws://localhost:${PORT}\n`)
+})
+
+ws.addEventListener('message', ({ data }) => {
+	const event = JSON.parse(data)
+	if (event.event === 'transcript') {
+		console.log(event.text)
+	}
+})
+
+ws.addEventListener('close', () => process.exit(0))
+ws.addEventListener('error', () => process.exit(1))

From 218687b039d64c0a8db28bfa5eb84f2c267e5bdd Mon Sep 17 00:00:00 2001
From: mikael-lovqvists-claude-agent <mikaels.claude.agent@efforting.tech>
Date: Sun, 7 Jun 2026 09:07:13 +0000
Subject: [PATCH 05/13] Log to stderr when model needs to be downloaded

Checks the local cache first with local_files_only=True. If the model is
not present, logs "downloading model ..." to stderr before WhisperModel
triggers the actual download.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 stt-server.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/stt-server.py b/stt-server.py
index d83c885..edfa472 100755
--- a/stt-server.py
+++ b/stt-server.py
@@ -123,8 +123,15 @@ parser.add_argument('--verbose', '-v', action='store_true')
 args    = parser.parse_args()
 verbose = args.verbose
 
-log(f'loading faster-whisper {args.model} ({args.device}, {args.compute_type})...')
 from faster_whisper import WhisperModel
+from huggingface_hub import snapshot_download
+
+try:
+	snapshot_download(f'Systran/faster-whisper-{args.model}', local_files_only=True)
+except Exception:
+	log(f'downloading model {args.model}...', error=True)
+
+log(f'loading faster-whisper {args.model} ({args.device}, {args.compute_type})...')
 try:
 	model = WhisperModel(args.model, device=args.device, compute_type=args.compute_type)
 	log(f'model ready on {args.device}')

From 7b03deddb5408784b54e91783d2ba9730d7f2564 Mon Sep 17 00:00:00 2001
From: mikael-lovqvists-claude-agent <mikaels.claude.agent@efforting.tech>
Date: Sun, 7 Jun 2026 09:09:03 +0000
Subject: [PATCH 06/13] Gate download log message behind --verbose like
 everything else

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 stt-server.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stt-server.py b/stt-server.py
index edfa472..31ca094 100755
--- a/stt-server.py
+++ b/stt-server.py
@@ -129,7 +129,7 @@ from huggingface_hub import snapshot_download
 try:
 	snapshot_download(f'Systran/faster-whisper-{args.model}', local_files_only=True)
 except Exception:
-	log(f'downloading model {args.model}...', error=True)
+	log(f'downloading model {args.model}...')
 
 log(f'loading faster-whisper {args.model} ({args.device}, {args.compute_type})...')
 try:

From 9030b1315da91412cb4da301d49c1e13faa78b33 Mon Sep 17 00:00:00 2001
From: mikael-lovqvists-claude-agent <mikaels.claude.agent@efforting.tech>
Date: Sun, 7 Jun 2026 09:10:54 +0000
Subject: [PATCH 07/13] Load HF_TOKEN from token file at startup (consistent
 with tts-server)

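Reads ~/.secrets/hugging-face.token by default, overridable via HF_TOKEN_FILE. If the file exists, its contents replace any HF_TOKEN already set in the environment; a missing file is silently ignored.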

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 stt-server.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/stt-server.py b/stt-server.py
index 31ca094..e08d226 100755
--- a/stt-server.py
+++ b/stt-server.py
@@ -27,6 +27,7 @@ Usage:
   ./stt-server.py --model large-v3 --device cuda --compute-type int8_float16 --verbose
 """
 
+import os
 import sys
 import json
 import signal
@@ -123,6 +124,13 @@ parser.add_argument('--verbose', '-v', action='store_true')
 args    = parser.parse_args()
 verbose = args.verbose
 
+token_file = os.environ.get('HF_TOKEN_FILE', os.path.expanduser('~/.secrets/hugging-face.token'))
+try:
+	with open(token_file) as f:
+		os.environ['HF_TOKEN'] = f.read().strip()
+except FileNotFoundError:
+	pass
+
 from faster_whisper import WhisperModel
 from huggingface_hub import snapshot_download
 

From be1efd9edb0bfb4015a2b32be12c700a3990e731 Mon Sep 17 00:00:00 2001
From: mikael-lovqvists-claude-agent <mikaels.claude.agent@efforting.tech>
Date: Sun, 7 Jun 2026 09:11:39 +0000
Subject: [PATCH 08/13] Add model selection and compute type sections to README

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 README.md | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c28d7d6..63270fc 100644
--- a/README.md
+++ b/README.md
@@ -15,4 +15,31 @@ This project started as a [vibe-coded](https://en.wikipedia.org/wiki/Vibe_coding
 
 ### Setup [venv](https://docs.python.org/3/library/venv.html) for [python](https://www.python.org/)
 
-We will have two different setups here depending on if you want to build ctranslate2 locally or not. This shall be documented.
\ No newline at end of file
+We will have two different setups here depending on if you want to build ctranslate2 locally or not. This shall be documented.
+
+
+## Model selection
+
+Pass `--model <name>` to `stt-server.py`. Models are downloaded automatically from HuggingFace on first use.
+
+| Model | VRAM | Quality | Notes |
+|-------|------|---------|-------|
+| `base.en` | ~1 GB | Low | Default. Fast, but struggles with similar-sounding consonants (V/B/D). |
+| `small.en` | ~2 GB | Medium | Noticeable improvement over base for most speech. |
+| `medium.en` | ~5 GB | Good | Recommended starting point for production use. |
+| `large-v3` | ~10 GB | Best | Highest accuracy, use if VRAM allows. |
+
+English-only models (`.en` suffix) are faster and more accurate than multilingual models for English speech.
+
+
+## Compute type
+
+Pass `--compute-type <type>` to control the numeric precision used during inference.
+
+| Type | Notes |
+|------|-------|
+| `int8_float16` | Default. Good balance of speed and accuracy on modern GPUs. |
+| `float16` | Slightly better accuracy, higher VRAM usage. |
+| `int8` | CPU-friendly, lower quality. |
+
+If you see a CUDA error about mismatched library versions at startup, use `setup-venv-local-build.sh` to build ctranslate2 against your system CUDA version rather than using the PyPI wheel.
\ No newline at end of file

From dd6e74a7a895c09d02a43e56447212c7fdd82d95 Mon Sep 17 00:00:00 2001
From: mikael-lovqvists-claude-agent <mikaels.claude.agent@efforting.tech>
Date: Sun, 7 Jun 2026 09:14:00 +0000
Subject: [PATCH 09/13] =?UTF-8?q?Fix=20large-v3=20VRAM=20estimate=20?=
 =?UTF-8?q?=E2=80=94=20~5GB=20with=20float16,=20not=20~10GB?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 63270fc..3d7b8d4 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ Pass `--model <name>` to `stt-server.py`. Models are downloaded automatically fr
 | `base.en` | ~1 GB | Low | Default. Fast, but struggles with similar-sounding consonants (V/B/D). |
 | `small.en` | ~2 GB | Medium | Noticeable improvement over base for most speech. |
 | `medium.en` | ~5 GB | Good | Recommended starting point for production use. |
-| `large-v3` | ~10 GB | Best | Highest accuracy, use if VRAM allows. |
+| `large-v3` | ~5 GB (`float16`) / ~10 GB (`float32`) | Best | Highest accuracy, use if VRAM allows. |
 
 English-only models (`.en` suffix) are faster and more accurate than multilingual models for English speech.
 

From f2ba15185e16d858f5bf66ecf6f3280f5faf9c64 Mon Sep 17 00:00:00 2001
From: mikael-lovqvists-claude-agent <mikaels.claude.agent@efforting.tech>
Date: Sun, 7 Jun 2026 09:14:35 +0000
Subject: [PATCH 10/13] Update VRAM estimates to show float16/float32 for all
 models

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 3d7b8d4..f243cc9 100644
--- a/README.md
+++ b/README.md
@@ -24,9 +24,9 @@ Pass `--model <name>` to `stt-server.py`. Models are downloaded automatically fr
 
 | Model | VRAM | Quality | Notes |
 |-------|------|---------|-------|
-| `base.en` | ~1 GB | Low | Default. Fast, but struggles with similar-sounding consonants (V/B/D). |
-| `small.en` | ~2 GB | Medium | Noticeable improvement over base for most speech. |
-| `medium.en` | ~5 GB | Good | Recommended starting point for production use. |
+| `base.en` | ~0.5 GB (`float16`) / ~1 GB (`float32`) | Low | Default. Fast, but struggles with similar-sounding consonants (V/B/D). |
+| `small.en` | ~1 GB (`float16`) / ~2 GB (`float32`) | Medium | Noticeable improvement over base for most speech. |
+| `medium.en` | ~2.5 GB (`float16`) / ~5 GB (`float32`) | Good | Recommended starting point for production use. |
 | `large-v3` | ~5 GB (`float16`) / ~10 GB (`float32`) | Best | Highest accuracy, use if VRAM allows. |
 
 English-only models (`.en` suffix) are faster and more accurate than multilingual models for English speech.

From bdb1aac885ac9635abddde2f00782fd9764fe20a Mon Sep 17 00:00:00 2001
From: mikael-lovqvists-claude-agent <mikaels.claude.agent@efforting.tech>
Date: Sun, 7 Jun 2026 09:16:19 +0000
Subject: [PATCH 11/13] Add --language and --task CLI arguments, document in
 README

--language: force a specific language (e.g. en, sv), or leave unset for auto-detection
--task: transcribe (default) or translate to English
Previously language was hardcoded to 'en' which caused multilingual models
to hallucinate translations instead of transcribing the source language.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 README.md     | 19 ++++++++++++++++++-
 stt-server.py |  5 ++++-
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index f243cc9..5306d74 100644
--- a/README.md
+++ b/README.md
@@ -42,4 +42,21 @@ Pass `--compute-type <type>` to control the numeric precision used during infere
 | `float16` | Slightly better accuracy, higher VRAM usage. |
 | `int8` | CPU-friendly, lower quality. |
 
-If you see a CUDA error about mismatched library versions at startup, use `setup-venv-local-build.sh` to build ctranslate2 against your system CUDA version rather than using the PyPI wheel.
\ No newline at end of file
+If you see a CUDA error about mismatched library versions at startup, use `setup-venv-local-build.sh` to build ctranslate2 against your system CUDA version rather than using the PyPI wheel.
+
+
+## Language and translation
+
+By default the server auto-detects the spoken language and transcribes it as-is.
+
+| Argument | Default | Notes |
+|----------|---------|-------|
+| `--language <code>` | none (auto-detect) | Force a specific language, e.g. `--language en` or `--language sv`. Skips auto-detection and avoids misidentification. |
+| `--task transcribe` | default | Output text in the spoken language. |
+| `--task translate` | | Translate speech to English regardless of source language. |
+
+> [!NOTE]
+> The `.en` model variants (`base.en`, `small.en` etc.) are English-only and do not support `--task translate` or non-English `--language`. Use a multilingual model (`large-v3`, `medium`) for multilingual or translation use cases.
+
+> [!WARNING]
+> Omitting `--language` with a multilingual model and English-only speech may cause occasional misdetection. Pass `--language en` to avoid this if you only speak English.
\ No newline at end of file
diff --git a/stt-server.py b/stt-server.py
index e08d226..8cba120 100755
--- a/stt-server.py
+++ b/stt-server.py
@@ -120,6 +120,8 @@ parser = argparse.ArgumentParser()
 parser.add_argument('--model',        default='base.en')
 parser.add_argument('--device',       default='cuda')
 parser.add_argument('--compute-type', default='int8_float16')
+parser.add_argument('--language',     default=None,          help='language code (e.g. en, sv) or None for auto-detect')
+parser.add_argument('--task',         default='transcribe',  choices=['transcribe', 'translate'], help='transcribe keeps the source language; translate converts to English')
 parser.add_argument('--verbose', '-v', action='store_true')
 args    = parser.parse_args()
 verbose = args.verbose
@@ -195,7 +197,8 @@ def transcription_worker():
 		try:
 			segments, _ = model.transcribe(
 				samples,
-				language='en',
+				language=args.language,
+				task=args.task,
 				word_timestamps=True,
 				vad_filter=False,
 			)

From 0afe7616251b25ee40a1541648c40b720f9b4a1f Mon Sep 17 00:00:00 2001
From: mikael-lovqvists-claude-agent <mikaels.claude.agent@efforting.tech>
Date: Sun, 7 Jun 2026 09:21:44 +0000
Subject: [PATCH 12/13] Include detected language and confidence in transcript
 events

Unpacks transcription info instead of discarding it. Adds language and
language_probability fields to transcript events, and includes them in
verbose log output.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 stt-server.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/stt-server.py b/stt-server.py
index 8cba120..d806510 100755
--- a/stt-server.py
+++ b/stt-server.py
@@ -195,7 +195,7 @@ def transcription_worker():
 			break
 		samples, duration = item
 		try:
-			segments, _ = model.transcribe(
+			segments, info = model.transcribe(
 				samples,
 				language=args.language,
 				task=args.task,
@@ -213,9 +213,11 @@ def transcription_worker():
 						'end':         round(float(w.end),   4),
 						'probability': round(float(w.probability), 4),
 					})
-			log(f'transcript: {json.dumps(text.strip())} ({len(words)} words)')
+			language    = info.language
+			lang_prob   = round(float(info.language_probability), 3)
+			log(f'transcript [{language} {lang_prob}]: {json.dumps(text.strip())} ({len(words)} words)')
 			if text.strip():
-				emit({'event': 'transcript', 'text': text.strip(), 'words': words, 'duration': round(duration, 3)})
+				emit({'event': 'transcript', 'text': text.strip(), 'words': words, 'duration': round(duration, 3), 'language': language, 'language_probability': lang_prob})
 		except Exception:
 			msg = traceback.format_exc()
 			log(f'transcription error:\n{msg}', error=True)

From 81e9ea82cf4787b42459e6d6a7e866808dfaefa3 Mon Sep 17 00:00:00 2001
From: mikael-lovqvists-claude-agent <mikaels.claude.agent@efforting.tech>
Date: Sun, 7 Jun 2026 09:24:53 +0000
Subject: [PATCH 13/13] Add NOTES.md with TranscriptionInfo unused fields

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 NOTES.md | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 NOTES.md

diff --git a/NOTES.md b/NOTES.md
new file mode 100644
index 0000000..a8bb54c
--- /dev/null
+++ b/NOTES.md
@@ -0,0 +1,9 @@
+# Notes
+
+## TranscriptionInfo — unused fields
+
+`model.transcribe()` returns a `TranscriptionInfo` object as its second value. We currently use `language` and `language_probability`. Other available fields:
+
+- **`all_language_probs`** — full ranked list of `(language, probability)` tuples for the segment. Useful for debugging misdetection — e.g. when the model hallucinates Sinhala on noise, this would show Sinhala at the top with a high probability. Could be included in transcript events or exposed as a diagnostic endpoint.
+- **`duration`** — total audio duration fed to the model.
+- **`duration_after_vad`** — audio duration remaining after faster-whisper's optional VAD filter (not meaningful since we pass `vad_filter=False`).