Merge pull request 'Convert chatterbox-server.py to HTTP server, add Node.js examples' (#1) from mikael-lovqvists-claude-agent/tts-server:http-server into main

Reviewed-on: #1
Convert chatterbox-server.py to HTTP server, add Node.js examples
2026-06-07 07:34:18 +00:00 · 2026-06-07 07:28:18 +00:00
4 changed files with 181 additions and 99 deletions
--- a/chatterbox-server.py
+++ b/chatterbox-server.py
@@ -1,13 +1,19 @@
 #!/usr/bin/env -S bash -c 'exec "$(dirname "$0")/venv/bin/python3" "$0" "$@"'
 """
-Chatterbox TTS server — keeps model loaded, reads JSON lines from stdin.
+Chatterbox TTS HTTP server — keeps model loaded, exposes a JSON HTTP API.

-Protocol:
-  stdin:  {"text": "...", "temperature": 0.8, "top_p": 0.95}
-		  {"chime": "/path/to/file.wav"}
-		  {"preload": "/path/to/file.wav"}
-  stdout: "ok\n" after each utterance is generated (playback may still be in progress)
-  stderr: status/timing messages
+Endpoints:
+  POST /speak    {"text": "...", "temperature": 0.8, "top_p": 0.95, "audio_prompt": "/path.wav"}
+  POST /chime    {"path": "/path/to/file.wav"}
+  POST /preload  {"path": "/path/to/file.wav"}
+
+All endpoints return {"status": "ok"} or {"status": "error", "message": "..."}.
+/speak and /chime respond once audio is queued for playback (not after playback finishes); /preload responds once the file is cached.
+
+Environment:
+  TTS_PORT       TCP port to listen on (default: 11500)
+  HF_TOKEN_FILE  path to HuggingFace token file (default: ~/.secrets/hugging-face.token)
+  HF_HUB_CACHE   path to HuggingFace hub cache (default: ~/.cache/huggingface/hub)

 Usage:
  ./chatterbox-server.py
@@ -28,6 +34,10 @@ import time
 import queue
 import threading
 import subprocess
+import traceback
+import tempfile
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+from pathlib import Path
 import numpy as np

 TOKEN_FILE = os.environ.get('HF_TOKEN_FILE', os.path.expanduser('~/.secrets/hugging-face.token'))
@@ -37,11 +47,9 @@ try:
 except FileNotFoundError:
 	pass

-def find_hf_cache(repo_id):
-	"""Return the local snapshot path if the model is already cached, else None."""
-	from pathlib import Path
-	cache_dir = Path(os.environ.get('HF_HUB_CACHE', os.path.expanduser('~/.cache/huggingface/hub')))

+def find_hf_cache(repo_id):
+	cache_dir = Path(os.environ.get('HF_HUB_CACHE', os.path.expanduser('~/.cache/huggingface/hub')))
 	repo_dir = cache_dir / f"models--{repo_id.replace('/', '--')}" / 'snapshots'
 	if repo_dir.exists():
 		snapshots = sorted(repo_dir.iterdir(), key=lambda p: p.stat().st_mtime)
@@ -49,23 +57,23 @@ def find_hf_cache(repo_id):
 			return str(snapshots[-1])
 	return None

+
 VARIANT     = sys.argv[1] if len(sys.argv) > 1 else 'turbo'
+PORT        = int(os.environ.get('TTS_PORT', 11500))
 SAMPLE_RATE = 24000

+
 def log(msg):
 	print(f'[chatterbox] {msg}', file=sys.stderr, flush=True)

+
 log(f'loading chatterbox-{VARIANT}...')
 t0 = time.time()

-import tempfile
-import traceback
-import numpy as np
 import torch
 import soundfile as sf
 import librosa as _librosa

-# librosa.resample returns float64 in newer numpy — patch it to always return float32
 _orig_resample = _librosa.resample
 def _resample_float32(*args, **kwargs):
 	return _orig_resample(*args, **kwargs).astype(np.float32)
@@ -92,48 +100,53 @@ else:
 	model = Model.from_pretrained(device=device)

 log(f'ready on {device} ({time.time() - t0:.1f}s load time)')
-print('ready', flush=True)
-

 _wav_cache   = {}
+_chime_cache = {}
+_gen_lock    = threading.Lock()
+
+_SENTINEL      = object()
+playback_queue = queue.Queue()
+
+
+def playback_worker():
+	while True:
+		item = playback_queue.get()
+		if item is _SENTINEL:
+			break
+		proc = subprocess.Popen(
+			['pacat', '--format=float32le', f'--rate={SAMPLE_RATE}', '--channels=1'],
+			stdin=subprocess.PIPE,
+		)
+		proc.stdin.write(item.tobytes())
+		proc.stdin.close()
+		proc.wait()
+		playback_queue.task_done()
+
+
+threading.Thread(target=playback_worker, daemon=True).start()
+

 def ensure_float32_wav(path):
-	"""Re-save audio as float32 mono WAV to work around librosa/numpy float64 issue.
-	Result is cached by input path so repeated calls with the same file are free."""
 	if path in _wav_cache:
 		return _wav_cache[path]
 	wav, sr = sf.read(path, dtype='float32', always_2d=True)
-	wav = wav.mean(axis=1)  # stereo → mono if needed
+	wav = wav.mean(axis=1)
 	tmp = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
 	sf.write(tmp.name, wav, sr, subtype='FLOAT')
 	_wav_cache[path] = tmp.name
 	return tmp.name


-_SENTINEL = object()
-
-playback_queue = queue.Queue()
-
-
-def playback_worker():
-	"""Plays audio samples in order. Runs in its own thread."""
-	while True:
-		item = playback_queue.get()
-		if item is _SENTINEL:
-			break
-		samples = item
-		proc = subprocess.Popen(
-			['pacat', '--format=float32le', f'--rate={SAMPLE_RATE}', '--channels=1'],
-			stdin=subprocess.PIPE,
-		)
-		proc.stdin.write(samples.tobytes())
-		proc.stdin.close()
-		proc.wait()
-		playback_queue.task_done()
-
-
-playback_thread = threading.Thread(target=playback_worker, daemon=True)
-playback_thread.start()
+def load_chime(path):
+	if path in _chime_cache:
+		return _chime_cache[path]
+	samples, sr = sf.read(path, dtype='float32', always_2d=True)
+	samples = samples.mean(axis=1)
+	if sr != SAMPLE_RATE:
+		samples = _librosa.resample(samples, orig_sr=sr, target_sr=SAMPLE_RATE)
+	_chime_cache[path] = samples
+	return samples


 def generate(text, opts):
@@ -168,68 +181,74 @@ def generate(text, opts):
 	elapsed = time.time() - t1
 	duration = len(samples) / SAMPLE_RATE
 	log(f'generated {duration:.1f}s audio in {elapsed:.1f}s  rtf={elapsed/duration:.2f}')
-
 	return samples


-_chime_cache = {}
+class Handler(BaseHTTPRequestHandler):
+	def send_json(self, data, status=200):
+		body = json.dumps(data).encode()
+		self.send_response(status)
+		self.send_header('Content-Type', 'application/json')
+		self.send_header('Content-Length', str(len(body)))
+		self.end_headers()
+		self.wfile.write(body)

-def load_chime(path):
-	if path in _chime_cache:
-		return _chime_cache[path]
-	samples, sr = sf.read(path, dtype='float32', always_2d=True)
-	samples = samples.mean(axis=1)  # stereo → mono
-	if sr != SAMPLE_RATE:
-		samples = _librosa.resample(samples, orig_sr=sr, target_sr=SAMPLE_RATE)
-	_chime_cache[path] = samples
-	return samples
-
-
-for line in sys.stdin:
-	line = line.strip()
-	if not line:
-		continue
+	def read_json(self):
+		length = int(self.headers.get('Content-Length', 0))
+		return json.loads(self.rfile.read(length))

+	def do_POST(self):
 		try:
-		req = json.loads(line)
-	except json.JSONDecodeError:
-		req = {'text': line}
-
-	if 'preload' in req:
-		try:
-			load_chime(req['preload'])
-			log(f'preloaded chime: {req["preload"]}')
-		except Exception as e:
-			log(f'preload error: {e}')
-		print('ok', flush=True)
-		continue
-
-	if 'chime' in req:
-		try:
-			samples = load_chime(req['chime'])
-			playback_queue.put(samples)
-		except Exception as e:
-			log(f'chime error: {e}')
-			traceback.print_exc(file=sys.stderr)
-		print('ok', flush=True)
-		continue
+			req = self.read_json()
+		except Exception:
+			self.send_json({'status': 'error', 'message': 'invalid JSON'}, 400)
+			return

+		if self.path == '/speak':
 			text = req.pop('text', '')
-	opts = req
-
 			if not text:
-		print('ok', flush=True)
-		continue
-
+				self.send_json({'status': 'ok'})
+				return
 			try:
-		samples = generate(text, opts)
+				with _gen_lock:
+					samples = generate(text, req)
 				playback_queue.put(samples)
+				self.send_json({'status': 'ok'})
 			except Exception as e:
-		log(f'error: {e}')
 				traceback.print_exc(file=sys.stderr)
+				self.send_json({'status': 'error', 'message': str(e)}, 500)

-	print('ok', flush=True)
+		elif self.path == '/chime':
+			path = req.get('path', '')
+			try:
+				samples = load_chime(path)
+				playback_queue.put(samples)
+				self.send_json({'status': 'ok'})
+			except Exception as e:
+				traceback.print_exc(file=sys.stderr)
+				self.send_json({'status': 'error', 'message': str(e)}, 500)

-# Drain playback before exit
-playback_queue.put(_SENTINEL)
-playback_thread.join()
+		elif self.path == '/preload':
+			path = req.get('path', '')
+			try:
+				load_chime(path)
+				log(f'preloaded: {path}')
+				self.send_json({'status': 'ok'})
+			except Exception as e:
+				self.send_json({'status': 'error', 'message': str(e)}, 500)
+
+		else:
+			self.send_json({'status': 'error', 'message': 'not found'}, 404)
+
+	def log_message(self, fmt, *args):
+		log(fmt % args)
+
+
+server = ThreadingHTTPServer(('', PORT), Handler)
+log(f'listening on port {PORT}')
+try:
+	server.serve_forever()
+except KeyboardInterrupt:
+	pass
+finally:
+	playback_queue.put(_SENTINEL)
--- a/examples/chime.mjs
+++ b/examples/chime.mjs
@@ -0,0 +1,22 @@
+// Play a chime WAV file via the Chatterbox TTS server.
+// Usage: node chime.mjs /path/to/chime.wav
+
+const PORT = process.env.TTS_PORT ?? '11500'
+const path = process.argv[2]
+
+if (!path) {
+	console.error('usage: node chime.mjs /path/to/chime.wav')
+	process.exit(1)
+}
+
+const res = await fetch(`http://localhost:${PORT}/chime`, {
+	method: 'POST',
+	headers: { 'Content-Type': 'application/json' },
+	body: JSON.stringify({ path }),
+})
+
+const data = await res.json()
+if (data.status !== 'ok') {
+	console.error('error:', data.message)
+	process.exit(1)
+}
--- a/examples/speak.mjs
+++ b/examples/speak.mjs
@@ -0,0 +1,17 @@
+// Speak text via the Chatterbox TTS server.
+// Usage: node speak.mjs "Hello world"
+
+const PORT = process.env.TTS_PORT ?? '11500'
+const text = process.argv[2] ?? 'Hello from Node.'
+
+const res = await fetch(`http://localhost:${PORT}/speak`, {
+	method: 'POST',
+	headers: { 'Content-Type': 'application/json' },
+	body: JSON.stringify({ text }),
+})
+
+const data = await res.json()
+if (data.status !== 'ok') {
+	console.error('error:', data.message)
+	process.exit(1)
+}
--- a/examples/voice-clone.mjs
+++ b/examples/voice-clone.mjs
@@ -0,0 +1,24 @@
+// Speak text using a reference WAV for voice cloning.
+// The server reads the audio_prompt path from its own filesystem.
+// Usage: node voice-clone.mjs /path/to/reference.wav "Text to speak"
+
+const PORT        = process.env.TTS_PORT ?? '11500'
+const audio_prompt = process.argv[2]
+const text         = process.argv[3] ?? 'Hello, this is a cloned voice.'
+
+if (!audio_prompt) {
+	console.error('usage: node voice-clone.mjs /path/to/reference.wav "text"')
+	process.exit(1)
+}
+
+const res = await fetch(`http://localhost:${PORT}/speak`, {
+	method: 'POST',
+	headers: { 'Content-Type': 'application/json' },
+	body: JSON.stringify({ text, audio_prompt }),
+})
+
+const data = await res.json()
+if (data.status !== 'ok') {
+	console.error('error:', data.message)
+	process.exit(1)
+}