Convert chatterbox-server.py to HTTP server, add Node.js examples

Replace stdin/stdout JSON line protocol with a stdlib HTTP server (ThreadingHTTPServer). Three endpoints: POST /speak, /chime, /preload. All return {"status": "ok"} after audio is queued for playback. TTS generation is serialized via a threading.Lock; concurrent chime/preload requests are handled without waiting for generation. Add examples/speak.mjs, chime.mjs, voice-clone.mjs using Node.js built-in fetch (no libraries required, Node 18+). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-07 07:28:18 +00:00
parent bdae4c047f
commit f6ff8c72e8
4 changed files with 181 additions and 99 deletions
--- a/chatterbox-server.py
+++ b/chatterbox-server.py
@@ -1,13 +1,19 @@
 #!/usr/bin/env -S bash -c 'exec "$(dirname "$0")/venv/bin/python3" "$0" "$@"'
 """
-Chatterbox TTS server — keeps model loaded, reads JSON lines from stdin.
+Chatterbox TTS HTTP server — keeps model loaded, exposes a JSON HTTP API.
-Protocol:
+Endpoints:
-  stdin:  {"text": "...", "temperature": 0.8, "top_p": 0.95}
+  POST /speak    {"text": "...", "temperature": 0.8, "top_p": 0.95, "audio_prompt": "/path.wav"}
-		  {"chime": "/path/to/file.wav"}
+  POST /chime    {"path": "/path/to/file.wav"}
-		  {"preload": "/path/to/file.wav"}
+  POST /preload  {"path": "/path/to/file.wav"}
-  stdout: "ok\n" after each utterance is generated (playback may still be in progress)
+
-  stderr: status/timing messages
+All endpoints return {"status": "ok"} or {"status": "error", "message": "..."}.
 Responses are sent after audio is queued for playback (not after playback finishes).
 Environment:
  TTS_PORT       TCP port to listen on (default: 11500)
  HF_TOKEN_FILE  path to HuggingFace token file (default: ~/.secrets/hugging-face.token)
  HF_HUB_CACHE   path to HuggingFace hub cache (default: ~/.cache/huggingface/hub)
 Usage:
  ./chatterbox-server.py
@@ -28,6 +34,10 @@ import time
 import queue
 import threading
 import subprocess
 import traceback
 import tempfile
 from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
 from pathlib import Path
 import numpy as np
 TOKEN_FILE = os.environ.get('HF_TOKEN_FILE', os.path.expanduser('~/.secrets/hugging-face.token'))
@@ -37,35 +47,33 @@ try:
 except FileNotFoundError:
 	pass
 def find_hf_cache(repo_id):
 	"""Return the local snapshot path if the model is already cached, else None."""
 	from pathlib import Path
 	cache_dir = Path(os.environ.get('HF_HUB_CACHE', os.path.expanduser('~/.cache/huggingface/hub')))
-	repo_dir  = cache_dir / f"models--{repo_id.replace('/', '--')}" / 'snapshots'
+def find_hf_cache(repo_id):
 	cache_dir = Path(os.environ.get('HF_HUB_CACHE', os.path.expanduser('~/.cache/huggingface/hub')))
 	repo_dir = cache_dir / f"models--{repo_id.replace('/', '--')}" / 'snapshots'
 	if repo_dir.exists():
 		snapshots = sorted(repo_dir.iterdir(), key=lambda p: p.stat().st_mtime)
 		if snapshots:
 			return str(snapshots[-1])
 	return None
-VARIANT    = sys.argv[1] if len(sys.argv) > 1 else 'turbo'
+
 VARIANT     = sys.argv[1] if len(sys.argv) > 1 else 'turbo'
 PORT        = int(os.environ.get('TTS_PORT', 11500))
 SAMPLE_RATE = 24000
 def log(msg):
 	"""Write msg to stderr with a '[chatterbox]' prefix and flush immediately."""
 	sys.stderr.write(f'[chatterbox] {msg}\n')
 	sys.stderr.flush()
 log(f'loading chatterbox-{VARIANT}...')
 t0 = time.time()
 import tempfile
 import traceback
 import numpy as np
 import torch
 import soundfile as sf
 import librosa as _librosa
 # librosa.resample returns float64 in newer numpy — patch it to always return float32
 _orig_resample = _librosa.resample
 def _resample_float32(*args, **kwargs):
 	"""Call the original librosa resample and cast its result to float32."""
 	resampled = _orig_resample(*args, **kwargs)
 	return resampled.astype(np.float32)
@@ -92,48 +100,53 @@ else:
 	model = Model.from_pretrained(device=device)
 log(f'ready on {device} ({time.time() - t0:.1f}s load time)')
-print('ready', flush=True)
+
 _wav_cache   = {}
 _chime_cache = {}
 _gen_lock    = threading.Lock()
 _SENTINEL      = object()
 playback_queue = queue.Queue()
-_wav_cache = {}
+def playback_worker():
 	while True:
 		item = playback_queue.get()
 		if item is _SENTINEL:
 			break
 		proc = subprocess.Popen(
 			['pacat', '--format=float32le', f'--rate={SAMPLE_RATE}', '--channels=1'],
 			stdin=subprocess.PIPE,
 		)
 		proc.stdin.write(item.tobytes())
 		proc.stdin.close()
 		proc.wait()
 		playback_queue.task_done()
 threading.Thread(target=playback_worker, daemon=True).start()
 def ensure_float32_wav(path):
 	"""Re-save audio as float32 mono WAV to work around librosa/numpy float64 issue.
 	Result is cached by input path so repeated calls with the same file are free.
 	Returns the path of the converted copy, a temporary file created with
 	delete=False that this function never removes."""
 	if path in _wav_cache:
 		return _wav_cache[path]
 	# always_2d=True keeps a channel axis even for mono input, so the mean below is always valid.
 	wav, sr = sf.read(path, dtype='float32', always_2d=True)
-	wav = wav.mean(axis=1)  # stereo → mono if needed
+	wav = wav.mean(axis=1)
 	# Only the temp file's name is used; sf.write is handed the path, not the open handle.
 	tmp = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
 	sf.write(tmp.name, wav, sr, subtype='FLOAT')
 	_wav_cache[path] = tmp.name
 	return tmp.name
-_SENTINEL = object()
+def load_chime(path):
-
+	if path in _chime_cache:
-playback_queue = queue.Queue()
+		return _chime_cache[path]
-
+	samples, sr = sf.read(path, dtype='float32', always_2d=True)
-
+	samples = samples.mean(axis=1)
-def playback_worker():
+	if sr != SAMPLE_RATE:
-	"""Plays audio samples in order. Runs in its own thread."""
+		samples = _librosa.resample(samples, orig_sr=sr, target_sr=SAMPLE_RATE)
-	while True:
+	_chime_cache[path] = samples
-		item = playback_queue.get()
+	return samples
 		if item is _SENTINEL:
 			break
 		samples = item
 		proc = subprocess.Popen(
 			['pacat', '--format=float32le', f'--rate={SAMPLE_RATE}', '--channels=1'],
 			stdin=subprocess.PIPE,
 		)
 		proc.stdin.write(samples.tobytes())
 		proc.stdin.close()
 		proc.wait()
 		playback_queue.task_done()
 playback_thread = threading.Thread(target=playback_worker, daemon=True)
 playback_thread.start()
 def generate(text, opts):
@@ -168,68 +181,74 @@ def generate(text, opts):
 	elapsed = time.time() - t1
 	duration = len(samples) / SAMPLE_RATE
 	log(f'generated {duration:.1f}s audio in {elapsed:.1f}s  rtf={elapsed/duration:.2f}')
 	return samples
-_chime_cache = {}
+class Handler(BaseHTTPRequestHandler):
 	def send_json(self, data, status=200):
 		"""Serialize data as JSON and send it as the complete HTTP response.

 		status is the HTTP status code; Content-Length is set from the encoded body.
 		"""
 		payload = json.dumps(data).encode()
 		response_headers = (
 			('Content-Type', 'application/json'),
 			('Content-Length', str(len(payload))),
 		)
 		self.send_response(status)
 		for header_name, header_value in response_headers:
 			self.send_header(header_name, header_value)
 		self.end_headers()
 		self.wfile.write(payload)
-def load_chime(path):
+	def read_json(self):
-	if path in _chime_cache:
+		length = int(self.headers.get('Content-Length', 0))
-		return _chime_cache[path]
+		return json.loads(self.rfile.read(length))
 	samples, sr = sf.read(path, dtype='float32', always_2d=True)
 	samples = samples.mean(axis=1)  # stereo → mono
 	if sr != SAMPLE_RATE:
 		samples = _librosa.resample(samples, orig_sr=sr, target_sr=SAMPLE_RATE)
 	_chime_cache[path] = samples
 	return samples
-
+	def do_POST(self):
 for line in sys.stdin:
 	line = line.strip()
 	if not line:
 		continue
 	try:
 		req = json.loads(line)
 	except json.JSONDecodeError:
 		req = {'text': line}
 	if 'preload' in req:
 		try:
-			load_chime(req['preload'])
+			req = self.read_json()
-			log(f'preloaded chime: {req["preload"]}')
+		except Exception:
-		except Exception as e:
+			self.send_json({'status': 'error', 'message': 'invalid JSON'}, 400)
-			log(f'preload error: {e}')
+			return
 		print('ok', flush=True)
 		continue
-	if 'chime' in req:
+		if self.path == '/speak':
-		try:
+			text = req.pop('text', '')
-			samples = load_chime(req['chime'])
+			if not text:
-			playback_queue.put(samples)
+				self.send_json({'status': 'ok'})
-		except Exception as e:
+				return
-			log(f'chime error: {e}')
+			try:
-			traceback.print_exc(file=sys.stderr)
+				with _gen_lock:
-		print('ok', flush=True)
+					samples = generate(text, req)
-		continue
+				playback_queue.put(samples)
 				self.send_json({'status': 'ok'})
 			except Exception as e:
 				traceback.print_exc(file=sys.stderr)
 				self.send_json({'status': 'error', 'message': str(e)}, 500)
-	text = req.pop('text', '')
+		elif self.path == '/chime':
-	opts = req
+			path = req.get('path', '')
 			try:
 				samples = load_chime(path)
 				playback_queue.put(samples)
 				self.send_json({'status': 'ok'})
 			except Exception as e:
 				traceback.print_exc(file=sys.stderr)
 				self.send_json({'status': 'error', 'message': str(e)}, 500)
-	if not text:
+		elif self.path == '/preload':
-		print('ok', flush=True)
+			path = req.get('path', '')
-		continue
+			try:
 				load_chime(path)
 				log(f'preloaded: {path}')
 				self.send_json({'status': 'ok'})
 			except Exception as e:
 				self.send_json({'status': 'error', 'message': str(e)}, 500)
-	try:
+		else:
-		samples = generate(text, opts)
+			self.send_json({'status': 'error', 'message': 'not found'}, 404)
 		playback_queue.put(samples)
 	except Exception as e:
 		log(f'error: {e}')
 		traceback.print_exc(file=sys.stderr)
-	print('ok', flush=True)
+	def log_message(self, fmt, *args):
 		log(fmt % args)
-# Drain playback before exit
+
-playback_queue.put(_SENTINEL)
+server = ThreadingHTTPServer(('', PORT), Handler)
-playback_thread.join()
+log(f'listening on port {PORT}')
 try:
 	server.serve_forever()
 except KeyboardInterrupt:
 	pass
 finally:
 	# Stop accepting connections and release the listening socket.
 	server.server_close()
 	# Drain playback before exit: the playback worker is a daemon thread, so
 	# anything still queued would be cut off once the main thread returns.
 	# join() returns once every queued buffer has been marked task_done();
 	# a second Ctrl-C interrupts the wait if playback is stuck.
 	playback_queue.join()
 	playback_queue.put(_SENTINEL)
--- a/examples/chime.mjs
+++ b/examples/chime.mjs
@@ -0,0 +1,22 @@
 // Play a chime WAV file via the Chatterbox TTS server.
 // Usage: node chime.mjs /path/to/chime.wav
 const PORT = process.env.TTS_PORT ?? '11500'
 const path = process.argv[2]
 if (!path) {
 	console.error('usage: node chime.mjs /path/to/chime.wav')
 	process.exit(1)
 }
 const res = await fetch(`http://localhost:${PORT}/chime`, {
 	method: 'POST',
 	headers: { 'Content-Type': 'application/json' },
 	body: JSON.stringify({ path }),
 })
 const data = await res.json()
 if (data.status !== 'ok') {
 	console.error('error:', data.message)
 	process.exit(1)
 }
--- a/examples/speak.mjs
+++ b/examples/speak.mjs
@@ -0,0 +1,17 @@
 // Speak text via the Chatterbox TTS server.
 // Usage: node speak.mjs "Hello world"
 // The port comes from TTS_PORT (default 11500); the text defaults to a greeting.
 const port = process.env.TTS_PORT ?? '11500'
 const [, , text = 'Hello from Node.'] = process.argv
 const endpoint = `http://localhost:${port}/speak`
 const response = await fetch(endpoint, {
 	method: 'POST',
 	headers: { 'Content-Type': 'application/json' },
 	body: JSON.stringify({ text }),
 })
 // Anything other than status "ok" is reported and turned into a non-zero exit code.
 const { status, message } = await response.json()
 if (status !== 'ok') {
 	console.error('error:', message)
 	process.exit(1)
 }
--- a/examples/voice-clone.mjs
+++ b/examples/voice-clone.mjs
@@ -0,0 +1,24 @@
 // Speak text using a reference WAV for voice cloning.
 // The server reads the audio_prompt path from its own filesystem.
 // Usage: node voice-clone.mjs /path/to/reference.wav "Text to speak"
 const PORT        = process.env.TTS_PORT ?? '11500'
 const audio_prompt = process.argv[2]
 const text         = process.argv[3] ?? 'Hello, this is a cloned voice.'
 if (!audio_prompt) {
 	console.error('usage: node voice-clone.mjs /path/to/reference.wav "text"')
 	process.exit(1)
 }
 const res = await fetch(`http://localhost:${PORT}/speak`, {
 	method: 'POST',
 	headers: { 'Content-Type': 'application/json' },
 	body: JSON.stringify({ text, audio_prompt }),
 })
 const data = await res.json()
 if (data.status !== 'ok') {
 	console.error('error:', data.message)
 	process.exit(1)
 }