From f6ff8c72e854f02d36e7cfeb3e1688bdba296d59 Mon Sep 17 00:00:00 2001 From: mikael-lovqvists-claude-agent Date: Sun, 7 Jun 2026 07:28:18 +0000 Subject: [PATCH] Convert chatterbox-server.py to HTTP server, add Node.js examples Replace stdin/stdout JSON line protocol with a stdlib HTTP server (ThreadingHTTPServer). Three endpoints: POST /speak, /chime, /preload. All return {"status": "ok"} after audio is queued for playback. TTS generation is serialized via a threading.Lock; concurrent chime/preload requests are handled without waiting for generation. Add examples/speak.mjs, chime.mjs, voice-clone.mjs using Node.js built-in fetch (no libraries required, Node 18+). Co-Authored-By: Claude Sonnet 4.6 --- chatterbox-server.py | 217 +++++++++++++++++++++------------------ examples/chime.mjs | 22 ++++ examples/speak.mjs | 17 +++ examples/voice-clone.mjs | 24 +++++ 4 files changed, 181 insertions(+), 99 deletions(-) create mode 100644 examples/chime.mjs create mode 100644 examples/speak.mjs create mode 100644 examples/voice-clone.mjs diff --git a/chatterbox-server.py b/chatterbox-server.py index 4b8640e..46af4aa 100755 --- a/chatterbox-server.py +++ b/chatterbox-server.py @@ -1,13 +1,19 @@ #!/usr/bin/env -S bash -c 'exec "$(dirname "$0")/venv/bin/python3" "$0" "$@"' """ -Chatterbox TTS server — keeps model loaded, reads JSON lines from stdin. +Chatterbox TTS HTTP server — keeps model loaded, exposes a JSON HTTP API. -Protocol: - stdin: {"text": "...", "temperature": 0.8, "top_p": 0.95} - {"chime": "/path/to/file.wav"} - {"preload": "/path/to/file.wav"} - stdout: "ok\n" after each utterance is generated (playback may still be in progress) - stderr: status/timing messages +Endpoints: + POST /speak {"text": "...", "temperature": 0.8, "top_p": 0.95, "audio_prompt": "/path.wav"} + POST /chime {"path": "/path/to/file.wav"} + POST /preload {"path": "/path/to/file.wav"} + +All endpoints return {"status": "ok"} or {"status": "error", "message": "..."}. +Responses are sent after audio is queued for playback (not after playback finishes). + +Environment: + TTS_PORT TCP port to listen on (default: 11500) + HF_TOKEN_FILE path to HuggingFace token file (default: ~/.secrets/hugging-face.token) + HF_HUB_CACHE path to HuggingFace hub cache (default: ~/.cache/huggingface/hub) Usage: ./chatterbox-server.py @@ -28,6 +34,10 @@ import time import queue import threading import subprocess +import traceback +import tempfile +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer +from pathlib import Path import numpy as np TOKEN_FILE = os.environ.get('HF_TOKEN_FILE', os.path.expanduser('~/.secrets/hugging-face.token')) @@ -37,35 +47,33 @@ try: except FileNotFoundError: pass -def find_hf_cache(repo_id): - """Return the local snapshot path if the model is already cached, else None.""" - from pathlib import Path - cache_dir = Path(os.environ.get('HF_HUB_CACHE', os.path.expanduser('~/.cache/huggingface/hub'))) - repo_dir = cache_dir / f"models--{repo_id.replace('/', '--')}" / 'snapshots' +def find_hf_cache(repo_id): + cache_dir = Path(os.environ.get('HF_HUB_CACHE', os.path.expanduser('~/.cache/huggingface/hub'))) + repo_dir = cache_dir / f"models--{repo_id.replace('/', '--')}" / 'snapshots' if repo_dir.exists(): snapshots = sorted(repo_dir.iterdir(), key=lambda p: p.stat().st_mtime) if snapshots: return str(snapshots[-1]) return None -VARIANT = sys.argv[1] if len(sys.argv) > 1 else 'turbo' + +VARIANT = sys.argv[1] if len(sys.argv) > 1 else 'turbo' +PORT = int(os.environ.get('TTS_PORT', 11500)) SAMPLE_RATE = 24000 + def log(msg): print(f'[chatterbox] {msg}', file=sys.stderr, flush=True) + log(f'loading chatterbox-{VARIANT}...') t0 = time.time() -import tempfile -import traceback -import numpy as np import torch import soundfile as sf import librosa as _librosa -# librosa.resample returns float64 in newer numpy — patch it to always return float32 _orig_resample = _librosa.resample def _resample_float32(*args, **kwargs): return _orig_resample(*args, **kwargs).astype(np.float32) @@ -92,48 +100,53 @@ else: model = Model.from_pretrained(device=device) log(f'ready on {device} ({time.time() - t0:.1f}s load time)') -print('ready', flush=True) + +_wav_cache = {} +_chime_cache = {} +_gen_lock = threading.Lock() + +_SENTINEL = object() +playback_queue = queue.Queue() -_wav_cache = {} +def playback_worker(): + while True: + item = playback_queue.get() + if item is _SENTINEL: + break + proc = subprocess.Popen( + ['pacat', '--format=float32le', f'--rate={SAMPLE_RATE}', '--channels=1'], + stdin=subprocess.PIPE, + ) + proc.stdin.write(item.tobytes()) + proc.stdin.close() + proc.wait() + playback_queue.task_done() + + +threading.Thread(target=playback_worker, daemon=True).start() + def ensure_float32_wav(path): - """Re-save audio as float32 mono WAV to work around librosa/numpy float64 issue. - Result is cached by input path so repeated calls with the same file are free.""" if path in _wav_cache: return _wav_cache[path] wav, sr = sf.read(path, dtype='float32', always_2d=True) - wav = wav.mean(axis=1) # stereo → mono if needed + wav = wav.mean(axis=1) tmp = tempfile.NamedTemporaryFile(suffix='.wav', delete=False) sf.write(tmp.name, wav, sr, subtype='FLOAT') _wav_cache[path] = tmp.name return tmp.name -_SENTINEL = object() - -playback_queue = queue.Queue() - - -def playback_worker(): - """Plays audio samples in order. Runs in its own thread.""" - while True: - item = playback_queue.get() - if item is _SENTINEL: - break - samples = item - proc = subprocess.Popen( - ['pacat', '--format=float32le', f'--rate={SAMPLE_RATE}', '--channels=1'], - stdin=subprocess.PIPE, - ) - proc.stdin.write(samples.tobytes()) - proc.stdin.close() - proc.wait() - playback_queue.task_done() - - -playback_thread = threading.Thread(target=playback_worker, daemon=True) -playback_thread.start() +def load_chime(path): + if path in _chime_cache: + return _chime_cache[path] + samples, sr = sf.read(path, dtype='float32', always_2d=True) + samples = samples.mean(axis=1) + if sr != SAMPLE_RATE: + samples = _librosa.resample(samples, orig_sr=sr, target_sr=SAMPLE_RATE) + _chime_cache[path] = samples + return samples def generate(text, opts): @@ -168,68 +181,74 @@ def generate(text, opts): elapsed = time.time() - t1 duration = len(samples) / SAMPLE_RATE log(f'generated {duration:.1f}s audio in {elapsed:.1f}s rtf={elapsed/duration:.2f}') - return samples -_chime_cache = {} +class Handler(BaseHTTPRequestHandler): + def send_json(self, data, status=200): + body = json.dumps(data).encode() + self.send_response(status) + self.send_header('Content-Type', 'application/json') + self.send_header('Content-Length', str(len(body))) + self.end_headers() + self.wfile.write(body) -def load_chime(path): - if path in _chime_cache: - return _chime_cache[path] - samples, sr = sf.read(path, dtype='float32', always_2d=True) - samples = samples.mean(axis=1) # stereo → mono - if sr != SAMPLE_RATE: - samples = _librosa.resample(samples, orig_sr=sr, target_sr=SAMPLE_RATE) - _chime_cache[path] = samples - return samples + def read_json(self): + length = int(self.headers.get('Content-Length', 0)) + return json.loads(self.rfile.read(length)) - -for line in sys.stdin: - line = line.strip() - if not line: - continue - - try: - req = json.loads(line) - except json.JSONDecodeError: - req = {'text': line} - - if 'preload' in req: + def do_POST(self): try: - load_chime(req['preload']) - log(f'preloaded chime: {req["preload"]}') - except Exception as e: - log(f'preload error: {e}') - print('ok', flush=True) - continue + req = self.read_json() + except Exception: + self.send_json({'status': 'error', 'message': 'invalid JSON'}, 400) + return - if 'chime' in req: - try: - samples = load_chime(req['chime']) - playback_queue.put(samples) - except Exception as e: - log(f'chime error: {e}') - traceback.print_exc(file=sys.stderr) - print('ok', flush=True) - continue + if self.path == '/speak': + text = req.pop('text', '') + if not text: + self.send_json({'status': 'ok'}) + return + try: + with _gen_lock: + samples = generate(text, req) + playback_queue.put(samples) + self.send_json({'status': 'ok'}) + except Exception as e: + traceback.print_exc(file=sys.stderr) + self.send_json({'status': 'error', 'message': str(e)}, 500) - text = req.pop('text', '') - opts = req + elif self.path == '/chime': + path = req.get('path', '') + try: + samples = load_chime(path) + playback_queue.put(samples) + self.send_json({'status': 'ok'}) + except Exception as e: + traceback.print_exc(file=sys.stderr) + self.send_json({'status': 'error', 'message': str(e)}, 500) - if not text: - print('ok', flush=True) - continue + elif self.path == '/preload': + path = req.get('path', '') + try: + load_chime(path) + log(f'preloaded: {path}') + self.send_json({'status': 'ok'}) + except Exception as e: + self.send_json({'status': 'error', 'message': str(e)}, 500) - try: - samples = generate(text, opts) - playback_queue.put(samples) - except Exception as e: - log(f'error: {e}') - traceback.print_exc(file=sys.stderr) + else: + self.send_json({'status': 'error', 'message': 'not found'}, 404) - print('ok', flush=True) + def log_message(self, fmt, *args): + log(fmt % args) -# Drain playback before exit -playback_queue.put(_SENTINEL) -playback_thread.join() + +server = ThreadingHTTPServer(('', PORT), Handler) +log(f'listening on port {PORT}') +try: + server.serve_forever() +except KeyboardInterrupt: + pass +finally: + playback_queue.put(_SENTINEL) diff --git a/examples/chime.mjs b/examples/chime.mjs new file mode 100644 index 0000000..0247451 --- /dev/null +++ b/examples/chime.mjs @@ -0,0 +1,22 @@ +// Play a chime WAV file via the Chatterbox TTS server. +// Usage: node chime.mjs /path/to/chime.wav + +const PORT = process.env.TTS_PORT ?? '11500' +const path = process.argv[2] + +if (!path) { + console.error('usage: node chime.mjs /path/to/chime.wav') + process.exit(1) +} + +const res = await fetch(`http://localhost:${PORT}/chime`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ path }), +}) + +const data = await res.json() +if (data.status !== 'ok') { + console.error('error:', data.message) + process.exit(1) +} diff --git a/examples/speak.mjs b/examples/speak.mjs new file mode 100644 index 0000000..c609de3 --- /dev/null +++ b/examples/speak.mjs @@ -0,0 +1,17 @@ +// Speak text via the Chatterbox TTS server. +// Usage: node speak.mjs "Hello world" + +const PORT = process.env.TTS_PORT ?? '11500' +const text = process.argv[2] ?? 'Hello from Node.' + +const res = await fetch(`http://localhost:${PORT}/speak`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ text }), +}) + +const data = await res.json() +if (data.status !== 'ok') { + console.error('error:', data.message) + process.exit(1) +} diff --git a/examples/voice-clone.mjs b/examples/voice-clone.mjs new file mode 100644 index 0000000..c8998b9 --- /dev/null +++ b/examples/voice-clone.mjs @@ -0,0 +1,24 @@ +// Speak text using a reference WAV for voice cloning. +// The server reads the audio_prompt path from its own filesystem. +// Usage: node voice-clone.mjs /path/to/reference.wav "Text to speak" + +const PORT = process.env.TTS_PORT ?? '11500' +const audio_prompt = process.argv[2] +const text = process.argv[3] ?? 'Hello, this is a cloned voice.' + +if (!audio_prompt) { + console.error('usage: node voice-clone.mjs /path/to/reference.wav "text"') + process.exit(1) +} + +const res = await fetch(`http://localhost:${PORT}/speak`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ text, audio_prompt }), +}) + +const data = await res.json() +if (data.status !== 'ok') { + console.error('error:', data.message) + process.exit(1) +}