Files
tts-server/chatterbox-server.py
mikael-lovqvists-claude-agent f6ff8c72e8 Convert chatterbox-server.py to HTTP server, add Node.js examples
Replace stdin/stdout JSON line protocol with a stdlib HTTP server
(ThreadingHTTPServer). Three endpoints: POST /speak, /chime, /preload.
All return {"status": "ok"} after audio is queued for playback.
TTS generation is serialized via a threading.Lock; concurrent chime/preload
requests are handled without waiting for generation.

Add examples/speak.mjs, chime.mjs, voice-clone.mjs using Node.js built-in
fetch (no libraries required, Node 18+).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-07 07:28:18 +00:00

255 lines
7.1 KiB
Python
Executable File

#!/usr/bin/env -S bash -c 'exec "$(dirname "$0")/venv/bin/python3" "$0" "$@"'
"""
Chatterbox TTS HTTP server — keeps model loaded, exposes a JSON HTTP API.
Endpoints:
POST /speak {"text": "...", "temperature": 0.8, "top_p": 0.95, "audio_prompt": "/path.wav"}
POST /chime {"path": "/path/to/file.wav"}
POST /preload {"path": "/path/to/file.wav"}
All endpoints return {"status": "ok"} or {"status": "error", "message": "..."}.
Responses are sent after audio is queued for playback (not after playback finishes).
Environment:
TTS_PORT TCP port to listen on (default: 11500)
HF_TOKEN_FILE path to HuggingFace token file (default: ~/.secrets/hugging-face.token)
HF_HUB_CACHE path to HuggingFace hub cache (default: ~/.cache/huggingface/hub)
Usage:
./chatterbox-server.py
./chatterbox-server.py turbo # default
./chatterbox-server.py full # original model, supports exaggeration
Paralinguistic tags supported in text:
[laugh] [chuckle] [cough] [clear throat] [sigh] [shush] [groan] [sniff] [gasp]
Full model only:
exaggeration 0.0-1.0 emotion intensity (ignored in turbo)
"""
import os
import sys
import json
import time
import queue
import threading
import subprocess
import traceback
import tempfile
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from pathlib import Path
import numpy as np
# Export the HuggingFace token, when a token file is present, before any of
# the model libraries are imported further down. A missing file is not an
# error: HF_TOKEN is simply left the way the environment had it.
TOKEN_FILE = os.environ.get(
    'HF_TOKEN_FILE',
    os.path.expanduser('~/.secrets/hugging-face.token'),
)
try:
    with open(TOKEN_FILE) as token_file:
        os.environ['HF_TOKEN'] = token_file.read().strip()
except FileNotFoundError:
    pass
def find_hf_cache(repo_id):
    """Return the newest locally cached snapshot directory for *repo_id*.

    Looks in ``models--<org>--<name>/snapshots`` below ``$HF_HUB_CACHE``
    (default ``~/.cache/huggingface/hub``) and picks the sub-directory with
    the most recent modification time.

    Args:
        repo_id: HuggingFace repo id such as ``'ResembleAI/chatterbox'``.

    Returns:
        The snapshot directory as a string, or ``None`` when no snapshot
        directory is cached.
    """
    cache_dir = Path(os.environ.get('HF_HUB_CACHE', os.path.expanduser('~/.cache/huggingface/hub')))
    repo_dir = cache_dir / f"models--{repo_id.replace('/', '--')}" / 'snapshots'
    if not repo_dir.is_dir():
        return None
    # Only directories can be snapshots; a stray file left in the folder must
    # never be returned just because it happens to be the newest entry.
    snapshots = sorted(
        (entry for entry in repo_dir.iterdir() if entry.is_dir()),
        key=lambda p: p.stat().st_mtime,
    )
    if not snapshots:
        return None
    return str(snapshots[-1])
# Model variant is the first command-line argument; 'turbo' when none is given.
VARIANT = 'turbo' if len(sys.argv) < 2 else sys.argv[1]
# TCP port for the HTTP API, overridable through the TTS_PORT variable.
PORT = int(os.environ.get('TTS_PORT', 11500))
# Sample rate in Hz passed to pacat and used as the resample target for chimes.
SAMPLE_RATE = 24000
def log(msg):
    """Print *msg* to stderr with the ``[chatterbox]`` prefix and flush."""
    line = f'[chatterbox] {msg}'
    print(line, file=sys.stderr, flush=True)
# HuggingFace repo id for each supported model variant.
REPO_IDS = {
    'turbo': 'ResembleAI/chatterbox-turbo',
    'full': 'ResembleAI/chatterbox',
}

# Fail fast on an unknown variant. Without this check any unrecognized
# argument imported the full model and then died with a bare KeyError on the
# REPO_IDS lookup, only after the slow imports below had finished.
if VARIANT not in REPO_IDS:
    sys.exit(f"[chatterbox] unknown variant {VARIANT!r}; expected one of: {', '.join(REPO_IDS)}")

log(f'loading chatterbox-{VARIANT}...')
t0 = time.time()

# Heavy imports happen after t0 is taken, so the load time reported at the
# end includes them.
import torch
import soundfile as sf
import librosa as _librosa

_orig_resample = _librosa.resample


def _resample_float32(*args, **kwargs):
    """Call the original ``librosa.resample`` and cast the result to float32."""
    return _orig_resample(*args, **kwargs).astype(np.float32)


# Replace librosa.resample module-wide with the float32 wrapper.
# NOTE(review): presumably this is so resampling done inside chatterbox also
# yields float32 -- confirm against the chatterbox sources.
_librosa.resample = _resample_float32

device = 'cuda' if torch.cuda.is_available() else 'cpu'

if VARIANT == 'turbo':
    from chatterbox.tts_turbo import ChatterboxTurboTTS as Model
else:
    from chatterbox.tts import ChatterboxTTS as Model

# Prefer a snapshot that is already in the local hub cache; otherwise let the
# model class fetch its weights.
cached = find_hf_cache(REPO_IDS[VARIANT])
if cached:
    log(f'loading from cache: {cached}')
    model = Model.from_local(cached, device=device)
else:
    log('cache not found, downloading...')
    model = Model.from_pretrained(device=device)

log(f'ready on {device} ({time.time() - t0:.1f}s load time)')
# Source audio path -> path of its converted temp WAV (filled by ensure_float32_wav).
_wav_cache = {}
# Chime path -> decoded samples (filled by load_chime).
_chime_cache = {}
# Held around generate() in the /speak handler so generations run one at a time.
_gen_lock = threading.Lock()
# Unique marker object; taking it from playback_queue makes playback_worker exit.
_SENTINEL = object()
# Sample arrays waiting to be played; consumed by playback_worker.
playback_queue = queue.Queue()
def playback_worker():
    """Play queued sample arrays one at a time through ``pacat``.

    Runs until ``_SENTINEL`` is taken from ``playback_queue``. Every other
    item is sent as raw bytes (``item.tobytes()``) to a fresh ``pacat``
    process configured for mono float32le at ``SAMPLE_RATE``; the worker
    waits for that process to exit before taking the next item.

    A failure while playing one item (``pacat`` not installed, pipe closed
    early, ...) is logged and the worker carries on. Previously such a
    failure ended the thread, after which /speak and /chime kept answering
    "ok" while nothing was ever played again.
    """
    while True:
        item = playback_queue.get()
        try:
            if item is _SENTINEL:
                break
            proc = subprocess.Popen(
                ['pacat', '--format=float32le', f'--rate={SAMPLE_RATE}', '--channels=1'],
                stdin=subprocess.PIPE,
            )
            # communicate() writes the data, closes stdin and waits for exit.
            proc.communicate(item.tobytes())
        except Exception as exc:
            log(f'playback failed: {exc}')
        finally:
            # Account for every dequeued item (sentinel included) so that a
            # playback_queue.join() can never hang on a missed task_done().
            playback_queue.task_done()
# Start the playback thread; daemon=True so it never keeps the process alive at exit.
threading.Thread(target=playback_worker, daemon=True).start()
def ensure_float32_wav(path):
    """Return the path of a mono float32 WAV copy of the audio at *path*.

    The file is read with soundfile, its channels are averaged into one, and
    the result is written at the original sample rate to a new temporary
    ``.wav`` file using the ``FLOAT`` subtype. Conversions are remembered in
    ``_wav_cache`` so each source path is converted once per process.

    Args:
        path: path of the source audio file.

    Returns:
        Path of the converted temporary file.

    Errors from reading or writing the audio propagate to the caller.
    """
    cached = _wav_cache.get(path)
    if cached is not None:
        return cached
    wav, sr = sf.read(path, dtype='float32', always_2d=True)
    mono = wav.mean(axis=1)
    # Only the file name is needed, so close the handle right away instead of
    # leaking one open file descriptor per converted prompt. delete=False
    # keeps the file on disk after the handle is closed.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        tmp_path = tmp.name
    try:
        sf.write(tmp_path, mono, sr, subtype='FLOAT')
    except Exception:
        # Do not leave an empty or partial temp file behind on failure.
        os.unlink(tmp_path)
        raise
    _wav_cache[path] = tmp_path
    return tmp_path
def load_chime(path):
    """Return the audio at *path* as mono samples at ``SAMPLE_RATE``.

    The file is read as float32, its channels are averaged into one, and the
    result is resampled when the file's rate differs from ``SAMPLE_RATE``.
    Results are memoized in ``_chime_cache`` keyed by *path*.
    """
    try:
        return _chime_cache[path]
    except KeyError:
        pass
    data, rate = sf.read(path, dtype='float32', always_2d=True)
    mono = data.mean(axis=1)
    if rate != SAMPLE_RATE:
        mono = _librosa.resample(mono, orig_sr=rate, target_sr=SAMPLE_RATE)
    _chime_cache[path] = mono
    return mono
def generate(text, opts):
    """Synthesize *text* with the loaded model and return float32 samples.

    Args:
        text: the text to speak.
        opts: dict of optional overrides; keys that are not listed here are
            ignored.
            turbo variant: ``temperature``, ``top_p``, ``top_k``,
            ``repetition_penalty``, ``min_p``.
            any other variant: ``temperature``, ``top_p``,
            ``repetition_penalty``, ``min_p``, ``exaggeration``,
            ``cfg_weight``.
            both: ``audio_prompt``, a path that is converted with
            ``ensure_float32_wav`` and passed on as ``audio_prompt_path``.

    Returns:
        The model output as a float32 numpy array (first dimension squeezed).
    """
    started = time.time()
    if VARIANT == 'turbo':
        defaults = {
            'temperature': 0.8,
            'top_p': 0.95,
            'top_k': 1000,
            'repetition_penalty': 1.2,
            'min_p': 0.0,
        }
    else:
        defaults = {
            'temperature': 0.8,
            'top_p': 1.0,
            'repetition_penalty': 1.2,
            'min_p': 0.05,
            'exaggeration': 0.5,
            'cfg_weight': 0.5,
        }
    kwargs = {name: opts.get(name, fallback) for name, fallback in defaults.items()}

    audio_prompt = opts.get('audio_prompt')
    if audio_prompt:
        kwargs['audio_prompt_path'] = ensure_float32_wav(audio_prompt)

    with torch.inference_mode():
        wav = model.generate(text, **kwargs)
    samples = wav.squeeze(0).cpu().numpy().astype(np.float32)

    elapsed = time.time() - started
    duration = len(samples) / SAMPLE_RATE
    # An empty result has zero duration; the log line must not raise
    # ZeroDivisionError and turn a finished generation into an error reply.
    rtf = f'{elapsed / duration:.2f}' if duration else 'n/a'
    log(f'generated {duration:.1f}s audio in {elapsed:.1f}s rtf={rtf}')
    return samples
class Handler(BaseHTTPRequestHandler):
    """HTTP request handler for POST /speak, /chime and /preload.

    Every reply is a JSON object: ``{"status": "ok"}`` on success, or
    ``{"status": "error", "message": ...}`` together with a 4xx/5xx status.
    """

    def send_json(self, data, status=200):
        """Send *data* as a JSON body with HTTP status *status*."""
        body = json.dumps(data).encode()
        self.send_response(status)
        self.send_header('Content-Type', 'application/json')
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def read_json(self):
        """Read ``Content-Length`` bytes of request body and decode them as JSON.

        A missing header is treated as a length of 0.

        Raises:
            ValueError: for a non-integer or negative Content-Length, or a
                body that is not valid JSON.
        """
        length = int(self.headers.get('Content-Length', 0))
        if length < 0:
            # read(-1) means "read until EOF" and would stall this thread for
            # as long as the client keeps the connection open.
            raise ValueError('negative Content-Length')
        return json.loads(self.rfile.read(length))

    def do_POST(self):
        """Dispatch a POST request to the endpoint named by ``self.path``."""
        try:
            req = self.read_json()
        except Exception:
            self.send_json({'status': 'error', 'message': 'invalid JSON'}, 400)
            return

        if not isinstance(req, dict):
            # Valid JSON that is not an object (list, string, number, null)
            # used to blow up on req.pop()/req.get() outside any try block,
            # dropping the connection without a response.
            self.send_json({'status': 'error', 'message': 'request body must be a JSON object'}, 400)
            return

        if self.path == '/speak':
            text = req.pop('text', '')
            if not text:
                # Nothing to say is not an error.
                self.send_json({'status': 'ok'})
                return
            try:
                # Generation is serialized; queueing inside the lock keeps the
                # playback order equal to the generation order.
                with _gen_lock:
                    samples = generate(text, req)
                    playback_queue.put(samples)
                self.send_json({'status': 'ok'})
            except Exception as e:
                traceback.print_exc(file=sys.stderr)
                self.send_json({'status': 'error', 'message': str(e)}, 500)

        elif self.path == '/chime':
            path = req.get('path', '')
            try:
                samples = load_chime(path)
                playback_queue.put(samples)
                self.send_json({'status': 'ok'})
            except Exception as e:
                traceback.print_exc(file=sys.stderr)
                self.send_json({'status': 'error', 'message': str(e)}, 500)

        elif self.path == '/preload':
            path = req.get('path', '')
            try:
                load_chime(path)
                log(f'preloaded: {path}')
                self.send_json({'status': 'ok'})
            except Exception as e:
                # Log the traceback like the other endpoints do.
                traceback.print_exc(file=sys.stderr)
                self.send_json({'status': 'error', 'message': str(e)}, 500)

        else:
            self.send_json({'status': 'error', 'message': 'not found'}, 404)

    def log_message(self, fmt, *args):
        """Route the base class's request logging through ``log``."""
        log(fmt % args)
# Listen on PORT (empty host string) and serve until interrupted with Ctrl-C.
server = ThreadingHTTPServer(('', PORT), Handler)
log(f'listening on port {PORT}')
try:
    server.serve_forever()
except KeyboardInterrupt:
    pass
finally:
    # Tell the playback worker to stop, then release the listening socket,
    # which was previously never closed.
    playback_queue.put(_SENTINEL)
    server.server_close()