Replace stdin/stdout JSON line protocol with a stdlib HTTP server
(ThreadingHTTPServer). Three endpoints: POST /speak, /chime, /preload.
All return {"status": "ok"} after audio is queued for playback.
TTS generation is serialized via a threading.Lock; concurrent chime/preload
requests are handled without waiting for generation.
Add examples/speak.mjs, chime.mjs, voice-clone.mjs using Node.js built-in
fetch (no libraries required, Node 18+).
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
255 lines
7.1 KiB
Python
Executable File
255 lines
7.1 KiB
Python
Executable File
#!/usr/bin/env -S bash -c 'exec "$(dirname "$0")/venv/bin/python3" "$0" "$@"'
|
|
"""
|
|
Chatterbox TTS HTTP server — keeps model loaded, exposes a JSON HTTP API.
|
|
|
|
Endpoints:
|
|
POST /speak {"text": "...", "temperature": 0.8, "top_p": 0.95, "audio_prompt": "/path.wav"}
|
|
POST /chime {"path": "/path/to/file.wav"}
|
|
POST /preload {"path": "/path/to/file.wav"}
|
|
|
|
All endpoints return {"status": "ok"} or {"status": "error", "message": "..."}.
|
|
Responses are sent after audio is queued for playback (not after playback finishes); /preload only caches the file and queues nothing.
|
|
|
|
Environment:
|
|
TTS_PORT TCP port to listen on (default: 11500)
|
|
HF_TOKEN_FILE path to HuggingFace token file (default: ~/.secrets/hugging-face.token)
|
|
HF_HUB_CACHE path to HuggingFace hub cache (default: ~/.cache/huggingface/hub)
|
|
|
|
Usage:
|
|
./chatterbox-server.py
|
|
./chatterbox-server.py turbo # default
|
|
./chatterbox-server.py full # original model, supports exaggeration
|
|
|
|
Paralinguistic tags supported in text:
|
|
[laugh] [chuckle] [cough] [clear throat] [sigh] [shush] [groan] [sniff] [gasp]
|
|
|
|
Full model only:
|
|
exaggeration 0.0-1.0 emotion intensity (ignored in turbo)
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
import time
|
|
import queue
|
|
import threading
|
|
import subprocess
|
|
import traceback
|
|
import tempfile
|
|
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
|
|
from pathlib import Path
|
|
import numpy as np
|
|
|
|
# Optional HuggingFace token: when the token file exists, its stripped
# contents are exported as HF_TOKEN. A missing file is deliberately ignored.
TOKEN_FILE = os.environ.get(
    'HF_TOKEN_FILE',
    os.path.expanduser('~/.secrets/hugging-face.token'),
)
try:
    _token_fh = open(TOKEN_FILE)
except FileNotFoundError:
    pass
else:
    with _token_fh:
        os.environ['HF_TOKEN'] = _token_fh.read().strip()
|
|
|
|
|
|
def find_hf_cache(repo_id):
    """Return the most recently modified cached snapshot directory for *repo_id*.

    The hub cache root is taken from ``HF_HUB_CACHE`` (default
    ``~/.cache/huggingface/hub``); snapshots are looked up under
    ``models--<org>--<name>/snapshots``.

    Args:
        repo_id: HuggingFace repo id such as ``'org/name'``.

    Returns:
        The snapshot directory path as a string, or ``None`` when the
        snapshots directory is missing or contains no snapshot directories.
    """
    cache_root = Path(os.environ.get('HF_HUB_CACHE', os.path.expanduser('~/.cache/huggingface/hub')))
    snapshots_dir = cache_root / f"models--{repo_id.replace('/', '--')}" / 'snapshots'
    if not snapshots_dir.is_dir():
        return None
    # Only directories can be snapshots; stray files (e.g. .DS_Store) must not
    # win the "newest entry" selection and be handed to the model loader.
    snapshots = sorted(
        (entry for entry in snapshots_dir.iterdir() if entry.is_dir()),
        key=lambda entry: entry.stat().st_mtime,
    )
    return str(snapshots[-1]) if snapshots else None
|
|
|
|
|
|
# Model variant: first command-line argument, 'turbo' when none is given.
_cli_args = sys.argv[1:]
VARIANT = _cli_args[0] if _cli_args else 'turbo'

# TCP port for the HTTP server; TTS_PORT overrides the default.
PORT = int(os.environ['TTS_PORT']) if 'TTS_PORT' in os.environ else 11500

# Sample rate in Hz handed to pacat and used as the chime resample target.
SAMPLE_RATE = 24000
|
|
|
|
|
|
def log(msg):
    """Print *msg* to stderr with the ``[chatterbox]`` prefix, flushing at once."""
    line = f'[chatterbox] {msg}'
    print(line, file=sys.stderr, flush=True)
|
|
|
|
|
|
# Announce the load first, then start the timer that the "ready" log line
# later reports as load time.
_loading_msg = f'loading chatterbox-{VARIANT}...'
log(_loading_msg)
t0 = time.time()
|
|
|
|
import torch
|
|
import soundfile as sf
|
|
import librosa as _librosa
|
|
|
|
# Keep a handle on the real implementation before replacing it below.
_orig_resample = _librosa.resample


def _resample_float32(*args, **kwargs):
    """Call the original ``librosa.resample`` and cast its result to float32.

    NOTE(review): presumably needed because downstream code expects float32
    audio rather than whatever dtype librosa returns — confirm.
    """
    resampled = _orig_resample(*args, **kwargs)
    return resampled.astype(np.float32)


# Monkey-patch: every later lookup of ``librosa.resample`` gets the wrapper.
_librosa.resample = _resample_float32
|
|
|
|
# HuggingFace repo backing each supported model variant.
REPO_IDS = {
    'turbo': 'ResembleAI/chatterbox-turbo',
    'full': 'ResembleAI/chatterbox',
}

# Fail fast with a readable message; otherwise an unknown variant would import
# the full model class and then die with a bare KeyError on the repo lookup.
if VARIANT not in REPO_IDS:
    log(f'unknown variant {VARIANT!r}; expected one of: {", ".join(REPO_IDS)}')
    sys.exit(2)

# Prefer the GPU when torch can see one.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

if VARIANT == 'turbo':
    from chatterbox.tts_turbo import ChatterboxTurboTTS as Model
else:
    from chatterbox.tts import ChatterboxTTS as Model

# Load from the local hub cache when a snapshot exists; otherwise let the
# library download the weights.
cached = find_hf_cache(REPO_IDS[VARIANT])
if cached:
    log(f'loading from cache: {cached}')
    model = Model.from_local(cached, device=device)
else:
    log('cache not found, downloading...')
    model = Model.from_pretrained(device=device)

log(f'ready on {device} ({time.time() - t0:.1f}s load time)')
|
|
|
|
# Source path -> path of the temporary mono float32 WAV written for it.
_wav_cache = dict()

# Chime path -> decoded mono float32 samples at SAMPLE_RATE.
_chime_cache = dict()

# Held around model generation so only one /speak request generates at a time.
_gen_lock = threading.Lock()

# Unique marker object that tells the playback worker to stop.
_SENTINEL = object()

# FIFO of sample arrays waiting to be played by the playback worker.
playback_queue = queue.Queue()
|
|
|
|
|
|
def playback_worker():
    """Play queued sample arrays one at a time until the sentinel arrives.

    Each item taken from ``playback_queue`` is piped as raw float32le mono
    audio at ``SAMPLE_RATE`` into a fresh ``pacat`` process, and the worker
    waits for that process to exit before taking the next item.

    A failure on one item (``pacat`` missing, broken pipe, non-zero exit) is
    logged and the worker carries on; letting the exception escape would kill
    the only playback thread and silently strand every later item.
    """
    while True:
        item = playback_queue.get()
        try:
            if item is _SENTINEL:
                break
            proc = subprocess.Popen(
                ['pacat', '--format=float32le', f'--rate={SAMPLE_RATE}', '--channels=1'],
                stdin=subprocess.PIPE,
            )
            # communicate() writes the payload, closes stdin and waits for
            # exit, tolerating a pacat that goes away mid-write.
            proc.communicate(item.tobytes())
            if proc.returncode != 0:
                log(f'pacat exited with status {proc.returncode}')
        except Exception as e:
            log(f'playback failed: {e}')
            traceback.print_exc(file=sys.stderr)
        finally:
            # Runs for the sentinel and for failed items too, so the queue's
            # unfinished-task count always balances.
            playback_queue.task_done()


# Daemon thread: it never blocks interpreter exit.
threading.Thread(target=playback_worker, daemon=True).start()
|
|
|
|
|
|
def ensure_float32_wav(path):
    """Return the path of a mono float32 WAV copy of *path*, creating it once.

    The source file is read as float32, its channels are averaged down to
    mono, and the result is written to a temporary ``.wav`` file with the
    FLOAT subtype. The mapping is remembered in ``_wav_cache``, so repeated
    calls with the same *path* return the same temporary file.

    Args:
        path: Path of the source audio file.

    Returns:
        Path of the temporary float32 WAV file.

    Raises:
        Whatever ``soundfile`` raises when the source cannot be read or the
        copy cannot be written; nothing is cached in that case.
    """
    cached = _wav_cache.get(path)
    if cached is not None:
        return cached

    wav, sr = sf.read(path, dtype='float32', always_2d=True)
    mono = wav.mean(axis=1)

    # mkstemp hands back an open descriptor; close it right away so a
    # descriptor is not leaked for every new prompt. soundfile reopens by name.
    fd, tmp_path = tempfile.mkstemp(suffix='.wav')
    os.close(fd)
    try:
        sf.write(tmp_path, mono, sr, subtype='FLOAT')
    except Exception:
        # Do not leave an orphaned temp file behind on a failed write.
        os.unlink(tmp_path)
        raise

    _wav_cache[path] = tmp_path
    return tmp_path
|
|
|
|
|
|
def load_chime(path):
    """Return the samples of the audio file at *path*, decoding it only once.

    The file is read as float32, its channels are averaged to mono, and the
    result is resampled to ``SAMPLE_RATE`` when the file's rate differs.
    Decoded samples are kept in ``_chime_cache`` keyed by *path*.
    """
    try:
        return _chime_cache[path]
    except KeyError:
        pass

    frames, file_rate = sf.read(path, dtype='float32', always_2d=True)
    mono = frames.mean(axis=1)
    if file_rate != SAMPLE_RATE:
        mono = _librosa.resample(mono, orig_sr=file_rate, target_sr=SAMPLE_RATE)

    _chime_cache[path] = mono
    return mono
|
|
|
|
|
|
def generate(text, opts):
    """Synthesize *text* with the loaded model and return float32 samples.

    Args:
        text: Text to speak.
        opts: Mapping of optional overrides. Recognized sampling keys depend
            on ``VARIANT`` (see the defaults below); ``audio_prompt`` may name
            a reference audio file, which is converted with
            ``ensure_float32_wav`` and passed as ``audio_prompt_path``.
            Unrecognized keys are ignored.

    Returns:
        A numpy float32 array of generated samples.
    """
    t1 = time.time()

    # Per-variant sampling defaults; any key present in opts overrides them.
    if VARIANT == 'turbo':
        defaults = {
            'temperature': 0.8,
            'top_p': 0.95,
            'top_k': 1000,
            'repetition_penalty': 1.2,
            'min_p': 0.0,
        }
    else:
        defaults = {
            'temperature': 0.8,
            'top_p': 1.0,
            'repetition_penalty': 1.2,
            'min_p': 0.05,
            'exaggeration': 0.5,
            'cfg_weight': 0.5,
        }
    kwargs = {name: opts.get(name, default) for name, default in defaults.items()}

    audio_prompt = opts.get('audio_prompt')
    if audio_prompt:
        kwargs['audio_prompt_path'] = ensure_float32_wav(audio_prompt)

    with torch.inference_mode():
        wav = model.generate(text, **kwargs)

    samples = wav.squeeze(0).cpu().numpy().astype(np.float32)
    elapsed = time.time() - t1
    duration = len(samples) / SAMPLE_RATE
    # Guard the real-time factor: zero-length output must not turn a finished
    # generation into a ZeroDivisionError inside a log line.
    rtf = f'{elapsed / duration:.2f}' if duration > 0 else 'n/a'
    log(f'generated {duration:.1f}s audio in {elapsed:.1f}s rtf={rtf}')
    return samples
|
|
|
|
|
|
class Handler(BaseHTTPRequestHandler):
    """JSON-over-HTTP request handler for the TTS server.

    Routes (all POST with a JSON object body):
        /speak    generate speech for ``text`` and queue it for playback
        /chime    load the audio file at ``path`` and queue it for playback
        /preload  load the audio file at ``path`` into the chime cache only

    Every response is a JSON object: ``{"status": "ok"}`` on success or
    ``{"status": "error", "message": ...}`` with status 400, 404 or 500.
    """

    def send_json(self, data, status=200):
        """Send *data* as a JSON response body with the given HTTP status."""
        body = json.dumps(data).encode()
        self.send_response(status)
        self.send_header('Content-Type', 'application/json')
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def read_json(self):
        """Read the request body per Content-Length and parse it as JSON.

        Raises:
            ValueError: Content-Length is not an integer or is negative, or
                the body is not valid JSON.
        """
        length = int(self.headers.get('Content-Length', 0))
        if length < 0:
            # read() with a negative size would block until the client closes
            # the connection.
            raise ValueError('negative Content-Length')
        return json.loads(self.rfile.read(length))

    def do_POST(self):
        """Parse the JSON body, dispatch on the request path, and respond."""
        try:
            req = self.read_json()
        except Exception:
            self.send_json({'status': 'error', 'message': 'invalid JSON'}, 400)
            return

        # Route helpers use dict methods; a JSON list, string or number would
        # otherwise raise outside any handler and leave the client unanswered.
        if not isinstance(req, dict):
            self.send_json(
                {'status': 'error', 'message': 'request body must be a JSON object'},
                400,
            )
            return

        routes = {
            '/speak': self._speak,
            '/chime': self._chime,
            '/preload': self._preload,
        }
        route = routes.get(self.path)
        if route is None:
            self.send_json({'status': 'error', 'message': 'not found'}, 404)
            return

        try:
            route(req)
        except Exception as e:
            traceback.print_exc(file=sys.stderr)
            self.send_json({'status': 'error', 'message': str(e)}, 500)
            return
        self.send_json({'status': 'ok'})

    def _speak(self, req):
        """Generate speech for ``req['text']`` and queue it; empty text is a no-op."""
        text = req.pop('text', '')
        if not text:
            return
        # One generation at a time; queueing under the same lock keeps
        # playback order equal to generation order.
        with _gen_lock:
            samples = generate(text, req)
            playback_queue.put(samples)

    def _chime(self, req):
        """Load the audio file at ``req['path']`` and queue it for playback."""
        playback_queue.put(load_chime(req.get('path', '')))

    def _preload(self, req):
        """Load the audio file at ``req['path']`` into the cache without playing it."""
        path = req.get('path', '')
        load_chime(path)
        log(f'preloaded: {path}')

    def log_message(self, fmt, *args):
        """Send the base class's request log lines through ``log``."""
        log(fmt % args)
|
|
|
|
|
|
# NOTE(review): an empty host binds every interface, and the endpoints accept
# arbitrary file paths without authentication — confirm this exposure is
# intended, or bind to 127.0.0.1.
server = ThreadingHTTPServer(('', PORT), Handler)
log(f'listening on port {PORT}')
try:
    server.serve_forever()
except KeyboardInterrupt:
    # Ctrl-C is the normal way to stop the server.
    pass
finally:
    # Ask the playback worker to stop, then release the listening socket.
    playback_queue.put(_SENTINEL)
    server.server_close()
|