#!/usr/bin/env -S bash -c 'exec "$(dirname "$0")/venv/bin/python3" "$0" "$@"'
"""
Chatterbox TTS HTTP server — keeps model loaded, exposes a JSON HTTP API.

Endpoints:
  POST /speak    {"text": "...", "temperature": 0.8, "top_p": 0.95, "audio_prompt": "/path.wav"}
  POST /chime    {"path": "/path/to/file.wav"}
  POST /preload  {"path": "/path/to/file.wav"}
  POST /command  {"command": "terminate"}

All endpoints return {"status": "ok"} or {"status": "error", "message": "..."}.
Responses are sent after audio is queued for playback (not after playback finishes).

Environment:
  TTS_PORT       TCP port to listen on (default: 11500)
  HF_TOKEN_FILE  path to HuggingFace token file (default: ~/.secrets/hugging-face.token)
  HF_HUB_CACHE   path to HuggingFace hub cache (default: ~/.cache/huggingface/hub)

Usage:
  ./chatterbox-server.py
  ./chatterbox-server.py turbo     # default
  ./chatterbox-server.py full      # original model, supports exaggeration

Paralinguistic tags supported in text:
  [laugh] [chuckle] [cough] [clear throat] [sigh] [shush] [groan] [sniff] [gasp]

Full model only:
  exaggeration   0.0-1.0  emotion intensity (ignored in turbo)
"""

import os
import sys
import json
import time
import queue
import threading
import subprocess
import traceback
import tempfile
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from pathlib import Path

import numpy as np

# Export the HuggingFace token (if the token file exists) before any model
# library is imported.  A missing token file is not an error.
TOKEN_FILE = os.environ.get('HF_TOKEN_FILE', os.path.expanduser('~/.secrets/hugging-face.token'))
try:
    with open(TOKEN_FILE) as f:
        os.environ['HF_TOKEN'] = f.read().strip()
except FileNotFoundError:
    pass


def find_hf_cache(repo_id):
    """Return the most recently modified cached snapshot dir for *repo_id*.

    Looks under ``$HF_HUB_CACHE`` (default ``~/.cache/huggingface/hub``) in
    ``models--<org>--<name>/snapshots``.  Returns the path as a string, or
    ``None`` when the directory is missing or contains no snapshots.
    """
    cache_dir = Path(os.environ.get('HF_HUB_CACHE', os.path.expanduser('~/.cache/huggingface/hub')))
    repo_dir = cache_dir / f"models--{repo_id.replace('/', '--')}" / 'snapshots'
    if repo_dir.exists():
        snapshots = sorted(repo_dir.iterdir(), key=lambda p: p.stat().st_mtime)
        if snapshots:
            return str(snapshots[-1])
    return None


def log(msg):
    """Write a tagged, immediately flushed message to stderr."""
    print(f'[chatterbox] {msg}', file=sys.stderr, flush=True)


REPO_IDS = {
    'turbo': 'ResembleAI/chatterbox-turbo',
    'full': 'ResembleAI/chatterbox',
}

VARIANT = sys.argv[1] if len(sys.argv) > 1 else 'turbo'
PORT = int(os.environ.get('TTS_PORT', 11500))
SAMPLE_RATE = 24000

# Reject an unknown variant up front, before the slow model imports, instead
# of failing later with a bare KeyError on REPO_IDS[VARIANT].
if VARIANT not in REPO_IDS:
    log(f"unknown variant {VARIANT!r}; expected one of: {', '.join(REPO_IDS)}")
    sys.exit(2)

log(f'loading chatterbox-{VARIANT}...')
t0 = time.time()

# Heavy imports are deliberately placed after the "loading" log line so that
# their cost is included in the reported load time.
import torch
import soundfile as sf
import librosa as _librosa

# Wrap librosa.resample so its result is always cast to float32.
# NOTE(review): presumably needed by the chatterbox package, which is imported
# after this patch — confirm.
_orig_resample = _librosa.resample


def _resample_float32(*args, **kwargs):
    return _orig_resample(*args, **kwargs).astype(np.float32)


_librosa.resample = _resample_float32

device = 'cuda' if torch.cuda.is_available() else 'cpu'

if VARIANT == 'turbo':
    from chatterbox.tts_turbo import ChatterboxTurboTTS as Model
else:
    from chatterbox.tts import ChatterboxTTS as Model

cached = find_hf_cache(REPO_IDS[VARIANT])
if cached:
    log(f'loading from cache: {cached}')
    model = Model.from_local(cached, device=device)
else:
    log('cache not found, downloading...')
    model = Model.from_pretrained(device=device)

log(f'ready on {device} ({time.time() - t0:.1f}s load time)')

_wav_cache = {}    # source path -> path of converted float32 mono temp WAV
_chime_cache = {}  # source path -> float32 mono samples at SAMPLE_RATE
_gen_lock = threading.Lock()  # serializes calls to generate()
_SENTINEL = object()          # queued to tell the playback worker to stop
playback_queue = queue.Queue()

# Per-variant sampling defaults; a request may override any of these keys.
_TURBO_DEFAULTS = {
    'temperature': 0.8,
    'top_p': 0.95,
    'top_k': 1000,
    'repetition_penalty': 1.2,
    'min_p': 0.0,
}
_FULL_DEFAULTS = {
    'temperature': 0.8,
    'top_p': 1.0,
    'repetition_penalty': 1.2,
    'min_p': 0.05,
    'exaggeration': 0.5,
    'cfg_weight': 0.5,
}


def playback_worker():
    """Play queued sample arrays one at a time through ``pacat``.

    Runs until ``_SENTINEL`` is dequeued.  A failure while playing one item
    (``pacat`` missing, broken pipe, ...) is logged and the worker moves on to
    the next item, so a single bad playback cannot disable audio for the rest
    of the process lifetime.
    """
    while True:
        item = playback_queue.get()
        try:
            if item is _SENTINEL:
                break
            payload = item.tobytes()
            # The Popen context manager closes stdin and waits for the child
            # on exit, including when the write raises.
            with subprocess.Popen(
                ['pacat', '--format=float32le', f'--rate={SAMPLE_RATE}', '--channels=1'],
                stdin=subprocess.PIPE,
            ) as proc:
                proc.stdin.write(payload)
        except Exception:
            log('playback failed')
            traceback.print_exc(file=sys.stderr)
        finally:
            playback_queue.task_done()


threading.Thread(target=playback_worker, daemon=True).start()


def ensure_float32_wav(path):
    """Return the path of a mono float32 WAV copy of *path*.

    The source is read with soundfile, channels are averaged, and the result
    is written to a temporary ``.wav`` file.  Conversions are cached per
    source path, so each file is converted at most once per process.

    Raises whatever soundfile raises when *path* cannot be read or the copy
    cannot be written; in the latter case the temp file is removed.
    """
    hit = _wav_cache.get(path)
    if hit is not None:
        return hit
    wav, sr = sf.read(path, dtype='float32', always_2d=True)
    wav = wav.mean(axis=1)
    # Only the unique name is needed; close the handle right away so the file
    # descriptor is not leaked (soundfile reopens the file by name).
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        tmp_path = tmp.name
    try:
        sf.write(tmp_path, wav, sr, subtype='FLOAT')
    except Exception:
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise
    _wav_cache[path] = tmp_path
    return tmp_path


def load_chime(path):
    """Return *path* as mono float32 samples at ``SAMPLE_RATE`` (cached).

    Channels are averaged; the audio is resampled only when the file's sample
    rate differs from ``SAMPLE_RATE``.
    """
    hit = _chime_cache.get(path)
    if hit is not None:
        return hit
    samples, sr = sf.read(path, dtype='float32', always_2d=True)
    samples = samples.mean(axis=1)
    if sr != SAMPLE_RATE:
        samples = _librosa.resample(samples, orig_sr=sr, target_sr=SAMPLE_RATE)
    _chime_cache[path] = samples
    return samples


def generate(text, opts):
    """Synthesize *text* and return the audio as a float32 NumPy array.

    *opts* is the request dict; recognised sampling keys override the
    per-variant defaults, and ``audio_prompt`` (a file path), when present and
    truthy, is converted with ``ensure_float32_wav`` and passed to the model
    as ``audio_prompt_path``.  Unrecognised keys are ignored.
    """
    t1 = time.time()
    defaults = _TURBO_DEFAULTS if VARIANT == 'turbo' else _FULL_DEFAULTS
    kwargs = {key: opts.get(key, default) for key, default in defaults.items()}

    audio_prompt = opts.get('audio_prompt')
    if audio_prompt:
        kwargs['audio_prompt_path'] = ensure_float32_wav(audio_prompt)

    with torch.inference_mode():
        wav = model.generate(text, **kwargs)
    samples = wav.squeeze(0).cpu().numpy().astype(np.float32)

    elapsed = time.time() - t1
    duration = len(samples) / SAMPLE_RATE
    # Guard the real-time-factor division: empty output must not turn a
    # successful generation into a ZeroDivisionError.
    rtf = elapsed / duration if duration > 0 else float('inf')
    log(f'generated {duration:.1f}s audio in {elapsed:.1f}s  rtf={rtf:.2f}')
    return samples


class Handler(BaseHTTPRequestHandler):
    """JSON-over-HTTP request handler for the TTS endpoints."""

    def send_json(self, data, status=200):
        """Serialize *data* as JSON and send it with the given HTTP status."""
        body = json.dumps(data).encode()
        self.send_response(status)
        self.send_header('Content-Type', 'application/json')
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def send_failure(self, exc):
        """Print the active traceback to stderr and reply 500 with str(exc)."""
        traceback.print_exc(file=sys.stderr)
        self.send_json({'status': 'error', 'message': str(exc)}, 500)

    def read_json(self):
        """Read ``Content-Length`` bytes of request body and parse as JSON.

        Raises ``ValueError`` for a malformed or negative ``Content-Length``
        or an unparsable body (``json.JSONDecodeError`` is a subclass).
        """
        length = int(self.headers.get('Content-Length', 0))
        if length < 0:
            # read(-1) would block until the client closes the connection.
            raise ValueError('negative Content-Length')
        return json.loads(self.rfile.read(length))

    def do_POST(self):
        """Dispatch a POST request to /speak, /chime, /preload or /command."""
        try:
            req = self.read_json()
        except Exception:
            self.send_json({'status': 'error', 'message': 'invalid JSON'}, 400)
            return
        # Every endpoint reads keys from the body, so anything other than a
        # JSON object (list, string, number, null) is a client error.
        if not isinstance(req, dict):
            self.send_json({'status': 'error', 'message': 'request body must be a JSON object'}, 400)
            return

        if self.path == '/speak':
            text = req.pop('text', '')
            if not text:
                # Nothing to say: report success without touching the model.
                self.send_json({'status': 'ok'})
                return
            try:
                with _gen_lock:
                    samples = generate(text, req)
                playback_queue.put(samples)
            except Exception as e:
                self.send_failure(e)
            else:
                self.send_json({'status': 'ok'})

        elif self.path == '/chime':
            path = req.get('path', '')
            try:
                samples = load_chime(path)
                playback_queue.put(samples)
            except Exception as e:
                self.send_failure(e)
            else:
                self.send_json({'status': 'ok'})

        elif self.path == '/preload':
            path = req.get('path', '')
            try:
                load_chime(path)
            except Exception as e:
                self.send_failure(e)
            else:
                log(f'preloaded: {path}')
                self.send_json({'status': 'ok'})

        elif self.path == '/command':
            command = req.get('command', '')
            if command == 'terminate':
                self.send_json({'status': 'ok'})
                # shutdown() blocks until serve_forever() returns, so it must
                # run on a thread other than this request's.
                threading.Thread(target=self.server.shutdown, daemon=True).start()
            else:
                self.send_json({'status': 'error', 'message': f'unknown command: {command}'}, 400)

        else:
            self.send_json({'status': 'error', 'message': 'not found'}, 404)

    def log_message(self, fmt, *args):
        """Route http.server's access log through ``log``."""
        log(fmt % args)


def _remove_temp_wavs():
    """Best-effort removal of the temp files made by ensure_float32_wav."""
    for tmp_path in list(_wav_cache.values()):
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
    _wav_cache.clear()


# NOTE(review): binds all interfaces, and /chime, /preload and audio_prompt
# accept arbitrary filesystem paths — confirm this is only reachable by
# trusted clients.
server = ThreadingHTTPServer(('', PORT), Handler)
log(f'listening on port {PORT}')
try:
    server.serve_forever()
except KeyboardInterrupt:
    pass
finally:
    # The playback worker is a daemon thread, so audio still queued at this
    # point may be cut short when the interpreter exits.
    playback_queue.put(_SENTINEL)
    server.server_close()
    _remove_temp_wavs()