#!/usr/bin/env -S bash -c 'exec "$(dirname "$0")/venv/bin/python3" "$0" "$@"'
"""
Chatterbox TTS server — keeps model loaded, reads JSON lines from stdin.

Protocol:
  stdin:  {"text": "...", "temperature": 0.8, "top_p": 0.95}
          {"text": "...", "audio_prompt": "/path/to/voice.wav"}
          {"chime": "/path/to/file.wav"}
          {"preload": "/path/to/file.wav"}
          A line that is not a JSON object is spoken as plain text.
  stdout: "ready" once the model is loaded, then
          "ok\n" after each utterance is generated (playback may still be in progress)
  stderr: status/timing messages

Usage:
  ./chatterbox-server.py
  ./chatterbox-server.py turbo   # default
  ./chatterbox-server.py full    # original model, supports exaggeration

Paralinguistic tags supported in text:
  [laugh] [chuckle] [cough] [clear throat] [sigh] [shush] [groan] [sniff] [gasp]

Full model only:
  exaggeration  0.0-1.0  emotion intensity (ignored in turbo)
"""

import atexit
import os
import sys
import json
import time
import queue
import tempfile
import threading
import traceback
import subprocess

import numpy as np

TOKEN_FILE = os.path.expanduser('~/.secrets/hugging-face.token')
try:
    with open(TOKEN_FILE) as f:
        os.environ['HF_TOKEN'] = f.read().strip()
except FileNotFoundError:
    # The token file is optional; run without HF_TOKEN when it is absent.
    pass


def find_hf_cache(repo_id):
    """Return the local snapshot path if the model is already cached, else None.

    Looks under ``$HF_HOME/hub`` (default ``~/.cache/huggingface/hub``) for
    ``models--<org>--<name>/snapshots`` and returns the entry with the most
    recent modification time as a string.
    """
    from pathlib import Path

    hf_home = os.environ.get('HF_HOME', os.path.expanduser('~/.cache/huggingface'))
    cache_dir = Path(hf_home) / 'hub'
    repo_dir = cache_dir / f"models--{repo_id.replace('/', '--')}" / 'snapshots'
    if repo_dir.exists():
        snapshots = sorted(repo_dir.iterdir(), key=lambda p: p.stat().st_mtime)
        if snapshots:
            return str(snapshots[-1])
    return None


VARIANT = sys.argv[1] if len(sys.argv) > 1 else 'turbo'
SAMPLE_RATE = 24000

REPO_IDS = {
    'turbo': 'ResembleAI/chatterbox-turbo',
    'full': 'ResembleAI/chatterbox',
}


def log(msg):
    """Write a prefixed status message to stderr (stdout is reserved for the protocol)."""
    print(f'[chatterbox] {msg}', file=sys.stderr, flush=True)


# Reject unknown variants before the slow imports instead of failing later
# with a bare KeyError on REPO_IDS.
if VARIANT not in REPO_IDS:
    log(f'unknown variant {VARIANT!r}; expected one of: {", ".join(REPO_IDS)}')
    sys.exit(2)

log(f'loading chatterbox-{VARIANT}...')
t0 = time.time()

# These imports are placed after t0 on purpose: the reported load time below
# covers them as well as the model load.
import torch
import soundfile as sf
import librosa as _librosa

# librosa.resample returns float64 in newer numpy — patch it to always return float32
_orig_resample = _librosa.resample


def _resample_float32(*args, **kwargs):
    """Call the original librosa.resample and cast its result to float32."""
    return _orig_resample(*args, **kwargs).astype(np.float32)


_librosa.resample = _resample_float32

device = 'cuda' if torch.cuda.is_available() else 'cpu'

if VARIANT == 'turbo':
    from chatterbox.tts_turbo import ChatterboxTurboTTS as Model
else:
    from chatterbox.tts import ChatterboxTTS as Model

cached = find_hf_cache(REPO_IDS[VARIANT])
if cached:
    log(f'loading from cache: {cached}')
    model = Model.from_local(cached, device=device)
else:
    log('cache not found, downloading...')
    model = Model.from_pretrained(device=device)

log(f'ready on {device} ({time.time() - t0:.1f}s load time)')
print('ready', flush=True)

# Maps an input audio path to the temporary float32 mono WAV created for it.
_wav_cache = {}


def _cleanup_wav_cache():
    """Delete the temporary WAV files created by ensure_float32_wav."""
    for tmp_path in _wav_cache.values():
        try:
            os.unlink(tmp_path)
        except OSError:
            pass


atexit.register(_cleanup_wav_cache)


def ensure_float32_wav(path):
    """Re-save audio as float32 mono WAV to work around librosa/numpy float64 issue.

    Result is cached by input path so repeated calls with the same file are free.
    Returns the path of the temporary WAV file; the file is removed at exit.
    Errors from reading or writing the audio propagate to the caller.
    """
    if path in _wav_cache:
        return _wav_cache[path]
    wav, sr = sf.read(path, dtype='float32', always_2d=True)
    wav = wav.mean(axis=1)  # stereo → mono if needed
    # Only the name is needed; close the handle right away so it is not leaked.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        tmp_path = tmp.name
    try:
        sf.write(tmp_path, wav, sr, subtype='FLOAT')
    except Exception:
        # Do not leave a stray temp file behind when the write fails.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise
    _wav_cache[path] = tmp_path
    return tmp_path


_SENTINEL = object()
playback_queue = queue.Queue()


def playback_worker():
    """Plays audio samples in order. Runs in its own thread.

    Each queue item is an array of float32 samples that is piped to ``pacat``;
    the worker waits for ``pacat`` to finish before taking the next item.
    Receiving ``_SENTINEL`` stops the worker. A failed playback is logged and
    the worker moves on to the next item instead of dying.
    """
    cmd = ['pacat', '--format=float32le', f'--rate={SAMPLE_RATE}', '--channels=1']
    while True:
        item = playback_queue.get()
        try:
            if item is _SENTINEL:
                break
            # run() writes the bytes to pacat's stdin, closes it and waits.
            result = subprocess.run(cmd, input=item.tobytes(), check=False)
            if result.returncode != 0:
                log(f'pacat exited with status {result.returncode}')
        except Exception as e:
            # e.g. pacat is not installed; keep the thread alive for later items.
            log(f'playback error: {e}')
        finally:
            playback_queue.task_done()


playback_thread = threading.Thread(target=playback_worker, daemon=True)
playback_thread.start()


def generate(text, opts):
    """Synthesize *text* with the loaded model and return float32 samples.

    *opts* is the request dict (minus ``text``). Recognized sampling keys and
    their defaults depend on VARIANT; ``audio_prompt`` (optional) is a path to
    an audio file that is converted with ensure_float32_wav and passed to the
    model as ``audio_prompt_path``. Timing information is logged to stderr.
    Exceptions from the model or from reading the audio prompt propagate.
    """
    t1 = time.time()
    if VARIANT == 'turbo':
        kwargs = {
            'temperature': opts.get('temperature', 0.8),
            'top_p': opts.get('top_p', 0.95),
            'top_k': opts.get('top_k', 1000),
            'repetition_penalty': opts.get('repetition_penalty', 1.2),
            'min_p': opts.get('min_p', 0.0),
        }
    else:
        kwargs = {
            'temperature': opts.get('temperature', 0.8),
            'top_p': opts.get('top_p', 1.0),
            'repetition_penalty': opts.get('repetition_penalty', 1.2),
            'min_p': opts.get('min_p', 0.05),
            'exaggeration': opts.get('exaggeration', 0.5),
            'cfg_weight': opts.get('cfg_weight', 0.5),
        }

    audio_prompt = opts.get('audio_prompt')
    if audio_prompt:
        kwargs['audio_prompt_path'] = ensure_float32_wav(audio_prompt)

    with torch.inference_mode():
        wav = model.generate(text, **kwargs)

    samples = wav.squeeze(0).cpu().numpy().astype(np.float32)
    elapsed = time.time() - t1
    duration = len(samples) / SAMPLE_RATE
    # Guard the real-time factor: an empty result would otherwise divide by zero.
    rtf = f'{elapsed / duration:.2f}' if duration > 0 else 'n/a'
    log(f'generated {duration:.1f}s audio in {elapsed:.1f}s rtf={rtf}')
    return samples


# Maps a chime path to its decoded mono float32 samples at SAMPLE_RATE.
_chime_cache = {}


def load_chime(path):
    """Return the samples of the audio file at *path*, decoding it only once.

    The audio is read as float32, averaged down to mono and resampled to
    SAMPLE_RATE when its own rate differs. Results are cached by path.
    """
    if path in _chime_cache:
        return _chime_cache[path]
    samples, sr = sf.read(path, dtype='float32', always_2d=True)
    samples = samples.mean(axis=1)  # stereo → mono
    if sr != SAMPLE_RATE:
        samples = _librosa.resample(samples, orig_sr=sr, target_sr=SAMPLE_RATE)
    _chime_cache[path] = samples
    return samples


# Main request loop: one request per stdin line, one "ok" per handled request.
for line in sys.stdin:
    line = line.strip()
    if not line:
        continue

    try:
        req = json.loads(line)
    except json.JSONDecodeError:
        req = {'text': line}
    else:
        # Valid JSON that is not an object (e.g. 42 or "hi") used to crash the
        # key checks below; treat it as plain text instead.
        if not isinstance(req, dict):
            req = {'text': req if isinstance(req, str) else line}

    if 'preload' in req:
        try:
            load_chime(req['preload'])
            log(f'preloaded chime: {req["preload"]}')
        except Exception as e:
            log(f'preload error: {e}')
            traceback.print_exc(file=sys.stderr)
        print('ok', flush=True)
        continue

    if 'chime' in req:
        try:
            samples = load_chime(req['chime'])
            playback_queue.put(samples)
        except Exception as e:
            log(f'chime error: {e}')
            traceback.print_exc(file=sys.stderr)
        print('ok', flush=True)
        continue

    text = req.pop('text', '')
    opts = req  # everything except "text" is passed on as generation options

    if not text:
        print('ok', flush=True)
        continue

    try:
        samples = generate(text, opts)
        playback_queue.put(samples)
    except Exception as e:
        log(f'error: {e}')
        traceback.print_exc(file=sys.stderr)

    # Acknowledge even on failure so the client never blocks waiting for "ok".
    print('ok', flush=True)

# Drain playback before exit
playback_queue.put(_SENTINEL)
playback_thread.join()