tts-server/chatterbox-server.py

#!/usr/bin/env -S bash -c 'exec "$(dirname "$0")/venv/bin/python3" "$0" "$@"'
"""
Chatterbox TTS server — keeps model loaded, reads JSON lines from stdin.

Protocol:
  stdin:  one request per line, either a JSON object or plain text to speak:
          {"text": "...", "temperature": 0.8, "top_p": 0.95}
          {"chime": "/path/to/file.wav"}
          {"preload": "/path/to/file.wav"}
  stdout: "ready\n" once the model has loaded, then "ok\n" after each request
          has been handled (playback may still be in progress)
  stderr: status/timing messages

Usage:
  ./chatterbox-server.py
  ./chatterbox-server.py turbo    # default
  ./chatterbox-server.py full     # original model, supports exaggeration

Paralinguistic tags supported in text:
  [laugh] [chuckle] [cough] [clear throat] [sigh] [shush] [groan] [sniff] [gasp]

Full model only:
  exaggeration  0.0-1.0  emotion intensity (ignored in turbo)
"""

import os
import sys
import json
import time
import queue
import threading
import subprocess
import numpy as np

# Optional Hugging Face token: read it from HF_TOKEN_FILE (or the default path
# below) and export it as HF_TOKEN. A missing token file is simply skipped.
_default_token_file = os.path.expanduser('~/.secrets/hugging-face.token')
TOKEN_FILE = os.environ.get('HF_TOKEN_FILE', _default_token_file)
try:
	with open(TOKEN_FILE) as token_fh:
		_hf_token = token_fh.read()
	os.environ['HF_TOKEN'] = _hf_token.strip()
except FileNotFoundError:
	pass

def find_hf_cache(repo_id):
	"""Return the newest locally cached snapshot directory for *repo_id*.

	The hub cache root is resolved the same way huggingface_hub documents it:
	``HF_HUB_CACHE`` if set, else ``$HF_HOME/hub``, else
	``$XDG_CACHE_HOME/huggingface/hub``, else ``~/.cache/huggingface/hub``.
	Inside that root the repo is looked up under
	``models--<org>--<name>/snapshots/``.

	Args:
		repo_id: Hub repository id such as ``'ResembleAI/chatterbox'``.

	Returns:
		The path (as ``str``) of the most recently modified snapshot
		directory, or ``None`` when nothing is cached for this repo.
	"""
	from pathlib import Path

	hub_cache = os.environ.get('HF_HUB_CACHE')
	if not hub_cache:
		hf_home = os.environ.get('HF_HOME')
		if not hf_home:
			xdg_cache = os.environ.get('XDG_CACHE_HOME') or os.path.expanduser('~/.cache')
			hf_home = os.path.join(xdg_cache, 'huggingface')
		hub_cache = os.path.join(hf_home, 'hub')
	cache_dir = Path(hub_cache).expanduser()

	snapshots_dir = cache_dir / f"models--{repo_id.replace('/', '--')}" / 'snapshots'
	if not snapshots_dir.is_dir():
		return None

	# Only directories are snapshots; ignore stray files that may sit beside them.
	snapshots = [entry for entry in snapshots_dir.iterdir() if entry.is_dir()]
	newest = max(snapshots, key=lambda entry: entry.stat().st_mtime, default=None)
	return None if newest is None else str(newest)

# Model variant is the optional first command-line argument; 'turbo' when omitted.
if len(sys.argv) > 1:
	VARIANT = sys.argv[1]
else:
	VARIANT = 'turbo'

# Sample rate in Hz used for duration logging, chime resampling and pacat playback.
SAMPLE_RATE = 24000

def log(msg):
	"""Write *msg* to stderr as one '[chatterbox] '-prefixed line and flush immediately."""
	line = f'[chatterbox] {msg}'
	print(line, file=sys.stderr, flush=True)

# Announce the load on stderr, then start the clock that the later
# "ready on ..." log line uses to report the load time.
log(f'loading chatterbox-{VARIANT}...')
t0 = time.time()

import tempfile
import traceback
import numpy as np
import torch
import soundfile as sf
import librosa as _librosa

# Keep a handle on the real librosa.resample, then install a wrapper that casts
# its result to float32 (per the original note, the unpatched function can
# return float64 with newer numpy).
_orig_resample = _librosa.resample


def _resample_float32(*args, **kwargs):
	"""Call the original librosa.resample and return its result cast to float32."""
	resampled = _orig_resample(*args, **kwargs)
	return resampled.astype(np.float32)


_librosa.resample = _resample_float32

# Use the GPU when torch reports one as available; otherwise run on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Hugging Face repository backing each supported model variant.
REPO_IDS = {
	'turbo': 'ResembleAI/chatterbox-turbo',
	'full':  'ResembleAI/chatterbox',
}

# Reject unknown variants up front: otherwise anything other than 'turbo' would
# import the full model and then die with a bare KeyError on REPO_IDS[VARIANT].
if VARIANT not in REPO_IDS:
	sys.exit(f"[chatterbox] unknown variant {VARIANT!r}; expected one of: {', '.join(REPO_IDS)}")

if VARIANT == 'turbo':
	from chatterbox.tts_turbo import ChatterboxTurboTTS as Model
else:
	from chatterbox.tts import ChatterboxTTS as Model

cached = find_hf_cache(REPO_IDS[VARIANT])
model = None
if cached:
	log(f'loading from cache: {cached}')
	try:
		model = Model.from_local(cached, device=device)
	except Exception as e:
		# A snapshot directory can exist without being loadable (for example
		# after an interrupted download), so fall back to from_pretrained
		# instead of aborting startup.
		log(f'cache load failed ({e}), downloading...')
else:
	log('cache not found, downloading...')

if model is None:
	model = Model.from_pretrained(device=device)

log(f'ready on {device} ({time.time() - t0:.1f}s load time)')
# Tell whoever reads stdout that the model is loaded.
print('ready', flush=True)


import atexit

# Maps an input audio path to the temporary float32 mono WAV created for it.
_wav_cache = {}


def _remove_cached_wavs():
	"""Best-effort removal of the temporary WAVs created by ensure_float32_wav."""
	for tmp_path in _wav_cache.values():
		try:
			os.unlink(tmp_path)
		except OSError:
			pass  # already gone or not removable; nothing useful to do at exit


# Without this the converted copies would pile up in the temp directory run after run.
atexit.register(_remove_cached_wavs)


def ensure_float32_wav(path):
	"""Return the path of a float32 mono WAV copy of the audio file at *path*.

	Re-saving works around the librosa/numpy float64 issue. The result is
	cached by input path, so repeated calls with the same file are free. The
	temporary copies are deleted when the interpreter exits.

	Args:
		path: Path of an audio file readable by soundfile.

	Returns:
		Path (``str``) of the temporary WAV file.

	Raises:
		Whatever ``sf.read`` / ``sf.write`` raise for an unreadable input or a
		failed write; in the latter case the partial temp file is removed.
	"""
	cached = _wav_cache.get(path)
	if cached is not None:
		return cached

	wav, sr = sf.read(path, dtype='float32', always_2d=True)
	wav = wav.mean(axis=1)  # average the channels: stereo → mono if needed

	# Only the file *name* is needed, so create it with mkstemp and close the
	# descriptor right away rather than leaving an open handle behind.
	fd, tmp_path = tempfile.mkstemp(suffix='.wav')
	os.close(fd)
	try:
		sf.write(tmp_path, wav, sr, subtype='FLOAT')
	except Exception:
		os.unlink(tmp_path)  # do not leave an orphaned, half-written file
		raise

	_wav_cache[path] = tmp_path
	return tmp_path


# Unique marker object; the playback worker stops when it dequeues this.
_SENTINEL = object()

# FIFO of sample arrays waiting to be played; maxsize=0 means unbounded.
playback_queue = queue.Queue(maxsize=0)


def playback_worker():
	"""Play queued sample arrays through ``pacat``, one at a time, in FIFO order.

	Runs in its own thread until ``_SENTINEL`` is dequeued. A failure while
	playing one item (``pacat`` missing, non-zero exit status, ...) is logged
	and the worker moves on to the next item, so a single bad playback cannot
	silently disable audio for the rest of the session.
	"""
	cmd = ['pacat', '--format=float32le', f'--rate={SAMPLE_RATE}', '--channels=1']
	while True:
		item = playback_queue.get()
		try:
			if item is _SENTINEL:
				break
			# run() feeds the bytes to pacat's stdin, closes it and waits for
			# the process to exit, cleaning up the child even on error.
			result = subprocess.run(cmd, input=item.tobytes())
			if result.returncode != 0:
				log(f'playback error: pacat exited with status {result.returncode}')
		except Exception as e:
			log(f'playback error: {e}')
		finally:
			# Balance every get(), the sentinel included, so Queue.join() stays usable.
			playback_queue.task_done()


# Background thread that drains playback_queue. It is a daemon thread, so it
# does not by itself keep the interpreter alive.
playback_thread = threading.Thread(target=playback_worker)
playback_thread.daemon = True
playback_thread.start()


def generate(text, opts):
	"""Synthesize *text* with the loaded model and return the audio samples.

	Args:
		text: The utterance to speak.
		opts: Request options (a dict). Sampling keys found here override the
			per-variant defaults below. ``audio_prompt`` (a file path), when
			present and non-empty, is converted with ``ensure_float32_wav`` and
			passed to the model as ``audio_prompt_path``. Other keys are ignored.

	Returns:
		The generated audio as a float32 numpy array.
		NOTE(review): the ``squeeze(0)`` assumes the model returns a tensor
		with a leading singleton dimension — confirm against the model API.
	"""
	t1 = time.time()

	# Default sampling parameters differ per variant; 'exaggeration' and
	# 'cfg_weight' are only passed to the non-turbo model.
	if VARIANT == 'turbo':
		defaults = {
			'temperature':        0.8,
			'top_p':              0.95,
			'top_k':              1000,
			'repetition_penalty': 1.2,
			'min_p':              0.0,
		}
	else:
		defaults = {
			'temperature':        0.8,
			'top_p':              1.0,
			'repetition_penalty': 1.2,
			'min_p':              0.05,
			'exaggeration':       0.5,
			'cfg_weight':         0.5,
		}
	kwargs = {key: opts.get(key, default) for key, default in defaults.items()}

	audio_prompt = opts.get('audio_prompt')
	if audio_prompt:
		kwargs['audio_prompt_path'] = ensure_float32_wav(audio_prompt)

	with torch.inference_mode():
		wav = model.generate(text, **kwargs)

	samples = wav.squeeze(0).cpu().numpy().astype(np.float32)
	elapsed = time.time() - t1
	duration = len(samples) / SAMPLE_RATE
	# Guard the real-time factor: zero-length output must not turn a successful
	# generation into a ZeroDivisionError.
	rtf = f'{elapsed / duration:.2f}' if duration > 0 else 'n/a'
	log(f'generated {duration:.1f}s audio in {elapsed:.1f}s  rtf={rtf}')

	return samples


# Maps a chime file path to its decoded mono float32 samples.
_chime_cache = {}


def load_chime(path):
	"""Return the audio file at *path* as mono float32 samples at SAMPLE_RATE.

	Decoded samples are memoized by path, so each file is read (and, when its
	rate differs from SAMPLE_RATE, resampled) only once.
	"""
	cached = _chime_cache.get(path)
	if cached is not None:
		return cached

	audio, file_rate = sf.read(path, dtype='float32', always_2d=True)
	mono = audio.mean(axis=1)  # average the channels down to one
	if file_rate != SAMPLE_RATE:
		mono = _librosa.resample(mono, orig_sr=file_rate, target_sr=SAMPLE_RATE)

	_chime_cache[path] = mono
	return mono


def _parse_request(line):
	"""Turn one non-empty stdin line into a request dict.

	A JSON object is returned as-is. Anything else is treated as text to
	speak: a line that is not valid JSON is used verbatim, a JSON string is
	used in its decoded form, and any other JSON value (number, list, ...)
	falls back to the raw line. Without this, such input would crash the
	server on the dict operations in the request loop.
	"""
	try:
		req = json.loads(line)
	except json.JSONDecodeError:
		return {'text': line}
	if isinstance(req, dict):
		return req
	return {'text': req if isinstance(req, str) else line}


# Request loop: one request per stdin line, answered with "ok" on stdout once
# it has been handled (also on error, so the caller never waits forever).
for line in sys.stdin:
	line = line.strip()
	if not line:
		continue

	req = _parse_request(line)

	if 'preload' in req:
		# Decode the chime and cache it now so a later 'chime' request is a cache hit.
		try:
			load_chime(req['preload'])
			log(f'preloaded chime: {req["preload"]}')
		except Exception as e:
			log(f'preload error: {e}')
		print('ok', flush=True)
		continue

	if 'chime' in req:
		try:
			samples = load_chime(req['chime'])
			playback_queue.put(samples)
		except Exception as e:
			log(f'chime error: {e}')
			traceback.print_exc(file=sys.stderr)
		print('ok', flush=True)
		continue

	# Everything left in the request after removing 'text' is passed on as options.
	text = req.pop('text', '')
	opts = req

	if not text:
		print('ok', flush=True)
		continue

	try:
		samples = generate(text, opts)
		playback_queue.put(samples)
	except Exception as e:
		log(f'error: {e}')
		traceback.print_exc(file=sys.stderr)

	print('ok', flush=True)

# stdin closed: let the worker finish everything already queued before exiting.
playback_queue.put(_SENTINEL)
playback_thread.join()