From 4a54bdab042e4e2a71652cb31c45ff82c2d014a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mikael=20L=C3=B6vqvist?= Date: Sun, 7 Jun 2026 09:16:26 +0200 Subject: [PATCH] Added readme and chatterbox-server.py --- README.md | 22 ++-- chatterbox-server.py | 235 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 250 insertions(+), 7 deletions(-) create mode 100755 chatterbox-server.py diff --git a/README.md b/README.md index c547a3e..62575c7 100644 --- a/README.md +++ b/README.md @@ -14,11 +14,19 @@ This project started as a [vibe-coded](https://en.wikipedia.org/wiki/Vibe_coding ### Setup [venv](https://docs.python.org/3/library/venv.html) for [python](https://www.python.org/) - Run [`setup-venv.sh`](./setup-venv.sh). +Run [`setup-venv.sh`](./setup-venv.sh). + +> [!NOTE] +> The default location is a directory called `venv` that is created next to the script, but you can override it by using the environment variable `PYTHON_ENV` to point to a different location. +> +> ```console +> PYTHON_ENV='/some/path' ./setup-venv.sh +> ``` + +### Environment + +Variable | Purpose +------------------------|------------------------- +`HF_TOKEN_FILE` | Used to resolve a file for the [`HF_TOKEN`](https://huggingface.co/docs/hub/en/security-tokens) secret that is used to download models from [Hugging Face](https://huggingface.co/). If it is not set, it defaults to `~/.secrets/hugging-face.token`. +`HF_HUB_CACHE` | Location for the Hugging Face model cache; defaults to `~/.cache/huggingface/hub`. - > [!NOTE] - > The default location is a directory called `venv` that is created next to the script, but you can override it by using the environment variable `PYTHON_ENV` to point to a different location. 
#!/usr/bin/env -S bash -c 'exec "$(dirname "$0")/venv/bin/python3" "$0" "$@"'
"""
Chatterbox TTS server — keeps model loaded, reads JSON lines from stdin.

Protocol:
  stdin:  {"text": "...", "temperature": 0.8, "top_p": 0.95}
          {"text": "...", "audio_prompt": "/path/to/voice.wav"}
          {"chime": "/path/to/file.wav"}
          {"preload": "/path/to/file.wav"}
          A line that is not a JSON object is spoken as plain text.
  stdout: a single "ready" line once the model is loaded, then one "ok"
          line after each request is handled (playback may still be in
          progress)
  stderr: status/timing messages

Usage:
  ./chatterbox-server.py
  ./chatterbox-server.py turbo   # default
  ./chatterbox-server.py full    # original model, supports exaggeration

Paralinguistic tags supported in text:
  [laugh] [chuckle] [cough] [clear throat] [sigh] [shush] [groan] [sniff] [gasp]

Full model only:
  exaggeration  0.0-1.0  emotion intensity (ignored in turbo)
"""

import atexit
import json
import os
import queue
import subprocess
import sys
import tempfile
import threading
import time
import traceback
from pathlib import Path

SAMPLE_RATE = 24000

REPO_IDS = {
    'turbo': 'ResembleAI/chatterbox-turbo',
    'full': 'ResembleAI/chatterbox',
}


def log(msg):
    """Write a tagged status line to stderr; stdout is reserved for the protocol."""
    print(f'[chatterbox] {msg}', file=sys.stderr, flush=True)


VARIANT = sys.argv[1] if len(sys.argv) > 1 else 'turbo'
if VARIANT not in REPO_IDS:
    # Fail fast with a readable message instead of a KeyError after the
    # (slow) model imports have already run.
    sys.exit(
        f"[chatterbox] unknown variant {VARIANT!r}; "
        f"expected one of: {', '.join(REPO_IDS)}"
    )

TOKEN_FILE = os.environ.get('HF_TOKEN_FILE', os.path.expanduser('~/.secrets/hugging-face.token'))
try:
    with open(TOKEN_FILE, encoding='utf-8') as f:
        _token = f.read().strip()
except FileNotFoundError:
    # The token file is optional; carry on without it.
    _token = ''
except OSError as e:
    # Unreadable file (permissions, is a directory, ...): report it but keep going.
    log(f'cannot read token file {TOKEN_FILE}: {e}')
    _token = ''
if _token:
    # Never export an empty token: that would clobber a usable HF_TOKEN
    # already present in the environment.
    os.environ['HF_TOKEN'] = _token


def find_hf_cache(repo_id):
    """Return the newest local snapshot directory for *repo_id*, or None.

    Looks under ``$HF_HUB_CACHE`` (default ``~/.cache/huggingface/hub``) in
    ``models--<org>--<name>/snapshots`` and picks the sub-directory with the
    most recent modification time. Returns None when the snapshots directory
    is missing, unreadable, or contains no directories.
    """
    cache_dir = Path(os.environ.get('HF_HUB_CACHE', os.path.expanduser('~/.cache/huggingface/hub')))
    repo_dir = cache_dir / f"models--{repo_id.replace('/', '--')}" / 'snapshots'
    try:
        # Only directories are snapshots; ignore stray files.
        snapshots = [p for p in repo_dir.iterdir() if p.is_dir()]
    except OSError:
        return None
    if not snapshots:
        return None
    return str(max(snapshots, key=lambda p: p.stat().st_mtime))


log(f'loading chatterbox-{VARIANT}...')
t0 = time.time()

# Third-party imports are deliberately placed after the "loading" message so
# the caller gets feedback on stderr before these imports run.
import numpy as np
import torch
import soundfile as sf
import librosa as _librosa

# librosa.resample returns float64 in newer numpy — patch it to always return float32
_orig_resample = _librosa.resample


def _resample_float32(*args, **kwargs):
    return _orig_resample(*args, **kwargs).astype(np.float32)


_librosa.resample = _resample_float32

device = 'cuda' if torch.cuda.is_available() else 'cpu'

if VARIANT == 'turbo':
    from chatterbox.tts_turbo import ChatterboxTurboTTS as Model
else:
    from chatterbox.tts import ChatterboxTTS as Model

model = None
cached = find_hf_cache(REPO_IDS[VARIANT])
if cached:
    log(f'loading from cache: {cached}')
    try:
        model = Model.from_local(cached, device=device)
    except Exception as e:
        # A broken or incomplete snapshot should not prevent start-up.
        log(f'cache load failed ({e}), downloading...')
else:
    log('cache not found, downloading...')
if model is None:
    model = Model.from_pretrained(device=device)

log(f'ready on {device} ({time.time() - t0:.1f}s load time)')
print('ready', flush=True)


# Maps an input audio path to the temporary float32 WAV written for it.
_wav_cache = {}


def _remove_temp_wavs():
    """Delete the temporary WAV files created by ensure_float32_wav."""
    for tmp_path in _wav_cache.values():
        try:
            os.unlink(tmp_path)
        except OSError:
            pass


atexit.register(_remove_temp_wavs)


def ensure_float32_wav(path):
    """Re-save audio as float32 mono WAV to work around librosa/numpy float64 issue.

    The result is cached by input path, so repeated calls with the same file
    return the same temporary file. Temporary files are removed at exit.

    Returns the path of the temporary WAV file. Errors from reading or
    writing the audio propagate to the caller.
    """
    if path in _wav_cache:
        return _wav_cache[path]
    wav, sr = sf.read(path, dtype='float32', always_2d=True)
    wav = wav.mean(axis=1)  # stereo → mono if needed
    # Only the unique name is needed; close the handle right away so the
    # file descriptor is not leaked.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        tmp_path = tmp.name
    try:
        sf.write(tmp_path, wav, sr, subtype='FLOAT')
    except Exception:
        # Do not leave an orphaned temp file behind on a failed write.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise
    _wav_cache[path] = tmp_path
    return tmp_path


_SENTINEL = object()

playback_queue = queue.Queue()


def playback_worker():
    """Play queued sample arrays in order via ``pacat``. Runs in its own thread.

    Stops when it receives ``_SENTINEL``. A failure to play one item is
    logged and the worker keeps serving the queue, so one bad playback
    cannot silence everything that follows.
    """
    while True:
        item = playback_queue.get()
        try:
            if item is _SENTINEL:
                break
            # run(input=...) feeds stdin and waits for pacat to finish.
            result = subprocess.run(
                ['pacat', '--format=float32le', f'--rate={SAMPLE_RATE}', '--channels=1'],
                input=item.tobytes(),
                check=False,
            )
            if result.returncode != 0:
                log(f'pacat exited with status {result.returncode}')
        except Exception as e:
            log(f'playback error: {e}')
        finally:
            playback_queue.task_done()


playback_thread = threading.Thread(target=playback_worker, daemon=True)
playback_thread.start()


def generate(text, opts):
    """Synthesize *text* and return the audio as a float32 numpy array.

    *opts* is the request dict; sampling parameters are read from it with
    per-variant defaults, and an optional ``audio_prompt`` path is converted
    with ensure_float32_wav and passed on as ``audio_prompt_path``. Timing is
    logged to stderr. Exceptions from the model propagate to the caller.
    """
    t1 = time.time()

    if VARIANT == 'turbo':
        kwargs = {
            'temperature': opts.get('temperature', 0.8),
            'top_p': opts.get('top_p', 0.95),
            'top_k': opts.get('top_k', 1000),
            'repetition_penalty': opts.get('repetition_penalty', 1.2),
            'min_p': opts.get('min_p', 0.0),
        }
    else:
        kwargs = {
            'temperature': opts.get('temperature', 0.8),
            'top_p': opts.get('top_p', 1.0),
            'repetition_penalty': opts.get('repetition_penalty', 1.2),
            'min_p': opts.get('min_p', 0.05),
            'exaggeration': opts.get('exaggeration', 0.5),
            'cfg_weight': opts.get('cfg_weight', 0.5),
        }

    audio_prompt = opts.get('audio_prompt')
    if audio_prompt:
        kwargs['audio_prompt_path'] = ensure_float32_wav(audio_prompt)

    with torch.inference_mode():
        wav = model.generate(text, **kwargs)

    samples = wav.squeeze(0).cpu().numpy().astype(np.float32)
    elapsed = time.time() - t1
    duration = len(samples) / SAMPLE_RATE
    # Empty output would otherwise raise ZeroDivisionError and drop the result.
    rtf = f'{elapsed / duration:.2f}' if duration > 0 else 'n/a'
    log(f'generated {duration:.1f}s audio in {elapsed:.1f}s  rtf={rtf}')

    return samples


# Maps a chime path to its decoded mono samples at SAMPLE_RATE.
_chime_cache = {}


def load_chime(path):
    """Return the mono float32 samples for *path*, resampled to SAMPLE_RATE.

    Decoded audio is cached by path, so a chime is read from disk only once.
    """
    if path in _chime_cache:
        return _chime_cache[path]
    samples, sr = sf.read(path, dtype='float32', always_2d=True)
    samples = samples.mean(axis=1)  # stereo → mono
    if sr != SAMPLE_RATE:
        samples = _librosa.resample(samples, orig_sr=sr, target_sr=SAMPLE_RATE)
    _chime_cache[path] = samples
    return samples


for line in sys.stdin:
    line = line.strip()
    if not line:
        continue

    try:
        req = json.loads(line)
    except json.JSONDecodeError:
        req = {'text': line}
    else:
        # Valid JSON that is not an object (a bare string, number, list, ...)
        # must not crash the server; speak it as text instead.
        if isinstance(req, str):
            req = {'text': req}
        elif not isinstance(req, dict):
            req = {'text': line}

    if 'preload' in req:
        try:
            load_chime(req['preload'])
            log(f'preloaded chime: {req["preload"]}')
        except Exception as e:
            log(f'preload error: {e}')
            traceback.print_exc(file=sys.stderr)
        print('ok', flush=True)
        continue

    if 'chime' in req:
        try:
            samples = load_chime(req['chime'])
            playback_queue.put(samples)
        except Exception as e:
            log(f'chime error: {e}')
            traceback.print_exc(file=sys.stderr)
        print('ok', flush=True)
        continue

    text = req.pop('text', '')
    opts = req

    if not text:
        print('ok', flush=True)
        continue

    try:
        samples = generate(text, opts)
        playback_queue.put(samples)
    except Exception as e:
        log(f'error: {e}')
        traceback.print_exc(file=sys.stderr)

    print('ok', flush=True)

# Drain playback before exit
playback_queue.put(_SENTINEL)
playback_thread.join()