Added readme and chatterbox-server.py

2026-06-07 09:16:26 +02:00
parent 6a6da2d066
commit 4a54bdab04
2 changed files with 250 additions and 7 deletions
--- a/README.md
+++ b/README.md
@@ -14,11 +14,19 @@ This project started as a [vibe-coded](https://en.wikipedia.org/wiki/Vibe_coding

 ### Setup [venv](https://docs.python.org/3/library/venv.html) for [python](https://www.python.org/)

-	Run [`setup-venv.sh`](./setup-venv.sh).
+Run [`setup-venv.sh`](./setup-venv.sh).
+
+> [!NOTE]
+> The default location is a directory called `venv` that is created next to the script, but you can override it by using the environment variable `PYTHON_ENV` to point to a different location.
+>
+> ```console
+> PYTHON_ENV='/some/path' ./setup-venv.sh
+> ```
+
+### Environment
+
+Variable				|	Purpose
+------------------------|-------------------------
+`HF_TOKEN_FILE`			|	Path to a file containing the [`HF_TOKEN`](https://huggingface.co/docs/hub/en/security-tokens) secret that is used to download models from [Hugging Face](https://huggingface.co/). Defaults to `~/.secrets/hugging-face.token` if it is not set.
+`HF_HUB_CACHE`			|	Location of the Hugging Face model cache. Defaults to `~/.cache/huggingface/hub`.

-	> [!NOTE]
-	> The default location is a directory called `venv` that is created next to the script, but you can override it by using the environment variable `PYTHON_ENV` to point to a different location.
-	>
-	> ```console
-	> PYTHON_ENV='/some/path' ./setup-venv.s
-	> ```
--- a/chatterbox-server.py
+++ b/chatterbox-server.py
@@ -0,0 +1,235 @@
+#!/usr/bin/env -S bash -c 'exec "$(dirname "$0")/venv/bin/python3" "$0" "$@"'
+"""
+Chatterbox TTS server — keeps model loaded, reads JSON lines from stdin.
+
+Protocol:
+  stdin:  {"text": "...", "temperature": 0.8, "top_p": 0.95}
+		  {"chime": "/path/to/file.wav"}
+		  {"preload": "/path/to/file.wav"}
+  stdout: "ok\n" after each utterance is generated (playback may still be in progress)
+  stderr: status/timing messages
+
+Usage:
+  ./chatterbox-server.py
+  ./chatterbox-server.py turbo    # default
+  ./chatterbox-server.py full     # original model, supports exaggeration
+
+Paralinguistic tags supported in text:
+  [laugh] [chuckle] [cough] [clear throat] [sigh] [shush] [groan] [sniff] [gasp]
+
+Full model only:
+  exaggeration  0.0-1.0  emotion intensity (ignored in turbo)
+"""
+
+import os
+import sys
+import json
+import time
+import queue
+import threading
+import subprocess
+import numpy as np
+
+TOKEN_FILE = os.environ.get('HF_TOKEN_FILE', os.path.expanduser('~/.secrets/hugging-face.token'))
+try:
+	with open(TOKEN_FILE) as f:
+		os.environ['HF_TOKEN'] = f.read().strip()
+except FileNotFoundError:
+	pass
+
+def find_hf_cache(repo_id):
+	"""Return the local snapshot path if the model is already cached, else None."""
+	from pathlib import Path
+	cache_dir = Path(os.environ.get('HF_HUB_CACHE', os.path.expanduser('~/.cache/huggingface/hub')))
+
+	repo_dir  = cache_dir / f"models--{repo_id.replace('/', '--')}" / 'snapshots'
+	if repo_dir.exists():
+		snapshots = sorted(repo_dir.iterdir(), key=lambda p: p.stat().st_mtime)
+		if snapshots:
+			return str(snapshots[-1])
+	return None
+
+VARIANT    = sys.argv[1] if len(sys.argv) > 1 else 'turbo'
+SAMPLE_RATE = 24000
+
+def log(msg):
+	print(f'[chatterbox] {msg}', file=sys.stderr, flush=True)
+
+log(f'loading chatterbox-{VARIANT}...')
+t0 = time.time()
+
+import tempfile
+import traceback
+import numpy as np
+import torch
+import soundfile as sf
+import librosa as _librosa
+
+# librosa.resample returns float64 in newer numpy — patch it to always return float32
+# The replacement is installed on the librosa module object itself, so any code
+# that looks up `librosa.resample` after this point gets the wrapper (this file
+# does so when resampling chimes; presumably chatterbox does too — TODO confirm).
+_orig_resample = _librosa.resample
+def _resample_float32(*args, **kwargs):
+	"""Call the original librosa.resample with the same arguments and cast the result to float32."""
+	return _orig_resample(*args, **kwargs).astype(np.float32)
+_librosa.resample = _resample_float32
+
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+REPO_IDS = {
+	'turbo': 'ResembleAI/chatterbox-turbo',
+	'full':  'ResembleAI/chatterbox',
+}
+
+if VARIANT == 'turbo':
+	from chatterbox.tts_turbo import ChatterboxTurboTTS as Model
+else:
+	from chatterbox.tts import ChatterboxTTS as Model
+
+cached = find_hf_cache(REPO_IDS[VARIANT])
+if cached:
+	log(f'loading from cache: {cached}')
+	model = Model.from_local(cached, device=device)
+else:
+	log('cache not found, downloading...')
+	model = Model.from_pretrained(device=device)
+
+log(f'ready on {device} ({time.time() - t0:.1f}s load time)')
+print('ready', flush=True)
+
+
+_wav_cache = {}
+
+def ensure_float32_wav(path):
+	"""Re-save audio as float32 mono WAV to work around librosa/numpy float64 issue.
+	Result is cached by input path so repeated calls with the same file are free."""
+	if path in _wav_cache:
+		return _wav_cache[path]
+	wav, sr = sf.read(path, dtype='float32', always_2d=True)
+	wav = wav.mean(axis=1)  # stereo → mono if needed
+	tmp = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
+	sf.write(tmp.name, wav, sr, subtype='FLOAT')
+	_wav_cache[path] = tmp.name
+	return tmp.name
+
+
+_SENTINEL = object()
+
+playback_queue = queue.Queue()
+
+
+def playback_worker():
+	"""Plays audio samples in order. Runs in its own thread."""
+	while True:
+		item = playback_queue.get()
+		if item is _SENTINEL:
+			break
+		samples = item
+		proc = subprocess.Popen(
+			['pacat', '--format=float32le', f'--rate={SAMPLE_RATE}', '--channels=1'],
+			stdin=subprocess.PIPE,
+		)
+		proc.stdin.write(samples.tobytes())
+		proc.stdin.close()
+		proc.wait()
+		playback_queue.task_done()
+
+
+playback_thread = threading.Thread(target=playback_worker, daemon=True)
+playback_thread.start()
+
+
+def generate(text, opts):
+	t1 = time.time()
+
+	if VARIANT == 'turbo':
+		kwargs = {
+			'temperature':        opts.get('temperature',        0.8),
+			'top_p':              opts.get('top_p',              0.95),
+			'top_k':              opts.get('top_k',              1000),
+			'repetition_penalty': opts.get('repetition_penalty', 1.2),
+			'min_p':              opts.get('min_p',              0.0),
+		}
+	else:
+		kwargs = {
+			'temperature':        opts.get('temperature',        0.8),
+			'top_p':              opts.get('top_p',              1.0),
+			'repetition_penalty': opts.get('repetition_penalty', 1.2),
+			'min_p':              opts.get('min_p',              0.05),
+			'exaggeration':       opts.get('exaggeration',       0.5),
+			'cfg_weight':         opts.get('cfg_weight',         0.5),
+		}
+
+	audio_prompt = opts.get('audio_prompt')
+	if audio_prompt:
+		kwargs['audio_prompt_path'] = ensure_float32_wav(audio_prompt)
+
+	with torch.inference_mode():
+		wav = model.generate(text, **kwargs)
+
+	samples = wav.squeeze(0).cpu().numpy().astype(np.float32)
+	elapsed = time.time() - t1
+	duration = len(samples) / SAMPLE_RATE
+	log(f'generated {duration:.1f}s audio in {elapsed:.1f}s  rtf={elapsed/duration:.2f}')
+
+	return samples
+
+
+_chime_cache = {}
+
+def load_chime(path):
+	if path in _chime_cache:
+		return _chime_cache[path]
+	samples, sr = sf.read(path, dtype='float32', always_2d=True)
+	samples = samples.mean(axis=1)  # stereo → mono
+	if sr != SAMPLE_RATE:
+		samples = _librosa.resample(samples, orig_sr=sr, target_sr=SAMPLE_RATE)
+	_chime_cache[path] = samples
+	return samples
+
+
+for line in sys.stdin:
+	line = line.strip()
+	if not line:
+		continue
+
+	try:
+		req = json.loads(line)
+	except json.JSONDecodeError:
+		req = {'text': line}
+
+	if 'preload' in req:
+		try:
+			load_chime(req['preload'])
+			log(f'preloaded chime: {req["preload"]}')
+		except Exception as e:
+			log(f'preload error: {e}')
+		print('ok', flush=True)
+		continue
+
+	if 'chime' in req:
+		try:
+			samples = load_chime(req['chime'])
+			playback_queue.put(samples)
+		except Exception as e:
+			log(f'chime error: {e}')
+			traceback.print_exc(file=sys.stderr)
+		print('ok', flush=True)
+		continue
+
+	text = req.pop('text', '')
+	opts = req
+
+	if not text:
+		print('ok', flush=True)
+		continue
+
+	try:
+		samples = generate(text, opts)
+		playback_queue.put(samples)
+	except Exception as e:
+		log(f'error: {e}')
+		traceback.print_exc(file=sys.stderr)
+
+	print('ok', flush=True)
+
+# Drain playback before exit
+playback_queue.put(_SENTINEL)
+playback_thread.join()