Merge pull request 'Convert chatterbox-server.py to HTTP server, add Node.js examples' (#1) from mikael-lovqvists-claude-agent/tts-server:http-server into main

Reviewed-on: #1
This commit was merged in pull request #1.
This commit is contained in:
2026-06-07 07:34:18 +00:00
4 changed files with 181 additions and 99 deletions

View File

@@ -1,13 +1,19 @@
#!/usr/bin/env -S bash -c 'exec "$(dirname "$0")/venv/bin/python3" "$0" "$@"' #!/usr/bin/env -S bash -c 'exec "$(dirname "$0")/venv/bin/python3" "$0" "$@"'
""" """
Chatterbox TTS server — keeps model loaded, reads JSON lines from stdin. Chatterbox TTS HTTP server — keeps model loaded, exposes a JSON HTTP API.
Protocol: Endpoints:
stdin: {"text": "...", "temperature": 0.8, "top_p": 0.95} POST /speak {"text": "...", "temperature": 0.8, "top_p": 0.95, "audio_prompt": "/path.wav"}
{"chime": "/path/to/file.wav"} POST /chime {"path": "/path/to/file.wav"}
{"preload": "/path/to/file.wav"} POST /preload {"path": "/path/to/file.wav"}
stdout: "ok\n" after each utterance is generated (playback may still be in progress)
stderr: status/timing messages All endpoints return {"status": "ok"} or {"status": "error", "message": "..."}.
Responses are sent after audio is queued for playback (not after playback finishes).
Environment:
TTS_PORT TCP port to listen on (default: 11500)
HF_TOKEN_FILE path to HuggingFace token file (default: ~/.secrets/hugging-face.token)
HF_HUB_CACHE path to HuggingFace hub cache (default: ~/.cache/huggingface/hub)
Usage: Usage:
./chatterbox-server.py ./chatterbox-server.py
@@ -28,6 +34,10 @@ import time
import queue import queue
import threading import threading
import subprocess import subprocess
import traceback
import tempfile
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from pathlib import Path
import numpy as np import numpy as np
TOKEN_FILE = os.environ.get('HF_TOKEN_FILE', os.path.expanduser('~/.secrets/hugging-face.token')) TOKEN_FILE = os.environ.get('HF_TOKEN_FILE', os.path.expanduser('~/.secrets/hugging-face.token'))
@@ -37,35 +47,33 @@ try:
except FileNotFoundError: except FileNotFoundError:
pass pass
def find_hf_cache(repo_id):
"""Return the local snapshot path if the model is already cached, else None."""
from pathlib import Path
cache_dir = Path(os.environ.get('HF_HUB_CACHE', os.path.expanduser('~/.cache/huggingface/hub')))
repo_dir = cache_dir / f"models--{repo_id.replace('/', '--')}" / 'snapshots' def find_hf_cache(repo_id):
cache_dir = Path(os.environ.get('HF_HUB_CACHE', os.path.expanduser('~/.cache/huggingface/hub')))
repo_dir = cache_dir / f"models--{repo_id.replace('/', '--')}" / 'snapshots'
if repo_dir.exists(): if repo_dir.exists():
snapshots = sorted(repo_dir.iterdir(), key=lambda p: p.stat().st_mtime) snapshots = sorted(repo_dir.iterdir(), key=lambda p: p.stat().st_mtime)
if snapshots: if snapshots:
return str(snapshots[-1]) return str(snapshots[-1])
return None return None
VARIANT = sys.argv[1] if len(sys.argv) > 1 else 'turbo'
VARIANT = sys.argv[1] if len(sys.argv) > 1 else 'turbo'
PORT = int(os.environ.get('TTS_PORT', 11500))
SAMPLE_RATE = 24000 SAMPLE_RATE = 24000
def log(msg): def log(msg):
print(f'[chatterbox] {msg}', file=sys.stderr, flush=True) print(f'[chatterbox] {msg}', file=sys.stderr, flush=True)
log(f'loading chatterbox-{VARIANT}...') log(f'loading chatterbox-{VARIANT}...')
t0 = time.time() t0 = time.time()
import tempfile
import traceback
import numpy as np
import torch import torch
import soundfile as sf import soundfile as sf
import librosa as _librosa import librosa as _librosa
# librosa.resample returns float64 in newer numpy — patch it to always return float32
_orig_resample = _librosa.resample _orig_resample = _librosa.resample
def _resample_float32(*args, **kwargs): def _resample_float32(*args, **kwargs):
return _orig_resample(*args, **kwargs).astype(np.float32) return _orig_resample(*args, **kwargs).astype(np.float32)
@@ -92,48 +100,53 @@ else:
model = Model.from_pretrained(device=device) model = Model.from_pretrained(device=device)
log(f'ready on {device} ({time.time() - t0:.1f}s load time)') log(f'ready on {device} ({time.time() - t0:.1f}s load time)')
print('ready', flush=True)
_wav_cache = {}
_chime_cache = {}
_gen_lock = threading.Lock()
_SENTINEL = object()
playback_queue = queue.Queue()
_wav_cache = {} def playback_worker():
while True:
item = playback_queue.get()
if item is _SENTINEL:
break
proc = subprocess.Popen(
['pacat', '--format=float32le', f'--rate={SAMPLE_RATE}', '--channels=1'],
stdin=subprocess.PIPE,
)
proc.stdin.write(item.tobytes())
proc.stdin.close()
proc.wait()
playback_queue.task_done()
threading.Thread(target=playback_worker, daemon=True).start()
def ensure_float32_wav(path): def ensure_float32_wav(path):
"""Re-save audio as float32 mono WAV to work around librosa/numpy float64 issue.
Result is cached by input path so repeated calls with the same file are free."""
if path in _wav_cache: if path in _wav_cache:
return _wav_cache[path] return _wav_cache[path]
wav, sr = sf.read(path, dtype='float32', always_2d=True) wav, sr = sf.read(path, dtype='float32', always_2d=True)
wav = wav.mean(axis=1) # stereo → mono if needed wav = wav.mean(axis=1)
tmp = tempfile.NamedTemporaryFile(suffix='.wav', delete=False) tmp = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
sf.write(tmp.name, wav, sr, subtype='FLOAT') sf.write(tmp.name, wav, sr, subtype='FLOAT')
_wav_cache[path] = tmp.name _wav_cache[path] = tmp.name
return tmp.name return tmp.name
_SENTINEL = object() def load_chime(path):
if path in _chime_cache:
playback_queue = queue.Queue() return _chime_cache[path]
samples, sr = sf.read(path, dtype='float32', always_2d=True)
samples = samples.mean(axis=1)
def playback_worker(): if sr != SAMPLE_RATE:
"""Plays audio samples in order. Runs in its own thread.""" samples = _librosa.resample(samples, orig_sr=sr, target_sr=SAMPLE_RATE)
while True: _chime_cache[path] = samples
item = playback_queue.get() return samples
if item is _SENTINEL:
break
samples = item
proc = subprocess.Popen(
['pacat', '--format=float32le', f'--rate={SAMPLE_RATE}', '--channels=1'],
stdin=subprocess.PIPE,
)
proc.stdin.write(samples.tobytes())
proc.stdin.close()
proc.wait()
playback_queue.task_done()
playback_thread = threading.Thread(target=playback_worker, daemon=True)
playback_thread.start()
def generate(text, opts): def generate(text, opts):
@@ -168,68 +181,74 @@ def generate(text, opts):
elapsed = time.time() - t1 elapsed = time.time() - t1
duration = len(samples) / SAMPLE_RATE duration = len(samples) / SAMPLE_RATE
log(f'generated {duration:.1f}s audio in {elapsed:.1f}s rtf={elapsed/duration:.2f}') log(f'generated {duration:.1f}s audio in {elapsed:.1f}s rtf={elapsed/duration:.2f}')
return samples return samples
_chime_cache = {} class Handler(BaseHTTPRequestHandler):
def send_json(self, data, status=200):
body = json.dumps(data).encode()
self.send_response(status)
self.send_header('Content-Type', 'application/json')
self.send_header('Content-Length', str(len(body)))
self.end_headers()
self.wfile.write(body)
def load_chime(path): def read_json(self):
if path in _chime_cache: length = int(self.headers.get('Content-Length', 0))
return _chime_cache[path] return json.loads(self.rfile.read(length))
samples, sr = sf.read(path, dtype='float32', always_2d=True)
samples = samples.mean(axis=1) # stereo → mono
if sr != SAMPLE_RATE:
samples = _librosa.resample(samples, orig_sr=sr, target_sr=SAMPLE_RATE)
_chime_cache[path] = samples
return samples
def do_POST(self):
for line in sys.stdin:
line = line.strip()
if not line:
continue
try:
req = json.loads(line)
except json.JSONDecodeError:
req = {'text': line}
if 'preload' in req:
try: try:
load_chime(req['preload']) req = self.read_json()
log(f'preloaded chime: {req["preload"]}') except Exception:
except Exception as e: self.send_json({'status': 'error', 'message': 'invalid JSON'}, 400)
log(f'preload error: {e}') return
print('ok', flush=True)
continue
if 'chime' in req: if self.path == '/speak':
try: text = req.pop('text', '')
samples = load_chime(req['chime']) if not text:
playback_queue.put(samples) self.send_json({'status': 'ok'})
except Exception as e: return
log(f'chime error: {e}') try:
traceback.print_exc(file=sys.stderr) with _gen_lock:
print('ok', flush=True) samples = generate(text, req)
continue playback_queue.put(samples)
self.send_json({'status': 'ok'})
except Exception as e:
traceback.print_exc(file=sys.stderr)
self.send_json({'status': 'error', 'message': str(e)}, 500)
text = req.pop('text', '') elif self.path == '/chime':
opts = req path = req.get('path', '')
try:
samples = load_chime(path)
playback_queue.put(samples)
self.send_json({'status': 'ok'})
except Exception as e:
traceback.print_exc(file=sys.stderr)
self.send_json({'status': 'error', 'message': str(e)}, 500)
if not text: elif self.path == '/preload':
print('ok', flush=True) path = req.get('path', '')
continue try:
load_chime(path)
log(f'preloaded: {path}')
self.send_json({'status': 'ok'})
except Exception as e:
self.send_json({'status': 'error', 'message': str(e)}, 500)
try: else:
samples = generate(text, opts) self.send_json({'status': 'error', 'message': 'not found'}, 404)
playback_queue.put(samples)
except Exception as e:
log(f'error: {e}')
traceback.print_exc(file=sys.stderr)
print('ok', flush=True) def log_message(self, fmt, *args):
log(fmt % args)
# Drain playback before exit
playback_queue.put(_SENTINEL) server = ThreadingHTTPServer(('', PORT), Handler)
playback_thread.join() log(f'listening on port {PORT}')
try:
server.serve_forever()
except KeyboardInterrupt:
pass
finally:
playback_queue.put(_SENTINEL)

22
examples/chime.mjs Normal file
View File

@@ -0,0 +1,22 @@
// Ask a running Chatterbox TTS server to play a chime WAV file.
// The WAV path is sent unchanged in the JSON body of POST /chime.
//
// Usage: node chime.mjs /path/to/chime.wav
// Environment: TTS_PORT selects the server port (falls back to 11500).

const port = process.env.TTS_PORT ?? '11500'
const [, , wavPath] = process.argv

// Without a path there is nothing to request; print usage and fail.
if (!wavPath) {
  console.error('usage: node chime.mjs /path/to/chime.wav')
  process.exit(1)
}

const endpoint = `http://localhost:${port}/chime`
const payload = JSON.stringify({ path: wavPath })

const response = await fetch(endpoint, {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: payload,
})

// Any status other than "ok" is reported with the server's message
// and turned into a non-zero exit code.
const reply = await response.json()
if (reply.status !== 'ok') {
  console.error('error:', reply.message)
  process.exit(1)
}

17
examples/speak.mjs Normal file
View File

@@ -0,0 +1,17 @@
// Send a line of text to a running Chatterbox TTS server (POST /speak).
//
// Usage: node speak.mjs "Hello world"
// Environment: TTS_PORT selects the server port (falls back to 11500).

const port = process.env.TTS_PORT ?? '11500'

// The first CLI argument is the text; a canned greeting is used when absent.
const [, , cliText] = process.argv
const text = cliText ?? 'Hello from Node.'

const endpoint = `http://localhost:${port}/speak`
const response = await fetch(endpoint, {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({ text }),
})

// Any status other than "ok" is reported with the server's message
// and turned into a non-zero exit code.
const reply = await response.json()
if (reply.status !== 'ok') {
  console.error('error:', reply.message)
  process.exit(1)
}

24
examples/voice-clone.mjs Normal file
View File

@@ -0,0 +1,24 @@
// Speak text through the Chatterbox TTS server with a reference WAV for
// voice cloning. The reference path is sent as `audio_prompt`; the server
// reads that path from its own filesystem, not the client's.
//
// Usage: node voice-clone.mjs /path/to/reference.wav "Text to speak"
// Environment: TTS_PORT selects the server port (falls back to 11500).

const port = process.env.TTS_PORT ?? '11500'
const [, , referenceWav, cliText] = process.argv
const text = cliText ?? 'Hello, this is a cloned voice.'

// The reference WAV is mandatory; the text has a default.
if (!referenceWav) {
  console.error('usage: node voice-clone.mjs /path/to/reference.wav "text"')
  process.exit(1)
}

const endpoint = `http://localhost:${port}/speak`
const payload = JSON.stringify({ text, audio_prompt: referenceWav })

const response = await fetch(endpoint, {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: payload,
})

// Any status other than "ok" is reported with the server's message
// and turned into a non-zero exit code.
const reply = await response.json()
if (reply.status !== 'ok') {
  console.error('error:', reply.message)
  process.exit(1)
}