Compare commits
2 Commits
bdae4c047f
...
26837bec6a
| Author | SHA1 | Date | |
|---|---|---|---|
| 26837bec6a | |||
| f6ff8c72e8 |
@@ -1,13 +1,19 @@
|
||||
#!/usr/bin/env -S bash -c 'exec "$(dirname "$0")/venv/bin/python3" "$0" "$@"'
|
||||
"""
|
||||
Chatterbox TTS server — keeps model loaded, reads JSON lines from stdin.
|
||||
Chatterbox TTS HTTP server — keeps model loaded, exposes a JSON HTTP API.
|
||||
|
||||
Protocol:
|
||||
stdin: {"text": "...", "temperature": 0.8, "top_p": 0.95}
|
||||
{"chime": "/path/to/file.wav"}
|
||||
{"preload": "/path/to/file.wav"}
|
||||
stdout: "ok\n" after each utterance is generated (playback may still be in progress)
|
||||
stderr: status/timing messages
|
||||
Endpoints:
|
||||
POST /speak {"text": "...", "temperature": 0.8, "top_p": 0.95, "audio_prompt": "/path.wav"}
|
||||
POST /chime {"path": "/path/to/file.wav"}
|
||||
POST /preload {"path": "/path/to/file.wav"}
|
||||
|
||||
All endpoints return {"status": "ok"} or {"status": "error", "message": "..."}.
|
||||
Responses are sent after audio is queued for playback (not after playback finishes).
|
||||
|
||||
Environment:
|
||||
TTS_PORT TCP port to listen on (default: 11500)
|
||||
HF_TOKEN_FILE path to HuggingFace token file (default: ~/.secrets/hugging-face.token)
|
||||
HF_HUB_CACHE path to HuggingFace hub cache (default: ~/.cache/huggingface/hub)
|
||||
|
||||
Usage:
|
||||
./chatterbox-server.py
|
||||
@@ -28,6 +34,10 @@ import time
|
||||
import queue
|
||||
import threading
|
||||
import subprocess
|
||||
import traceback
|
||||
import tempfile
|
||||
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
|
||||
TOKEN_FILE = os.environ.get('HF_TOKEN_FILE', os.path.expanduser('~/.secrets/hugging-face.token'))
|
||||
@@ -37,11 +47,9 @@ try:
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
|
||||
def find_hf_cache(repo_id):
|
||||
"""Return the local snapshot path if the model is already cached, else None."""
|
||||
from pathlib import Path
|
||||
cache_dir = Path(os.environ.get('HF_HUB_CACHE', os.path.expanduser('~/.cache/huggingface/hub')))
|
||||
|
||||
def find_hf_cache(repo_id):
|
||||
cache_dir = Path(os.environ.get('HF_HUB_CACHE', os.path.expanduser('~/.cache/huggingface/hub')))
|
||||
repo_dir = cache_dir / f"models--{repo_id.replace('/', '--')}" / 'snapshots'
|
||||
if repo_dir.exists():
|
||||
snapshots = sorted(repo_dir.iterdir(), key=lambda p: p.stat().st_mtime)
|
||||
@@ -49,23 +57,23 @@ def find_hf_cache(repo_id):
|
||||
return str(snapshots[-1])
|
||||
return None
|
||||
|
||||
|
||||
VARIANT = sys.argv[1] if len(sys.argv) > 1 else 'turbo'
|
||||
PORT = int(os.environ.get('TTS_PORT', 11500))
|
||||
SAMPLE_RATE = 24000
|
||||
|
||||
|
||||
def log(msg):
    """Write *msg* to stderr as one '[chatterbox] '-prefixed line and flush."""
    sys.stderr.write(f'[chatterbox] {msg}\n')
    sys.stderr.flush()
|
||||
|
||||
|
||||
log(f'loading chatterbox-{VARIANT}...')
|
||||
t0 = time.time()
|
||||
|
||||
import tempfile
|
||||
import traceback
|
||||
import numpy as np
|
||||
import torch
|
||||
import soundfile as sf
|
||||
import librosa as _librosa
|
||||
|
||||
# librosa.resample returns float64 in newer numpy — patch it to always return float32
|
||||
_orig_resample = _librosa.resample
|
||||
def _resample_float32(*args, **kwargs):
    """Call the original librosa resampler and cast its result to float32.

    All positional and keyword arguments are forwarded unchanged to
    ``_orig_resample``.
    """
    resampled = _orig_resample(*args, **kwargs)
    return resampled.astype(np.float32)
|
||||
@@ -92,48 +100,53 @@ else:
|
||||
model = Model.from_pretrained(device=device)
|
||||
|
||||
log(f'ready on {device} ({time.time() - t0:.1f}s load time)')
|
||||
print('ready', flush=True)
|
||||
|
||||
|
||||
_wav_cache = {}
|
||||
_chime_cache = {}
|
||||
_gen_lock = threading.Lock()
|
||||
|
||||
_SENTINEL = object()
|
||||
playback_queue = queue.Queue()
|
||||
|
||||
|
||||
def playback_worker():
    """Play queued audio buffers one at a time, in the order they were queued.

    Intended to run in its own thread. Blocks on ``playback_queue``; each item
    is written as raw bytes to the stdin of a fresh ``pacat`` process
    (float32le, ``SAMPLE_RATE`` Hz, one channel), and the worker waits for
    that process to exit before taking the next item. Receiving ``_SENTINEL``
    ends the loop.

    A failed playback (``pacat`` missing, broken pipe, ...) is logged and
    skipped so that one bad clip cannot kill the only playback thread, and
    ``task_done()`` is called for every item taken from the queue, including
    the sentinel, so ``playback_queue.join()`` cannot hang.
    """
    command = ['pacat', '--format=float32le', f'--rate={SAMPLE_RATE}', '--channels=1']
    while True:
        item = playback_queue.get()
        try:
            if item is _SENTINEL:
                break
            # Popen's context manager closes stdin and waits for the child on
            # exit, even when the write raises, so no pacat process is left
            # un-reaped.
            with subprocess.Popen(command, stdin=subprocess.PIPE) as proc:
                proc.stdin.write(item.tobytes())
        except Exception as e:
            log(f'playback error: {e}')
            traceback.print_exc(file=sys.stderr)
        finally:
            # Runs on the sentinel `break` as well.
            playback_queue.task_done()
|
||||
|
||||
|
||||
threading.Thread(target=playback_worker, daemon=True).start()
|
||||
|
||||
|
||||
def ensure_float32_wav(path):
    """Re-save audio as a float32 mono WAV and return the new file's path.

    Works around the librosa/numpy float64 issue. The audio at *path* is read
    as float32, its channels are averaged into one, and the result is written
    with subtype ``FLOAT`` to a new temporary ``.wav`` file that is NOT
    deleted automatically.

    The result is cached in ``_wav_cache`` by input path, so repeated calls
    with the same *path* return the same temporary file without re-reading.

    Raises whatever ``sf.read`` / ``sf.write`` raise; if the write fails the
    temporary file is removed and nothing is cached.
    """
    if path in _wav_cache:
        return _wav_cache[path]

    wav, sr = sf.read(path, dtype='float32', always_2d=True)
    wav = wav.mean(axis=1)  # average the channel axis → mono

    # Only the unique name is needed. Close our own handle before soundfile
    # opens the same path, instead of leaving it open until garbage collection.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        out_path = tmp.name

    try:
        sf.write(out_path, wav, sr, subtype='FLOAT')
    except Exception:
        os.unlink(out_path)  # don't leave an orphaned temp file behind
        raise

    _wav_cache[path] = out_path
    return out_path
|
||||
|
||||
|
||||
_SENTINEL = object()
|
||||
|
||||
playback_queue = queue.Queue()
|
||||
|
||||
|
||||
def playback_worker():
|
||||
"""Plays audio samples in order. Runs in its own thread."""
|
||||
while True:
|
||||
item = playback_queue.get()
|
||||
if item is _SENTINEL:
|
||||
break
|
||||
samples = item
|
||||
proc = subprocess.Popen(
|
||||
['pacat', '--format=float32le', f'--rate={SAMPLE_RATE}', '--channels=1'],
|
||||
stdin=subprocess.PIPE,
|
||||
)
|
||||
proc.stdin.write(samples.tobytes())
|
||||
proc.stdin.close()
|
||||
proc.wait()
|
||||
playback_queue.task_done()
|
||||
|
||||
|
||||
playback_thread = threading.Thread(target=playback_worker, daemon=True)
|
||||
playback_thread.start()
|
||||
def load_chime(path):
    """Return the audio at *path* as mono float32 samples at ``SAMPLE_RATE``.

    The file is read as float32, its channels are averaged into one, and it is
    resampled when its rate differs from ``SAMPLE_RATE``. Results are kept in
    ``_chime_cache`` keyed by *path*, so each file is decoded only once.
    """
    try:
        return _chime_cache[path]
    except KeyError:
        pass

    frames, file_rate = sf.read(path, dtype='float32', always_2d=True)
    mono = frames.mean(axis=1)
    if file_rate != SAMPLE_RATE:
        mono = _librosa.resample(mono, orig_sr=file_rate, target_sr=SAMPLE_RATE)

    _chime_cache[path] = mono
    return mono
|
||||
|
||||
|
||||
def generate(text, opts):
|
||||
@@ -168,68 +181,74 @@ def generate(text, opts):
|
||||
elapsed = time.time() - t1
|
||||
duration = len(samples) / SAMPLE_RATE
|
||||
log(f'generated {duration:.1f}s audio in {elapsed:.1f}s rtf={elapsed/duration:.2f}')
|
||||
|
||||
return samples
|
||||
|
||||
|
||||
_chime_cache = {}
|
||||
class Handler(BaseHTTPRequestHandler):
    """JSON-over-HTTP front end for the TTS server.

    Every route is a POST whose body is a JSON object, and every response is
    a JSON object: ``{"status": "ok"}`` or
    ``{"status": "error", "message": "..."}``.

    Routes:
        /speak    generate speech for ``text`` and queue it for playback; the
                  remaining fields of the body are passed to ``generate`` as
                  options
        /chime    load the WAV at ``path`` and queue it for playback
        /preload  load the WAV at ``path`` into the chime cache only
    """

    def send_json(self, data, status=200):
        """Send *data* as a complete JSON response with HTTP code *status*."""
        body = json.dumps(data).encode()
        self.send_response(status)
        self.send_header('Content-Type', 'application/json')
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def read_json(self):
        """Read ``Content-Length`` bytes of request body and parse them as JSON.

        Raises:
            ValueError: the Content-Length header is not a non-negative
                integer, or the body is not valid JSON
                (``json.JSONDecodeError`` is a ``ValueError``).
        """
        length = int(self.headers.get('Content-Length', 0))
        if length < 0:
            # rfile.read() with a negative size reads until EOF, which would
            # block on a connection the client keeps open.
            raise ValueError('negative Content-Length')
        return json.loads(self.rfile.read(length))

    def do_POST(self):
        """Parse the JSON body and dispatch on ``self.path``."""
        try:
            req = self.read_json()
        except Exception:
            self.send_json({'status': 'error', 'message': 'invalid JSON'}, 400)
            return

        # Valid JSON that is not an object (list, string, number, null) has no
        # .pop/.get with the semantics the routes need; reject it here rather
        # than crash and leave the client without a response.
        if not isinstance(req, dict):
            self.send_json(
                {'status': 'error', 'message': 'request body must be a JSON object'},
                400,
            )
            return

        if self.path == '/speak':
            self._speak(req)
        elif self.path == '/chime':
            self._chime(req)
        elif self.path == '/preload':
            self._preload(req)
        else:
            self.send_json({'status': 'error', 'message': 'not found'}, 404)

    def _speak(self, req):
        """Generate speech for ``req['text']`` and queue it for playback."""
        text = req.pop('text', '')
        if not text:
            # Nothing to say: succeed without running generation.
            self.send_json({'status': 'ok'})
            return
        try:
            # Requests are handled on separate threads; only one may run
            # generation at a time.
            with _gen_lock:
                samples = generate(text, req)
            playback_queue.put(samples)
            self.send_json({'status': 'ok'})
        except Exception as e:
            self._send_failure('error', e)

    def _chime(self, req):
        """Load the WAV at ``req['path']`` and queue it for playback."""
        path = req.get('path', '')
        try:
            samples = load_chime(path)
            playback_queue.put(samples)
            self.send_json({'status': 'ok'})
        except Exception as e:
            self._send_failure('chime error', e)

    def _preload(self, req):
        """Load the WAV at ``req['path']`` into the chime cache without playing it."""
        path = req.get('path', '')
        try:
            load_chime(path)
            log(f'preloaded: {path}')
            self.send_json({'status': 'ok'})
        except Exception as e:
            self._send_failure('preload error', e)

    def _send_failure(self, label, exc):
        """Log *exc* with a traceback and answer 500. Call from an ``except`` block."""
        log(f'{label}: {exc}')
        traceback.print_exc(file=sys.stderr)
        self.send_json({'status': 'error', 'message': str(exc)}, 500)

    def log_message(self, fmt, *args):
        """Route the base class's request logging through ``log``."""
        log(fmt % args)
|
||||
|
||||
|
||||
# NOTE(review): '' binds to every interface, and the endpoints accept
# filesystem paths from the request body. Confirm that exposure beyond
# localhost is intended.
server = ThreadingHTTPServer(('', PORT), Handler)
log(f'listening on port {PORT}')
try:
    server.serve_forever()
except KeyboardInterrupt:
    pass  # Ctrl-C is the expected way to stop the server
finally:
    # Release the listening socket explicitly rather than relying on
    # interpreter teardown.
    server.server_close()
    # Ask the playback worker to stop.
    playback_queue.put(_SENTINEL)
|
||||
|
||||
22
examples/chime.mjs
Normal file
22
examples/chime.mjs
Normal file
@@ -0,0 +1,22 @@
|
||||
// Ask the Chatterbox TTS server to play a chime WAV file.
// The path is resolved by the server, so it must exist on the server's filesystem.
// Usage: node chime.mjs /path/to/chime.wav

const PORT = process.env.TTS_PORT ?? '11500'
const path = process.argv[2]

if (!path) {
  console.error('usage: node chime.mjs /path/to/chime.wav')
  process.exit(1)
}

const endpoint = `http://localhost:${PORT}/chime`
const payload = JSON.stringify({ path })

const res = await fetch(endpoint, {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: payload,
})

// The server answers {"status": "ok"} or {"status": "error", "message": "..."}.
const reply = await res.json()
if (reply.status !== 'ok') {
  console.error('error:', reply.message)
  process.exit(1)
}
|
||||
17
examples/speak.mjs
Normal file
17
examples/speak.mjs
Normal file
@@ -0,0 +1,17 @@
|
||||
// Send text to the Chatterbox TTS server to be spoken.
// Usage: node speak.mjs "Hello world"

const PORT = process.env.TTS_PORT ?? '11500'
const text = process.argv[2] ?? 'Hello from Node.'

const endpoint = `http://localhost:${PORT}/speak`
const payload = JSON.stringify({ text })

const res = await fetch(endpoint, {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: payload,
})

// The server answers {"status": "ok"} or {"status": "error", "message": "..."}.
const reply = await res.json()
if (reply.status !== 'ok') {
  console.error('error:', reply.message)
  process.exit(1)
}
|
||||
24
examples/voice-clone.mjs
Normal file
24
examples/voice-clone.mjs
Normal file
@@ -0,0 +1,24 @@
|
||||
// Speak text in a voice cloned from a reference WAV file.
// The server reads the audio_prompt path from its own filesystem.
// Usage: node voice-clone.mjs /path/to/reference.wav "Text to speak"

const PORT = process.env.TTS_PORT ?? '11500'
const audio_prompt = process.argv[2]
const text = process.argv[3] ?? 'Hello, this is a cloned voice.'

if (!audio_prompt) {
  console.error('usage: node voice-clone.mjs /path/to/reference.wav "text"')
  process.exit(1)
}

const endpoint = `http://localhost:${PORT}/speak`
const payload = JSON.stringify({ text, audio_prompt })

const res = await fetch(endpoint, {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: payload,
})

// The server answers {"status": "ok"} or {"status": "error", "message": "..."}.
const reply = await res.json()
if (reply.status !== 'ok') {
  console.error('error:', reply.message)
  process.exit(1)
}
|
||||
Reference in New Issue
Block a user