STT (Silero VAD + Whisper via sherpa-onnx), Chatterbox TTS HTTP server, query completeness classifier (Ollama), multi-voice demo scripts, and planning docs. Kept as reference; clean rewrite planned in separate repos. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
136 lines
4.1 KiB
Python
Executable File
136 lines
4.1 KiB
Python
Executable File
#!/usr/bin/env -S bash -c 'exec "$(dirname "$0")/venv/bin/python3" "$0" "$@"'
|
|
"""
|
|
Bark TTS server — keeps model loaded, reads JSON lines from stdin.
|
|
|
|
Protocol:
|
|
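stdin: one JSON line per request: {"text": "...", "voice": "v2/en_speaker_6"}; "voice" is optional (defaults to the voice given on the command line), and a line that is not valid JSON is spoken as plain text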
|
|
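stdout: "ready\n" once the model has loaded, then "ok\n" after each request (when playback has finished, or immediately for empty text or a failed request)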
|
|
stderr: status/timing messages
|
|
|
|
Usage:
|
|
python bark-server.py [model] [voice]
|
|
python bark-server.py suno/bark-small v2/en_speaker_3
|
|
|
|
Voices (English):
|
|
v2/en_speaker_0 .. v2/en_speaker_9
|
|
Speaker 6 = neutral/warm, 9 = expressive, 3 = deep male
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
import time
|
|
import subprocess
|
|
import numpy as np
|
|
|
|
# Command-line overrides: argv[1] picks the model, argv[2] the default voice.
MODEL = sys.argv[1] if len(sys.argv) >= 2 else 'suno/bark'
DEF_VOICE = sys.argv[2] if len(sys.argv) >= 3 else 'v2/en_speaker_6'

# Sample rate in Hz, used for the duration calculation and handed to pacat.
SAMPLE_RATE = 24000

# Export the Hugging Face token from the secrets file when that file exists;
# a missing file is silently tolerated.
TOKEN_FILE = os.path.expanduser('~/.secrets/hugging-face.token')
try:
    token_file = open(TOKEN_FILE)
except FileNotFoundError:
    pass
else:
    with token_file:
        os.environ['HF_TOKEN'] = token_file.read().strip()

# Original rationale: stop background safetensors conversion attempts, whose
# HF API endpoint now errors; the pytorch_model.bin files load fine.
# NOTE(review): confirm SAFETENSORS_FAST_GPU is the switch that governs this.
os.environ['SAFETENSORS_FAST_GPU'] = '0'

# Only set the advisory-warning switch when the caller has not chosen a value.
if 'TRANSFORMERS_NO_ADVISORY_WARNINGS' not in os.environ:
    os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'
|
|
|
|
def log(msg):
    """Write *msg* to stderr with a ``[bark]`` prefix and flush right away."""
    sys.stderr.write(f'[bark] {msg}\n')
    sys.stderr.flush()
|
|
|
|
log(f'loading {MODEL}...')
t0 = time.time()

# Imported here, after the environment variables above are set and the
# load timer has started.
import torch
from transformers import AutoProcessor, BarkModel

# Half precision on GPU, full precision on CPU.
if torch.cuda.is_available():
    device = 'cuda'
    dtype = torch.float16
    # Allow TF32 for matmul and cuDNN convolutions (original note: faster on
    # Ampere-class cards).
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
else:
    device = 'cpu'
    dtype = torch.float32

processor = AutoProcessor.from_pretrained(MODEL)
model = BarkModel.from_pretrained(MODEL, torch_dtype=dtype, use_safetensors=False)
model = model.to(device)

# Original rationale: Bark's internal generation sets max_length and
# max_new_tokens together, which triggers a conflict warning and may stop
# generation early; clearing max_length leaves max_new_tokens in charge.
model.generation_config.max_length = None

# Best-effort torch.compile (original note: meaningful speedup on a 3090 at
# the cost of ~30s first-run compile). Any failure is logged and the
# uncompiled model is used.
try:
    model = torch.compile(model, mode='reduce-overhead')
    log('torch.compile applied')
except Exception as err:
    log(f'torch.compile skipped: {err}')

log(f'ready on {device} ({time.time() - t0:.1f}s load time)')

# Readiness signal on stdout for the parent (Node) process.
sys.stdout.write('ready\n')
sys.stdout.flush()
|
|
|
|
|
|
def speak(text, voice):
    """Synthesize *text* with Bark and play it through ``pacat``.

    Blocks until ``pacat`` has exited, i.e. until playback is finished.

    Args:
        text: The text to speak.
        voice: Voice preset name handed to the processor as ``voice_preset``
            (for example ``'v2/en_speaker_6'``).

    Raises:
        Any exception raised by the processor, the model or ``subprocess``
        (e.g. ``FileNotFoundError`` when ``pacat`` is not installed) is
        propagated; the request loop catches and logs it.
    """
    t1 = time.time()

    inputs = processor(text, voice_preset=voice, return_tensors='pt')

    # Supply an all-ones attention mask when the processor did not return
    # one. Without it the model cannot tell padding from real tokens, which
    # causes erratic generation including leading filler sounds.
    if 'attention_mask' not in inputs:
        inputs['attention_mask'] = torch.ones_like(inputs['input_ids'])

    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.inference_mode():
        audio = model.generate(
            **inputs,
            semantic_temperature=0.3,
            coarse_temperature=0.3,
            fine_temperature=0.3,
        )

    # reshape(-1) instead of squeeze(): the result is always 1-D, so len()
    # works even when the model returns a single sample (squeeze() would
    # collapse that to a 0-d array).
    samples = audio.cpu().numpy().reshape(-1).astype(np.float32)
    elapsed_gen = time.time() - t1
    duration = len(samples) / SAMPLE_RATE

    # Nothing to play, and the rtf division below would divide by zero.
    if duration == 0:
        log(f'generated no audio in {elapsed_gen:.1f}s, nothing to play')
        return

    log(f'generated {duration:.1f}s audio in {elapsed_gen:.1f}s  rtf={elapsed_gen/duration:.2f}')

    # subprocess.run feeds the PCM bytes to pacat's stdin, closes the pipe
    # and waits for the process, so the child is always reaped even if it
    # exits before consuming all of its input.
    result = subprocess.run(
        ['pacat', '--format=float32le', f'--rate={SAMPLE_RATE}', '--channels=1'],
        input=samples.tobytes(),
    )
    if result.returncode != 0:
        log(f'pacat exited with status {result.returncode}')
|
|
|
|
|
|
def _parse_request(line):
    """Turn one stripped, non-empty stdin line into a ``(text, voice)`` pair.

    A JSON object supplies ``text`` (default ``''``) and ``voice`` (default
    ``DEF_VOICE``). Anything else is spoken verbatim with the default voice:
    both lines that are not valid JSON and lines that parse to a non-object
    value such as ``42``, ``"hi"`` or ``[1, 2]``.
    """
    try:
        req = json.loads(line)
    except json.JSONDecodeError:
        return line, DEF_VOICE

    # Valid JSON that is not an object has no .get(); treat it as plain text
    # instead of letting an AttributeError take the server down.
    if not isinstance(req, dict):
        return line, DEF_VOICE

    return req.get('text', ''), req.get('voice', DEF_VOICE)


# Request loop: one request per stdin line, one "ok" on stdout per request.
for line in sys.stdin:
    line = line.strip()
    if not line:
        continue

    text, voice = _parse_request(line)

    # Empty text is acknowledged without synthesizing anything.
    if text:
        try:
            speak(text, voice)
        except Exception as e:
            # Keep the server alive across per-request failures; the parent
            # still receives its "ok" so it does not wait forever.
            log(f'error: {e}')

    print('ok', flush=True)
|