Files
claude-voice-experiment/bark-server.py
mikael-lovqvists-claude-agent db8889aeed Initial commit — voice pipeline experiment
STT (Silero VAD + Whisper via sherpa-onnx), Chatterbox TTS HTTP server,
query completeness classifier (Ollama), multi-voice demo scripts, and
planning docs. Kept as reference; clean rewrite planned in separate repos.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-30 04:48:54 +00:00

136 lines
4.1 KiB
Python
Executable File

#!/usr/bin/env -S bash -c 'exec "$(dirname "$0")/venv/bin/python3" "$0" "$@"'
"""
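Bark TTS server — keeps the model loaded and reads one request per line from stdin.

Protocol:
  stdin:  one JSON line per request: {"text": "...", "voice": "v2/en_speaker_6"}
          ("voice" is optional; a line that is not valid JSON is spoken
          as plain text with the default voice)
  stdout: "ready\n" once the model has been loaded, then "ok\n" after each
          request has been handled (utterance finished playing, empty
          text, or an error that was logged to stderr)
  stderr: status/timing messages

Usage:
  python bark-server.py [model] [voice]
  python bark-server.py suno/bark-small v2/en_speaker_3

Voices (English):
  v2/en_speaker_0 .. v2/en_speaker_9
  Speaker 6 = neutral/warm, 9 = expressive, 3 = deep male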
"""
import os
import sys
import json
import time
import subprocess
import numpy as np
# Optional positional CLI overrides: argv[1] = model id, argv[2] = default voice.
MODEL = 'suno/bark' if len(sys.argv) < 2 else sys.argv[1]
DEF_VOICE = 'v2/en_speaker_6' if len(sys.argv) < 3 else sys.argv[2]

# Sample rate (Hz) used both for timing statistics and for playback.
SAMPLE_RATE = 24000

# Optional Hugging Face access token; when the file exists its stripped
# content is exported as HF_TOKEN, otherwise the environment is left alone.
TOKEN_FILE = os.path.expanduser('~/.secrets/hugging-face.token')
try:
    with open(TOKEN_FILE) as f:
        os.environ['HF_TOKEN'] = f.read().strip()
except FileNotFoundError:
    pass

# Intended to stop the background safetensors conversion attempts: the HF API
# endpoint they call has changed and now errors, while the pytorch_model.bin
# weights load fine.
# NOTE(review): confirm that SAFETENSORS_FAST_GPU really controls that
# behaviour for the installed transformers/safetensors versions.
os.environ.update(SAFETENSORS_FAST_GPU='0')

# Respect a value the user has already exported; otherwise silence the
# advisory warnings.
if 'TRANSFORMERS_NO_ADVISORY_WARNINGS' not in os.environ:
    os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'
def log(msg):
    """Emit *msg* on stderr as a single ``[bark]``-prefixed line.

    The stream is flushed right away so a parent process reading stderr
    sees status messages as they happen.
    """
    line = f'[bark] {msg}'
    print(line, file=sys.stderr, flush=True)
# --- Model start-up -------------------------------------------------------
# Everything below runs once at launch; the heavy imports are deliberately
# placed after the first log line so the user sees progress immediately and
# the measured load time includes them.
log(f'loading {MODEL}...')
t0 = time.time()

import torch
from transformers import AutoProcessor, BarkModel

if torch.cuda.is_available():
    # GPU path: half precision weights.
    device, dtype = 'cuda', torch.float16
    # Allow TF32 for matmuls and cuDNN convolutions; it is faster on GPUs
    # that support it (Ampere and newer).
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
else:
    # CPU fallback: full precision.
    device, dtype = 'cpu', torch.float32

processor = AutoProcessor.from_pretrained(MODEL)

# use_safetensors=False: load the pytorch_model.bin weights directly.
model = BarkModel.from_pretrained(
    MODEL,
    torch_dtype=dtype,
    use_safetensors=False,
)
model = model.to(device)

# Bark's internal generation calls set max_length and max_new_tokens at the
# same time, which triggers a conflict warning and may stop generation early.
# Dropping max_length leaves max_new_tokens as the only limit.
model.generation_config.max_length = None

# torch.compile is reported to give a meaningful speedup on a 3090 at the
# price of roughly 30s extra on the first run. Any failure here is logged and
# the uncompiled model is used instead.
try:
    model = torch.compile(model, mode='reduce-overhead')
    log('torch.compile applied')
except Exception as exc:
    log(f'torch.compile skipped: {exc}')

log(f'ready on {device} ({time.time() - t0:.1f}s load time)')

# Tell the parent process (Node) on stdout that requests can now be sent.
print('ready', flush=True)
def speak(text, voice):
    """Synthesize *text* with Bark and play it through ``pacat``.

    The call blocks until ``pacat`` has exited, i.e. until the utterance has
    finished playing. Generation time, audio length and the real-time factor
    are reported via ``log``.

    Args:
        text: Text to speak.
        voice: Bark voice preset passed to the processor, e.g.
            ``'v2/en_speaker_6'``.

    Raises:
        Exception: whatever the processor, the model or ``subprocess`` raise
            (for example ``FileNotFoundError`` when ``pacat`` is missing).
            Nothing is swallowed here; the caller decides how to report it.
    """
    t1 = time.time()
    inputs = processor(text, voice_preset=voice, return_tensors='pt')
    # Set attention_mask if the processor did not provide one — without it
    # the model cannot distinguish padding from real tokens, which causes
    # erratic generation including leading filler sounds.
    if 'attention_mask' not in inputs:
        inputs['attention_mask'] = torch.ones_like(inputs['input_ids'])
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.inference_mode():
        audio = model.generate(
            **inputs,
            semantic_temperature=0.3,
            coarse_temperature=0.3,
            fine_temperature=0.3,
        )

    # Flatten with reshape(-1) rather than squeeze(): squeeze() turns a
    # one-sample result into a 0-d array, on which len() fails. The explicit
    # little-endian dtype matches pacat's --format=float32le on any host.
    samples = audio.cpu().numpy().astype(np.dtype('<f4')).reshape(-1)

    elapsed_gen = time.time() - t1
    duration = len(samples) / SAMPLE_RATE
    if duration <= 0:
        # Nothing to play, and the real-time factor would divide by zero.
        log(f'generated no audio in {elapsed_gen:.1f}s')
        return
    log(f'generated {duration:.1f}s audio in {elapsed_gen:.1f}s rtf={elapsed_gen/duration:.2f}')

    # subprocess.run feeds the samples to pacat's stdin, tolerates pacat
    # closing the pipe early, and always reaps the child (killing it if we
    # are interrupted), so no zombie or orphaned player is left behind.
    result = subprocess.run(
        ['pacat', '--format=float32le', f'--rate={SAMPLE_RATE}', '--channels=1'],
        input=samples.tobytes(),
    )
    if result.returncode != 0:
        log(f'pacat exited with status {result.returncode}')
# Request loop: one request per stdin line. Every non-blank line is answered
# with exactly one "ok" on stdout, whether it was spoken, empty, or failed,
# so the client never waits forever for a reply.
for line in sys.stdin:
    line = line.strip()
    if not line:
        # Blank lines are not requests and get no reply.
        continue

    # Default interpretation: the whole line is plain text for the default
    # voice. This is overridden only when the line is a JSON object.
    text, voice = line, DEF_VOICE
    try:
        req = json.loads(line)
    except json.JSONDecodeError:
        pass
    else:
        # json.loads also accepts bare numbers, strings, booleans, null and
        # arrays (e.g. the input "123"). Those have no .get(), so only a
        # JSON object is treated as a structured request; anything else is
        # spoken as plain text instead of crashing the server.
        if isinstance(req, dict):
            text = req.get('text', '')
            voice = req.get('voice', DEF_VOICE)

    if text:
        try:
            speak(text, voice)
        except Exception as e:
            # Keep serving: report the failure and still acknowledge below.
            log(f'error: {e}')
    print('ok', flush=True)