claude-voice-experiment/bark-server.py

#!/usr/bin/env -S bash -c 'exec "$(dirname "$0")/venv/bin/python3" "$0" "$@"'
"""
Bark TTS server — keeps model loaded, reads JSON lines from stdin.

Protocol:
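  stdin:  one JSON line per request: {"text": "...", "voice": "v2/en_speaker_6"}
          ("voice" is optional; a line that is not valid JSON is spoken
          as plain text with the default voice)
  stdout: "ready\n" once the model has loaded, then "ok\n" after each
          request has been handled (utterance finished playing, empty
          text skipped, or an error logged to stderr)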
  stderr: status/timing messages

Usage:
  python bark-server.py [model] [voice]
  python bark-server.py suno/bark-small v2/en_speaker_3

Voices (English):
  v2/en_speaker_0  .. v2/en_speaker_9
  Speaker 6 = neutral/warm, 9 = expressive, 3 = deep male
"""

import os
import sys
import json
import time
import subprocess
import numpy as np

# Optional positional CLI arguments: model id first, default voice preset second.
MODEL      = sys.argv[1] if len(sys.argv) > 1 else 'suno/bark'
DEF_VOICE  = sys.argv[2] if len(sys.argv) > 2 else 'v2/en_speaker_6'
# Rate (Hz) handed to pacat and used to compute audio duration; assumed to
# match the rate of the audio the model produces.
SAMPLE_RATE = 24000

# Optional Hugging Face token. When the file exists and is non-empty its
# contents are exported as HF_TOKEN (overriding any value already in the
# environment). The token is best-effort: a missing file is silently fine,
# and an unreadable one is reported but must not stop the server starting.
TOKEN_FILE = os.path.expanduser('~/.secrets/hugging-face.token')
try:
    with open(TOKEN_FILE, encoding='utf-8') as f:
        _token = f.read().strip()
except FileNotFoundError:
    _token = ''
except (OSError, UnicodeDecodeError) as e:
    # e.g. permission denied, or ~/.secrets is not a directory
    print(f'[bark] ignoring unreadable token file {TOKEN_FILE}: {e}',
          file=sys.stderr, flush=True)
    _token = ''
if _token:
    # Never export an empty token: an empty file means "no token".
    os.environ['HF_TOKEN'] = _token

# Disable background safetensors conversion attempts — the HF API endpoint
# it calls has changed and now errors. The pytorch_model.bin files work fine.
# NOTE(review): confirm that SAFETENSORS_FAST_GPU is the variable that
# actually controls this; it is set before torch/transformers are imported.
os.environ['SAFETENSORS_FAST_GPU'] = '0'
# setdefault: respect a value the user has already exported.
os.environ.setdefault('TRANSFORMERS_NO_ADVISORY_WARNINGS', '1')

def log(msg):
    """Write *msg* to stderr with a ``[bark]`` prefix and flush at once.

    stdout carries the request/response protocol, so every status and
    timing message goes through here instead.
    """
    line = f'[bark] {msg}'
    print(line, file=sys.stderr, flush=True)

# ---------------------------------------------------------------------------
# One-time startup: load the processor and model once so they stay resident
# for every request read from stdin afterwards.
# ---------------------------------------------------------------------------
log(f'loading {MODEL}...')
t0 = time.time()

# Importing the heavy libraries here (not at the top of the file) means the
# "loading" message is logged first, the import time is included in the
# reported load time, and the environment variables set earlier in this file
# are already in place when the libraries are imported.
import torch
from transformers import AutoProcessor, BarkModel

# Use the GPU when one is available; half precision on GPU, full on CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
dtype  = torch.float16 if device == 'cuda' else torch.float32

if device == 'cuda':
    # TF32 is faster on Ampere (30xx, 40xx) for both matmul and convolutions
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32       = True

# use_safetensors=False asks transformers not to load safetensors weights.
processor = AutoProcessor.from_pretrained(MODEL)
model     = BarkModel.from_pretrained(MODEL, torch_dtype=dtype, use_safetensors=False).to(device)

# Bark's internal generation calls set both max_length and max_new_tokens simultaneously
# which causes a conflict warning and may cause early stopping. Clearing max_length
# lets max_new_tokens take sole control.
model.generation_config.max_length = None

# torch.compile gives a meaningful speedup on 3090 (adds ~30s first-run compile)
# Best-effort: if compilation setup fails the uncompiled model is kept.
# NOTE(review): torch.compile normally defers real compilation to the first
# call, so compile errors may surface later during generation rather than in
# this try block — confirm, and confirm that generate() benefits from it.
try:
    model = torch.compile(model, mode='reduce-overhead')
    log('torch.compile applied')
except Exception as e:
    log(f'torch.compile skipped: {e}')

log(f'ready on {device} ({time.time() - t0:.1f}s load time)')
# stdout is the protocol channel (everything else goes to stderr via log()).
print('ready', flush=True)  # signal to Node that we are ready


def speak(text, voice):
    """Generate speech for *text* with Bark and play it through ``pacat``.

    Blocks until playback has finished, so the caller can acknowledge the
    request only once the audio has actually been played.

    Args:
        text: Text to synthesize.
        voice: Voice preset handed to the processor as ``voice_preset``
            (for example ``'v2/en_speaker_6'``).

    Raises:
        Exception: Anything raised by the processor, the model, or by
            launching ``pacat`` (e.g. ``OSError`` when it is not installed)
            propagates to the caller. A failing ``pacat`` run is logged,
            not raised.
    """
    t1 = time.time()

    inputs = processor(text, voice_preset=voice, return_tensors='pt')

    # Set attention_mask if not provided by processor — without it the model
    # cannot distinguish padding from real tokens, which causes erratic generation
    # including leading filler sounds.
    if 'attention_mask' not in inputs:
        inputs['attention_mask'] = torch.ones_like(inputs['input_ids'])

    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.inference_mode():
        audio = model.generate(
            **inputs,
            semantic_temperature=0.3,
            coarse_temperature=0.3,
            fine_temperature=0.3,
        )

    # Flatten with reshape(-1) rather than squeeze(): squeeze() turns a
    # single-sample result into a 0-d array, which has no len(). '<f4' pins
    # little-endian float32 so the bytes always match --format=float32le.
    samples = audio.cpu().numpy().astype('<f4').reshape(-1)
    elapsed_gen = time.time() - t1

    if samples.size == 0:
        # Nothing to play, and the rtf computation would divide by zero.
        log(f'generated no audio in {elapsed_gen:.1f}s')
        return

    duration = samples.size / SAMPLE_RATE
    log(f'generated {duration:.1f}s audio in {elapsed_gen:.1f}s  rtf={elapsed_gen/duration:.2f}')

    # subprocess.run() feeds the samples, closes stdin and always waits for
    # pacat, even if pacat dies mid-write (no zombie, no leaked pipe).
    # pacat's stdout is discarded because our stdout carries the protocol.
    result = subprocess.run(
        ['pacat', '--format=float32le', f'--rate={SAMPLE_RATE}', '--channels=1'],
        input=samples.tobytes(),
        stdout=subprocess.DEVNULL,
    )
    if result.returncode != 0:
        log(f'pacat exited with status {result.returncode}')


def _parse_request(line):
    """Return ``(text, voice)`` for one stripped, non-empty input line.

    A JSON object supplies ``"text"`` and, optionally, ``"voice"``; a
    missing, null or empty voice falls back to ``DEF_VOICE``. Any other
    line — invalid JSON, or valid JSON that is not an object (a bare
    number, quoted string, list, ``null``) — is treated as plain text and
    spoken verbatim with the default voice.
    """
    try:
        req = json.loads(line)
    except json.JSONDecodeError:
        return line, DEF_VOICE

    # json.loads() happily returns ints, strings, lists, ...; only a dict
    # has .get(), and anything else used to crash the server.
    if not isinstance(req, dict):
        return line, DEF_VOICE

    return req.get('text', ''), req.get('voice') or DEF_VOICE


# Request loop: one request per stdin line, exactly one "ok" line on stdout
# per non-blank input line, whether the text was spoken, empty, or failed.
for line in sys.stdin:
    line = line.strip()
    if not line:
        continue

    text, voice = _parse_request(line)

    if text:
        # Top-level boundary: a failed utterance is logged and the server
        # keeps running so the client still receives its acknowledgement.
        try:
            speak(text, voice)
        except Exception as e:
            log(f'error: {e}')

    print('ok', flush=True)