#!/usr/bin/env -S bash -c 'exec "$(dirname "$0")/venv/bin/python3" "$0" "$@"'
"""
Bark TTS server — keeps model loaded, reads JSON lines from stdin.

Protocol:
  stdin:  one JSON line per request: {"text": "...", "voice": "v2/en_speaker_6"}
          (a line that is not a JSON object is spoken as plain text)
  stdout: "ready\n" once the model is loaded, then
          "ok\n" after each utterance is finished playing
  stderr: status/timing messages

Usage:
  python bark-server.py [model] [voice]
  python bark-server.py suno/bark-small v2/en_speaker_3

Voices (English): v2/en_speaker_0 .. v2/en_speaker_9
  Speaker 6 = neutral/warm, 9 = expressive, 3 = deep male
"""

import os
import sys
import json
import time
import subprocess

import numpy as np

MODEL = sys.argv[1] if len(sys.argv) > 1 else 'suno/bark'
DEF_VOICE = sys.argv[2] if len(sys.argv) > 2 else 'v2/en_speaker_6'
SAMPLE_RATE = 24000
TOKEN_FILE = os.path.expanduser('~/.secrets/hugging-face.token')

# Populated by load_model(); speak() reads them as module globals.
processor = None
model = None
device = None


def log(msg):
    """Write a '[bark]'-prefixed status line to stderr and flush it."""
    print(f'[bark] {msg}', file=sys.stderr, flush=True)


def configure_environment():
    """Export the Hugging Face token and library settings.

    Must run before `transformers` is imported so the settings are seen.
    A missing token file is not an error: HF_TOKEN is simply left unset.
    """
    try:
        with open(TOKEN_FILE) as f:
            os.environ['HF_TOKEN'] = f.read().strip()
    except FileNotFoundError:
        pass

    # Disable background safetensors conversion attempts — the HF API endpoint
    # it calls has changed and now errors. The pytorch_model.bin files work fine.
    # NOTE(review): confirm this variable is the one that controls that behavior.
    os.environ['SAFETENSORS_FAST_GPU'] = '0'
    os.environ.setdefault('TRANSFORMERS_NO_ADVISORY_WARNINGS', '1')


def load_model():
    """Load the Bark processor and model into the module globals.

    Sets `processor`, `model` and `device`, and logs the load time.
    """
    global processor, model, device

    log(f'loading {MODEL}...')
    t0 = time.time()

    # Imported here rather than at the top of the file so that the
    # "loading" message appears before the slow imports start.
    import torch
    from transformers import AutoProcessor, BarkModel

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    dtype = torch.float16 if device == 'cuda' else torch.float32

    if device == 'cuda':
        # TF32 is faster on Ampere (30xx, 40xx) for both matmul and convolutions
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True

    processor = AutoProcessor.from_pretrained(MODEL)
    model = BarkModel.from_pretrained(
        MODEL, torch_dtype=dtype, use_safetensors=False,
    ).to(device)

    # Bark's internal generation calls set both max_length and max_new_tokens
    # simultaneously, which causes a conflict warning and may cause early
    # stopping. Clearing max_length lets max_new_tokens take sole control.
    model.generation_config.max_length = None

    # torch.compile gives a meaningful speedup on 3090 (adds ~30s first-run compile)
    # NOTE(review): verify that generate() actually runs through the compiled
    # wrapper and not the original module.
    try:
        model = torch.compile(model, mode='reduce-overhead')
        log('torch.compile applied')
    except Exception as e:
        log(f'torch.compile skipped: {e}')

    log(f'ready on {device} ({time.time() - t0:.1f}s load time)')


def speak(text, voice):
    """Synthesize `text` with the given voice preset and play it via `pacat`.

    Blocks until playback has finished. Requires load_model() to have run.

    Raises:
        subprocess.CalledProcessError: if `pacat` exits with a non-zero status.
        FileNotFoundError: if `pacat` is not installed.
        Exception: anything raised by the processor or the model.
    """
    import torch

    t1 = time.time()
    inputs = processor(text, voice_preset=voice, return_tensors='pt')

    # Set attention_mask if not provided by processor — without it the model
    # cannot distinguish padding from real tokens, which causes erratic
    # generation including leading filler sounds.
    if 'attention_mask' not in inputs:
        inputs['attention_mask'] = torch.ones_like(inputs['input_ids'])

    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.inference_mode():
        audio = model.generate(
            **inputs,
            semantic_temperature=0.3,
            coarse_temperature=0.3,
            fine_temperature=0.3,
        )

    # Flatten to 1-D (reshape, unlike squeeze, never yields a 0-d array) and
    # force little-endian float32 to match the --format passed to pacat.
    samples = audio.cpu().numpy().reshape(-1).astype('<f4')
    elapsed_gen = time.time() - t1

    if samples.size == 0:
        # Nothing to play, and the rtf computation below would divide by zero.
        log(f'generated no audio in {elapsed_gen:.1f}s')
        return

    duration = len(samples) / SAMPLE_RATE
    log(f'generated {duration:.1f}s audio in {elapsed_gen:.1f}s  '
        f'rtf={elapsed_gen/duration:.2f}')

    # subprocess.run feeds stdin, waits for exit and reaps the child even when
    # the write fails; check=True surfaces a failed playback to the caller.
    subprocess.run(
        ['pacat', '--format=float32le', f'--rate={SAMPLE_RATE}', '--channels=1'],
        input=samples.tobytes(),
        check=True,
    )


def parse_request(line):
    """Turn one stripped, non-empty input line into a ``(text, voice)`` pair.

    * JSON object: its 'text' (default '') and 'voice' (default DEF_VOICE).
    * JSON string: the decoded string, spoken with DEF_VOICE.
    * Anything else (invalid JSON, numbers, arrays, booleans, null): the raw
      line is spoken as plain text with DEF_VOICE.
    """
    try:
        req = json.loads(line)
    except json.JSONDecodeError:
        return line, DEF_VOICE

    if isinstance(req, dict):
        return req.get('text', ''), req.get('voice', DEF_VOICE)
    if isinstance(req, str):
        return req, DEF_VOICE
    # Valid JSON but not an object: calling .get() on it would raise.
    return line, DEF_VOICE


def main():
    """Load the model, announce readiness, then serve requests from stdin."""
    configure_environment()
    load_model()
    print('ready', flush=True)  # signal to Node that we are ready

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue

        text, voice = parse_request(line)
        if text:
            try:
                speak(text, voice)
            except Exception as e:
                # Keep serving: one failed utterance must not stop the server.
                log(f'error: {e}')

        # Always acknowledge so the client never waits forever.
        print('ok', flush=True)


if __name__ == '__main__':
    main()