#!/usr/bin/env python3
"""OpenAI-compatible TTS API proxy for Piper (Wyoming protocol)."""

import io
import json
import os
import socket
import struct
import sys
import wave
from http.server import BaseHTTPRequestHandler, HTTPServer

PIPER_HOST = os.environ.get("PIPER_HOST", "192.168.86.11")
PIPER_PORT = int(os.environ.get("PIPER_PORT", "10200"))
LISTEN_PORT = int(os.environ.get("LISTEN_PORT", "8951"))

# Socket timeout (seconds) applied to connect, send and every read from Piper.
_PIPER_TIMEOUT_SECONDS = 30

# Audio format assumed until an "audio-start" event says otherwise.
_DEFAULT_RATE = 22050
_DEFAULT_WIDTH = 2
_DEFAULT_CHANNELS = 1


def _read_exact(stream, size):
    """Read exactly *size* bytes from *stream*.

    Raises:
        ConnectionError: the stream ended before *size* bytes arrived, i.e.
            the peer closed the connection in the middle of an event.
    """
    data = stream.read(size)
    if len(data) != size:
        raise ConnectionError(
            "Piper closed the connection mid-event "
            f"(expected {size} bytes, got {len(data)})"
        )
    return data


def _read_wyoming_audio(stream):
    """Collect the audio events from a binary stream of Wyoming events.

    Each event is one JSON header line, optionally followed by
    ``data_length`` bytes of JSON and then ``payload_length`` bytes of raw
    payload. Reading stops at an "audio-stop" event or at end of stream.

    Args:
        stream: binary file-like object providing ``readline`` and ``read``.

    Returns:
        Tuple ``(pcm_bytes, rate, width, channels)``. The format values come
        from the "audio-start" event, falling back to 22050 / 2 / 1.

    Raises:
        ConnectionError: the stream ended in the middle of an event.
        ValueError: a header or data section was not valid JSON.
    """
    chunks = []
    rate, width, channels = _DEFAULT_RATE, _DEFAULT_WIDTH, _DEFAULT_CHANNELS

    while True:
        line = stream.readline()
        if not line:
            # Stream ended on an event boundary: return what was collected.
            break

        header = json.loads(line.decode().strip())
        event_type = header.get("type")
        # "or 0" also covers an explicit JSON null for either length.
        data_len = header.get("data_length") or 0
        payload_len = header.get("payload_length") or 0

        # Event data may be inline in the header, in the extra data section,
        # or both; the extra section wins on conflicting keys.
        event_data = header.get("data") or {}
        if data_len > 0:
            extra = json.loads(_read_exact(stream, data_len).decode())
            event_data = {**event_data, **extra}

        # Always consume the payload, even for event types that are ignored,
        # so the stream stays aligned on the next header line.
        payload = _read_exact(stream, payload_len) if payload_len > 0 else b""

        if event_type == "audio-start":
            rate = event_data.get("rate", _DEFAULT_RATE)
            width = event_data.get("width", _DEFAULT_WIDTH)
            channels = event_data.get("channels", _DEFAULT_CHANNELS)
        elif event_type == "audio-chunk":
            chunks.append(payload)
        elif event_type == "audio-stop":
            break

    # Join once at the end instead of growing a bytes object per chunk.
    return b"".join(chunks), rate, width, channels


def piper_synthesize(text):
    """Send text to Piper via Wyoming protocol, return raw PCM + format.

    Args:
        text: the text to synthesize.

    Returns:
        Tuple ``(pcm_bytes, rate, width, channels)``.

    Raises:
        OSError: connecting to or talking to Piper failed (includes timeouts
            and a connection closed mid-event).
        ValueError: Piper sent something that is not valid JSON.
    """
    request = json.dumps({"type": "synthesize", "data": {"text": text}}) + "\n"
    # Context managers close the socket and its file wrapper on every path,
    # including exceptions raised while connecting or parsing.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.settimeout(_PIPER_TIMEOUT_SECONDS)
        sock.connect((PIPER_HOST, PIPER_PORT))
        sock.sendall(request.encode())
        with sock.makefile("rb") as stream:
            return _read_wyoming_audio(stream)


def pcm_to_wav(pcm_data, rate, width, channels):
    """Wrap raw PCM in a WAV container.

    Args:
        pcm_data: raw PCM frames as bytes.
        rate: sample rate in Hz.
        width: sample width in bytes.
        channels: number of audio channels.

    Returns:
        The complete WAV file contents as bytes.
    """
    buf = io.BytesIO()
    with wave.open(buf, "wb") as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(width)
        wf.setframerate(rate)
        wf.writeframes(pcm_data)
    return buf.getvalue()


class TTSHandler(BaseHTTPRequestHandler):
    """HTTP handler exposing ``POST /v1/audio/speech`` plus health endpoints."""

    def _send_json(self, status, body):
        """Send *body* (already-encoded JSON bytes) with the given status."""
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def _send_not_found(self):
        """Send an empty 404 response."""
        self.send_response(404)
        self.end_headers()

    def _read_json_body(self):
        """Read and decode the request body as a JSON object.

        Returns:
            The decoded dict, or ``{}`` when there is no body.

        Raises:
            ValueError: Content-Length is not an integer, the body is not
                valid JSON, or the JSON value is not an object.
        """
        content_len = int(self.headers.get("Content-Length", 0))
        # A negative length would make read() block until the client hangs
        # up, so anything that is not positive is treated as "no body".
        if content_len <= 0:
            return {}
        body = json.loads(self.rfile.read(content_len))
        if not isinstance(body, dict):
            raise ValueError("request body must be a JSON object")
        return body

    def do_POST(self):
        """Synthesize speech for ``/v1/audio/speech``; health check otherwise."""
        if self.path not in ("/v1/audio/speech", "/v1/audio/speech/"):
            # Health check / model list
            if self.path in ("/v1/models", "/health", "/healthz"):
                self._send_json(200, b'{"status":"ok","provider":"piper"}')
            else:
                self._send_not_found()
            return

        try:
            body = self._read_json_body()
        except ValueError as exc:
            # Malformed client input is a 400, not an unhandled exception.
            error = {"error": f"invalid request body: {exc}"}
            self._send_json(400, json.dumps(error).encode())
            return

        text = body.get("input", "")
        if not text:
            self._send_json(400, b'{"error": "missing input"}')
            return

        # Only the backend call is guarded: once a 200 status line has been
        # sent, a failure must not produce a second (500) status line.
        try:
            pcm, rate, width, channels = piper_synthesize(text)
            wav_data = pcm_to_wav(pcm, rate, width, channels)
        except Exception as e:
            self._send_json(500, json.dumps({"error": str(e)}).encode())
            return

        self.send_response(200)
        self.send_header("Content-Type", "audio/wav")
        self.send_header("Content-Length", str(len(wav_data)))
        self.end_headers()
        self.wfile.write(wav_data)

    def do_GET(self):
        """Answer health-check / model-list probes."""
        if self.path in ("/v1/models", "/health", "/healthz", "/"):
            self._send_json(200, b'{"status":"ok","provider":"piper-proxy"}')
        else:
            self._send_not_found()

    def log_message(self, format, *args):
        """Print the fully formatted log line with a ``[piper-proxy]`` prefix."""
        # Formatting with all args keeps status codes and error text; printing
        # only args[0] reduced error logs to a bare number.
        message = format % args if args else format
        print(f"[piper-proxy] {message}")


def main():
    """Start the proxy and serve until interrupted."""
    print(f"Piper OpenAI TTS proxy starting on :{LISTEN_PORT}")
    print(f" Piper backend: {PIPER_HOST}:{PIPER_PORT}")
    print(" Endpoint: POST /v1/audio/speech")
    # The context manager closes the listening socket on exit.
    with HTTPServer(("0.0.0.0", LISTEN_PORT), TTSHandler) as server:
        try:
            server.serve_forever()
        except KeyboardInterrupt:
            print("[piper-proxy] shutting down")


if __name__ == "__main__":
    main()