115 lines
4.0 KiB
Python
115 lines
4.0 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""OpenAI-compatible TTS API proxy for Piper (Wyoming protocol)."""
|
||
|
|
import socket, json, struct, io, wave, os, sys
|
||
|
|
from http.server import HTTPServer, BaseHTTPRequestHandler
|
||
|
|
|
||
|
|
PIPER_HOST = os.environ.get("PIPER_HOST", "192.168.86.11")
|
||
|
|
PIPER_PORT = int(os.environ.get("PIPER_PORT", "10200"))
|
||
|
|
LISTEN_PORT = int(os.environ.get("LISTEN_PORT", "8951"))
|
||
|
|
|
||
|
|
def piper_synthesize(text):
    """Synthesize *text* with Piper over the Wyoming protocol.

    Connects to ``PIPER_HOST:PIPER_PORT``, sends a single ``synthesize``
    event and reads events until ``audio-stop`` arrives or the server closes
    the connection.

    Each event read here is a newline-terminated JSON header, optionally
    followed by ``data_length`` bytes of additional JSON and then
    ``payload_length`` bytes of binary payload. Event data found inline in
    the header's ``"data"`` field is honoured as well; values from the
    separate ``data_length`` section take precedence.

    Args:
        text: The text to speak.

    Returns:
        A ``(pcm, rate, width, channels)`` tuple. ``pcm`` is the
        concatenation of all ``audio-chunk`` payloads as ``bytes``. The
        format fields come from the ``audio-start`` event and fall back to
        22050 Hz, 2 bytes per sample, 1 channel when it does not supply them.

    Raises:
        OSError: If connecting, sending or receiving fails or times out
            (30 second socket timeout).
        ValueError: If the server sends a header or data section that is
            not valid JSON.
    """
    request = json.dumps({"type": "synthesize", "data": {"text": text}}) + "\n"

    # Collect chunks in a list and join once; repeated ``bytes +=`` copies
    # the whole buffer on every chunk.
    chunks = []
    rate, width, channels = 22050, 2, 1

    # Context managers guarantee the socket and its file wrapper are closed
    # even when a timeout or a malformed message raises mid-stream.
    with socket.create_connection((PIPER_HOST, PIPER_PORT), timeout=30) as sock:
        with sock.makefile("rb") as reader:
            sock.sendall(request.encode())

            while True:
                line = reader.readline()
                if not line:
                    # Server closed the connection without ``audio-stop``.
                    break

                header = json.loads(line.decode().strip())
                event_type = header.get("type")
                # ``or 0`` also covers an explicit JSON null for the lengths.
                data_len = header.get("data_length") or 0
                payload_len = header.get("payload_length") or 0

                event_data = header.get("data")
                if not isinstance(event_data, dict):
                    event_data = {}
                if data_len > 0:
                    # Always consume the section so the stream stays aligned,
                    # whatever the event type is.
                    event_data.update(json.loads(reader.read(data_len).decode()))

                if event_type == "audio-start":
                    rate = event_data.get("rate", 22050)
                    width = event_data.get("width", 2)
                    channels = event_data.get("channels", 1)

                if payload_len > 0:
                    # Payloads of other event types are read and discarded.
                    payload = reader.read(payload_len)
                    if event_type == "audio-chunk":
                        chunks.append(payload)

                if event_type == "audio-stop":
                    break

    return b"".join(chunks), rate, width, channels
|
||
|
|
|
||
|
|
def pcm_to_wav(pcm_data, rate, width, channels):
    """Return *pcm_data* wrapped in an in-memory WAV file.

    Args:
        pcm_data: Raw PCM sample bytes.
        rate: Sample rate in Hz.
        width: Sample width in bytes.
        channels: Number of audio channels.

    Returns:
        The complete WAV file (header followed by the samples) as ``bytes``.
    """
    container = io.BytesIO()
    # (nchannels, sampwidth, framerate, nframes, comptype, compname); the
    # frame count is left at 0 and filled in by the writer from the data.
    wav_params = (channels, width, rate, 0, "NONE", "not compressed")
    with wave.open(container, "wb") as writer:
        writer.setparams(wav_params)
        writer.writeframes(pcm_data)
    # The writer patches the header on close, so read the buffer afterwards.
    return container.getvalue()
|
||
|
|
|
||
|
|
class TTSHandler(BaseHTTPRequestHandler):
    """HTTP handler exposing Piper through an OpenAI-style speech endpoint.

    Routes:
        POST /v1/audio/speech  - JSON body with an ``input`` string; replies
                                 with ``audio/wav``.
        POST /v1/models, /health, /healthz      - status JSON.
        GET  /v1/models, /health, /healthz, /   - status JSON.
        Anything else                            - 404 with no body.
    """

    def _send_body(self, status, body, content_type="application/json"):
        """Send a complete response: status line, headers and *body* bytes."""
        self.send_response(status)
        self.send_header("Content-Type", content_type)
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def _send_not_found(self):
        """Send a bare 404 response without a body."""
        self.send_response(404)
        self.end_headers()

    def do_POST(self):
        """Handle the speech endpoint and the POST variants of the health routes."""
        if self.path not in ("/v1/audio/speech", "/v1/audio/speech/"):
            # Health check / model list
            if self.path in ("/v1/models", "/health", "/healthz"):
                self._send_body(200, b'{"status":"ok","provider":"piper"}')
            else:
                self._send_not_found()
            return

        # A bad Content-Length or malformed JSON is a client error; answer
        # with 400 instead of letting the exception drop the connection.
        try:
            content_len = int(self.headers.get('Content-Length', 0))
            # ``> 0`` so a negative length never reaches ``read(-1)``, which
            # would block until the client closes the connection.
            body = json.loads(self.rfile.read(content_len)) if content_len > 0 else {}
        except ValueError:  # also covers JSONDecodeError / UnicodeDecodeError
            self._send_body(400, b'{"error": "invalid JSON body"}')
            return

        # Valid JSON that is not an object (list, string, ...) has no "input".
        text = body.get("input", "") if isinstance(body, dict) else ""
        if not text:
            self._send_body(400, b'{"error": "missing input"}')
            return

        # Only synthesis is guarded: once the 200 headers are on the wire a
        # 500 can no longer be sent, so write failures must not land here.
        try:
            pcm, rate, width, channels = piper_synthesize(text)
            wav_data = pcm_to_wav(pcm, rate, width, channels)
        except Exception as e:  # request boundary: report any backend failure
            self._send_body(500, json.dumps({"error": str(e)}).encode())
            return

        self._send_body(200, wav_data, "audio/wav")

    def do_GET(self):
        """Answer health / model-list probes; every other path is a 404."""
        if self.path in ("/v1/models", "/health", "/healthz", "/"):
            self._send_body(200, b'{"status":"ok","provider":"piper-proxy"}')
        else:
            self._send_not_found()

    def log_message(self, format, *args):
        """Print the fully formatted log line to stdout with a proxy prefix.

        The signature (including the ``format`` name) is dictated by
        ``BaseHTTPRequestHandler``. Formatting with all of ``args`` keeps the
        status code in request logs and works when no args are passed.
        """
        print(f"[piper-proxy] {format % args}")
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Script entry point: announce the configuration, then serve until
    # interrupted.
    print(f"Piper OpenAI TTS proxy starting on :{LISTEN_PORT}")
    print(f" Piper backend: {PIPER_HOST}:{PIPER_PORT}")
    print(" Endpoint: POST /v1/audio/speech")
    # The context manager closes the listening socket on every exit path.
    with HTTPServer(("0.0.0.0", LISTEN_PORT), TTSHandler) as server:
        try:
            server.serve_forever()
        except KeyboardInterrupt:
            # Ctrl-C is the normal way to stop the proxy; exit without a
            # traceback.
            print("Piper OpenAI TTS proxy stopped")
|