Initial commit: OpenAI-compatible Piper TTS proxy

2026-03-23 12:22:54 +00:00
commit 60850d975e
2 changed files with 157 additions and 0 deletions
--- a/README.md
+++ b/README.md
@@ -0,0 +1,43 @@
+# piper-openai-proxy
+
+OpenAI-compatible TTS API proxy for [Piper](https://github.com/rhasspy/piper) via the [Wyoming protocol](https://github.com/rhasspy/wyoming).
+
+Any tool that speaks the OpenAI `/v1/audio/speech` API can use your local Piper instance without modification.
+
+## Usage
+
+```bash
+python3 server.py
+```
+
+Environment variables:
+- `PIPER_HOST` — Piper Wyoming host (default: `192.168.86.11`)
+- `PIPER_PORT` — Piper Wyoming port (default: `10200`)
+- `LISTEN_PORT` — Proxy listen port (default: `8951`)
+
+## API
+
+```bash
+# Generate speech
+curl -X POST http://localhost:8951/v1/audio/speech \
+  -H "Content-Type: application/json" \
+  -d '{"input": "Hello world"}' \
+  -o speech.wav
+
+# Health check
+curl http://localhost:8951/health
+```
+
+## Requirements
+
+- Python 3.8+
+- No dependencies (stdlib only)
+- A running [Piper Wyoming server](https://github.com/rhasspy/wyoming-piper)
+
+## How it works
+
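+The proxy translates OpenAI TTS API requests into Wyoming protocol messages over a raw TCP socket. Piper generates the audio, and the proxy wraps the raw PCM in a WAV container and returns it. Only the `input` field of the request body is used; other OpenAI parameters (such as `model`, `voice`, or `response_format`) are ignored, and the response is always `audio/wav`.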
+
+## License
+
+MIT
--- a/server.py
+++ b/server.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+"""OpenAI-compatible TTS API proxy for Piper (Wyoming protocol)."""
+import socket, json, struct, io, wave, os, sys
+from http.server import HTTPServer, BaseHTTPRequestHandler
+
+PIPER_HOST = os.environ.get("PIPER_HOST", "192.168.86.11")
+PIPER_PORT = int(os.environ.get("PIPER_PORT", "10200"))
+LISTEN_PORT = int(os.environ.get("LISTEN_PORT", "8951"))
+
+def piper_synthesize(text):
+    """Send text to Piper via Wyoming protocol, return raw PCM + sample rate."""
+    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    sock.settimeout(30)
+    sock.connect((PIPER_HOST, PIPER_PORT))
+    f = sock.makefile('rb')
+
+    # Send synthesize event
+    sock.sendall((json.dumps({"type": "synthesize", "data": {"text": text}}) + "\n").encode())
+
+    audio_data = b""
+    rate = 22050
+    width = 2
+    channels = 1
+
+    while True:
+        line = f.readline()
+        if not line:
+            break
+        msg = json.loads(line.decode().strip())
+        data_len = msg.get("data_length", 0)
+        payload_len = msg.get("payload_length", 0)
+
+        if data_len > 0:
+            data_json = json.loads(f.read(data_len).decode())
+            if msg.get("type") == "audio-start":
+                rate = data_json.get("rate", 22050)
+                width = data_json.get("width", 2)
+                channels = data_json.get("channels", 1)
+
+        if payload_len > 0:
+            payload = f.read(payload_len)
+            if msg.get("type") == "audio-chunk":
+                audio_data += payload
+
+        if msg.get("type") == "audio-stop":
+            break
+
+    sock.close()
+    return audio_data, rate, width, channels
+
+def pcm_to_wav(pcm_data, rate, width, channels):
+    """Wrap raw PCM in a WAV container."""
+    buf = io.BytesIO()
+    with wave.open(buf, 'wb') as wf:
+        wf.setnchannels(channels)
+        wf.setsampwidth(width)
+        wf.setframerate(rate)
+        wf.writeframes(pcm_data)
+    return buf.getvalue()
+
+class TTSHandler(BaseHTTPRequestHandler):
+    def do_POST(self):
+        if self.path in ("/v1/audio/speech", "/v1/audio/speech/"):
+            content_len = int(self.headers.get('Content-Length', 0))
+            body = json.loads(self.rfile.read(content_len)) if content_len else {}
+
+            text = body.get("input", "")
+            if not text:
+                self.send_response(400)
+                self.end_headers()
+                self.wfile.write(b'{"error": "missing input"}')
+                return
+
+            try:
+                pcm, rate, width, channels = piper_synthesize(text)
+                wav_data = pcm_to_wav(pcm, rate, width, channels)
+
+                self.send_response(200)
+                self.send_header("Content-Type", "audio/wav")
+                self.send_header("Content-Length", str(len(wav_data)))
+                self.end_headers()
+                self.wfile.write(wav_data)
+            except Exception as e:
+                self.send_response(500)
+                self.end_headers()
+                self.wfile.write(json.dumps({"error": str(e)}).encode())
+        else:
+            # Health check / model list
+            if self.path in ("/v1/models", "/health", "/healthz"):
+                self.send_response(200)
+                self.end_headers()
+                self.wfile.write(b'{"status":"ok","provider":"piper"}')
+            else:
+                self.send_response(404)
+                self.end_headers()
+
+    def do_GET(self):
+        if self.path in ("/v1/models", "/health", "/healthz", "/"):
+            self.send_response(200)
+            self.end_headers()
+            self.wfile.write(b'{"status":"ok","provider":"piper-proxy"}')
+        else:
+            self.send_response(404)
+            self.end_headers()
+
+    def log_message(self, format, *args):
+        print(f"[piper-proxy] {args[0]}")
+
+if __name__ == "__main__":
+    print(f"Piper OpenAI TTS proxy starting on :{LISTEN_PORT}")
+    print(f"  Piper backend: {PIPER_HOST}:{PIPER_PORT}")
+    print(f"  Endpoint: POST /v1/audio/speech")
+    server = HTTPServer(("0.0.0.0", LISTEN_PORT), TTSHandler)
+    server.serve_forever()