diff --git a/docs/05_usage.md b/docs/05_usage.md
index 98648a6..ebcaf78 100644
--- a/docs/05_usage.md
+++ b/docs/05_usage.md
@@ -49,52 +49,47 @@
# {"status":"ok","backend":"f5_tts"}
```
-## Пример Python-клиента
+## Клиенты
-```python
-import asyncio
-import base64
-import json
-import websockets
+В директории `examples/` лежат готовые клиенты:
+- `examples/client_python.py` — Python-клиент с воспроизведением через `sounddevice`.
+- `examples/client_browser.html` — HTML/JS клиент для браузера с `AudioContext`.
-async def main():
- uri = "ws://localhost:8765/ws"
- async with websockets.connect(uri) as ws:
- await ws.send(json.dumps({
- "type": "init",
- "session_id": "demo",
- "voice_ref": "voices/default_neutral.wav",
- "language": "ru",
- "speed": 1.0,
- "emotion": "neutral",
- "seq": 1,
- }))
+### Python-клиент
- for chunk in ["Привет, ", "как ", "дела?"]:
- await ws.send(json.dumps({
- "type": "text",
- "payload": chunk,
- "seq": 2,
- }))
- await asyncio.sleep(0.2)
-
- await ws.send(json.dumps({"type": "flush", "seq": 3}))
-
- while True:
- msg = json.loads(await ws.recv())
- print(msg)
- if msg["type"] == "audio":
- pcm = base64.b64decode(msg["data"])
- # отправить pcm на воспроизведение
- if msg["type"] == "status" and msg["event"] == "stopped":
- break
-
-
-if __name__ == "__main__":
- asyncio.run(main())
+```bash
+pip install websockets sounddevice numpy
+python examples/client_python.py --uri ws://localhost:8765/ws "Привет, мир!"
```
+Опции:
+
+```bash
+python examples/client_python.py \
+ --uri ws://localhost:8765/ws \
+ --voice-ref voices/rick_ref_clean.wav \
+ --language ru \
+ --speed 1.0 \
+ "Это тестовая фраза для проверки."
+```
+
+Клиент:
+1. Отправляет `init` с настройками.
+2. Разбивает текст на слова и шлёт их как потоковые `text`.
+3. Отправляет `flush`.
+4. Получает `audio`-чанки, декодирует base64 PCM16 и складывает в аудиобуфер.
+5. `sounddevice` воспроизводит аудио в реальном времени из callback.
+6. По `Ctrl+C` отправляет `stop` и выходит.
+
+### Браузерный клиент
+
+Откройте `examples/client_browser.html` в браузере, укажите URI сервера
+и нажмите **Connect**, затем **Speak streaming**. Браузер создаёт
+`AudioContext` на 24 кГц, декодирует PCM16 из base64 и ставит буферы
+в очередь для бесшовного воспроизведения. Кнопка **Stop** отправляет
+`stop` на сервер.
+
## Настройка через переменные окружения (.env)
| Переменная | Описание | По умолчанию |
diff --git a/examples/client_browser.html b/examples/client_browser.html
new file mode 100644
index 0000000..49f41c6
--- /dev/null
+++ b/examples/client_browser.html
@@ -0,0 +1,200 @@
+
+
+
+
+
+ Voice TTS WebSocket Client
+
+
+
+ Voice TTS WebSocket Client
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/examples/client_python.py b/examples/client_python.py
new file mode 100644
index 0000000..c891ee2
--- /dev/null
+++ b/examples/client_python.py
@@ -0,0 +1,216 @@
+"""Simple Python WebSocket client for the Voice TTS server.
+
+Streams text in chunks, receives base64 PCM audio and plays it via sounddevice.
+Install dependencies:
+    pip install websockets sounddevice numpy
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import base64
+import json
+import sys
+import threading
+from typing import Any
+
+import numpy as np
+import sounddevice as sd
+import websockets
+
+
+class VoiceTTSClient:
+    """WebSocket client that speaks LLM/agent output in real time.
+
+    The client sends ``init``/``text``/``flush``/``stop`` JSON messages,
+    decodes base64 ``audio`` messages into a byte buffer and lets a
+    ``sounddevice.RawOutputStream`` callback pull mono int16 PCM from it.
+
+    The byte buffer is shared between the asyncio side and the sounddevice
+    callback, which is invoked from the audio backend's own thread, so it is
+    guarded by a ``threading.Lock`` (an ``asyncio.Lock`` only serializes
+    coroutines and cannot protect against another thread).
+    """
+
+    def __init__(
+        self,
+        uri: str = "ws://localhost:8765/ws",
+        voice_ref: str | None = None,
+        voice_refs: dict[str, str] | None = None,
+        language: str = "ru",
+        speed: float = 1.0,
+        emotion: str = "neutral",
+        sample_rate: int = 24_000,
+        block_size: int = 2048,
+    ):
+        """Store the session settings; no I/O happens until ``connect()``.
+
+        Args:
+            uri: WebSocket endpoint of the TTS server.
+            voice_ref: Optional value sent as ``voice_ref`` in ``init``.
+            voice_refs: Optional mapping sent as ``voice_refs`` in ``init``.
+            language: Value sent as ``language`` in ``init``.
+            speed: Value sent as ``speed`` in ``init``.
+            emotion: Value sent as ``emotion`` in ``init``.
+            sample_rate: Sample rate used to open the output stream.
+            block_size: Block size (frames) used to open the output stream.
+        """
+        self.uri = uri
+        self.voice_ref = voice_ref
+        self.voice_refs = voice_refs or {}
+        self.language = language
+        self.speed = speed
+        self.emotion = emotion
+        self.sample_rate = sample_rate
+        self.block_size = block_size
+
+        self._ws: websockets.WebSocketClientProtocol | None = None
+        self._seq = 0
+        self._audio_buffer: bytearray = bytearray()
+        self._current_segment_seq: int | None = None
+        self._stream: sd.RawOutputStream | None = None
+        # Guards _audio_buffer against concurrent access from the audio
+        # callback thread. Being a threading.Lock it is also safe to create
+        # before any event loop exists.
+        self._lock = threading.Lock()
+        # Strong references to fire-and-forget send tasks; the event loop
+        # itself only keeps weak references to tasks.
+        self._pending_sends: set[asyncio.Task[Any]] = set()
+
+    def _next_seq(self) -> int:
+        """Return the next outgoing sequence number (1, 2, 3, ...)."""
+        self._seq += 1
+        return self._seq
+
+    def _send_dict(self, payload: dict[str, Any]) -> None:
+        """Schedule sending ``payload`` as JSON without awaiting it.
+
+        Must be called while an event loop is running.
+
+        Raises:
+            RuntimeError: If the WebSocket is not connected.
+        """
+        if self._ws is None:
+            raise RuntimeError("WebSocket is not connected")
+        task = asyncio.create_task(
+            self._ws.send(json.dumps(payload, ensure_ascii=False))
+        )
+        # Keep the task alive until it completes, then drop the reference.
+        self._pending_sends.add(task)
+        task.add_done_callback(self._pending_sends.discard)
+
+    async def connect(self) -> None:
+        """Open the WebSocket, start audio playback and send ``init``.
+
+        If anything fails after the socket has been opened, the partially
+        initialised resources are released before the error propagates.
+        """
+        self._ws = await websockets.connect(self.uri)
+        try:
+            # Mono int16 output; _audio_callback relies on this layout.
+            self._stream = sd.RawOutputStream(
+                samplerate=self.sample_rate,
+                channels=1,
+                dtype="int16",
+                blocksize=self.block_size,
+                callback=self._audio_callback,
+            )
+            self._stream.start()
+
+            init_msg: dict[str, Any] = {
+                "type": "init",
+                "seq": self._next_seq(),
+                "session_id": "python-client",
+                "language": self.language,
+                "speed": self.speed,
+                "emotion": self.emotion,
+            }
+            if self.voice_ref:
+                init_msg["voice_ref"] = self.voice_ref
+            if self.voice_refs:
+                init_msg["voice_refs"] = self.voice_refs
+
+            await self._ws.send(json.dumps(init_msg, ensure_ascii=False))
+        except BaseException:
+            # Do not leak the socket / audio stream on a failed handshake.
+            await self.close()
+            raise
+
+    def _audio_callback(self, outdata: Any, frames: int, _time, _status) -> None:
+        """Fill ``outdata`` with the next ``frames`` frames of buffered PCM.
+
+        ``RawOutputStream`` passes ``outdata`` as a plain writable buffer, so
+        the bytes are copied in directly. When fewer bytes are buffered than
+        requested, the remainder is padded with silence (zero bytes).
+        """
+        needed = frames * 2  # mono int16 = 2 bytes per frame
+        with self._lock:
+            chunk = bytes(self._audio_buffer[:needed])
+            # Consume in place so a concurrent extend() is never lost by
+            # rebinding the attribute to a stale copy.
+            del self._audio_buffer[:needed]
+        if len(chunk) < needed:
+            chunk += b"\x00" * (needed - len(chunk))
+        outdata[:] = chunk
+
+    async def speak_text(self, text: str, chunk_delay: float = 0.15) -> None:
+        """Simulate streaming text by sending it word-by-word, then flush.
+
+        Args:
+            text: Text to speak; it is split on whitespace.
+            chunk_delay: Seconds to sleep after each ``text`` message.
+
+        Raises:
+            RuntimeError: If ``connect()`` has not been called.
+        """
+        if self._ws is None:
+            raise RuntimeError("Call connect() first")
+
+        words = text.split()
+        last_index = len(words) - 1
+        for i, word in enumerate(words):
+            # Every word except the last carries a trailing space.
+            payload = word + (" " if i < last_index else "")
+            await self._ws.send(
+                json.dumps(
+                    {"type": "text", "payload": payload, "seq": self._next_seq()},
+                    ensure_ascii=False,
+                )
+            )
+            await asyncio.sleep(chunk_delay)
+
+        await self._ws.send(json.dumps({"type": "flush", "seq": self._next_seq()}))
+
+    async def stop(self, reason: str = "interrupt") -> None:
+        """Drop locally buffered audio and send ``stop`` to the server.
+
+        Does nothing when the client is not connected.
+        """
+        if self._ws is None:
+            return
+        # Silence local playback first so it stops even if the send fails.
+        with self._lock:
+            self._audio_buffer.clear()
+        await self._ws.send(
+            json.dumps({"type": "stop", "reason": reason, "seq": self._next_seq()})
+        )
+
+    async def run(self, text: str) -> None:
+        """Connect, speak ``text``, wait for playback to finish, then close.
+
+        A closed connection ends the run quietly. If the run is cancelled
+        (for example by Ctrl+C under ``asyncio.run``), a best-effort ``stop``
+        is sent before the cancellation propagates.
+        """
+        await self.connect()
+        receive_task = asyncio.create_task(self._receive_loop())
+        try:
+            await self.speak_text(text)
+            await receive_task
+        except websockets.exceptions.ConnectionClosed:
+            pass
+        except asyncio.CancelledError:
+            try:
+                await self.stop()
+            except (websockets.exceptions.ConnectionClosed, OSError):
+                # Best effort only: the connection may already be gone.
+                pass
+            raise
+        finally:
+            # Never leave the receiver running after run() returns.
+            if not receive_task.done():
+                receive_task.cancel()
+                try:
+                    await receive_task
+                except asyncio.CancelledError:
+                    pass
+            await self.close()
+
+    async def _receive_loop(self) -> None:
+        """Consume server messages until a final status or a closed socket.
+
+        ``audio`` messages are base64-decoded and appended to the playback
+        buffer, ``status`` messages are printed (``stopped``/``finished``
+        drain the buffer and end the loop), ``error`` messages go to stderr.
+
+        Raises:
+            RuntimeError: If ``connect()`` has not been called.
+        """
+        if self._ws is None:
+            raise RuntimeError("Call connect() first")
+        finished_events = {"stopped", "finished"}
+        while True:
+            try:
+                raw = await self._ws.recv()
+            except websockets.exceptions.ConnectionClosed:
+                break
+
+            msg = json.loads(raw)
+            msg_type = msg.get("type")
+
+            if msg_type == "audio":
+                pcm = base64.b64decode(msg["data"])
+                with self._lock:
+                    self._audio_buffer.extend(pcm)
+
+            elif msg_type == "status":
+                event = msg.get("event")
+                if event in finished_events:
+                    # Wait for the audio buffer to drain before exiting.
+                    await self._drain()
+                    break
+                print(f"[status] {event} seq={msg.get('seq')}")
+
+            elif msg_type == "error":
+                print(f"[error] {msg.get('message')}", file=sys.stderr)
+
+    async def _drain(self) -> None:
+        """Wait until the local audio buffer has been handed to the stream."""
+        while True:
+            with self._lock:
+                if not self._audio_buffer:
+                    break
+            await asyncio.sleep(0.05)
+        # Give sounddevice a little extra time to finish its current block.
+        await asyncio.sleep(0.2)
+
+    async def close(self) -> None:
+        """Stop audio output and close the WebSocket; safe to call twice."""
+        stream, self._stream = self._stream, None
+        ws, self._ws = self._ws, None
+        try:
+            if stream is not None:
+                stream.stop()
+                stream.close()
+        finally:
+            # Close the socket even if tearing down the stream failed.
+            if ws is not None:
+                await ws.close()
+
+
+def main() -> None:
+    """Command-line entry point: parse options and speak the given text.
+
+    Positional words are joined with single spaces into one phrase; when
+    none are given, a built-in test phrase is used. Ctrl+C prints
+    "Interrupted" instead of a traceback.
+    """
+    cli = argparse.ArgumentParser(description="Voice TTS WebSocket client")
+    cli.add_argument("--uri", default="ws://localhost:8765/ws")
+    cli.add_argument("--voice-ref", default=None)
+    cli.add_argument("--language", default="ru")
+    cli.add_argument("--speed", type=float, default=1.0)
+    cli.add_argument("--emotion", default="neutral")
+    cli.add_argument("--sample-rate", type=int, default=24_000)
+    cli.add_argument("text", nargs="*", default=["Привет. Это тестовая фраза."])
+    opts = cli.parse_args()
+
+    settings = {
+        "uri": opts.uri,
+        "voice_ref": opts.voice_ref,
+        "language": opts.language,
+        "speed": opts.speed,
+        "emotion": opts.emotion,
+        "sample_rate": opts.sample_rate,
+    }
+    phrase = " ".join(opts.text)
+    client = VoiceTTSClient(**settings)
+
+    try:
+        asyncio.run(client.run(phrase))
+    except KeyboardInterrupt:
+        print("Interrupted")
+
+
+# Run the CLI only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index 761e770..7baa355 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,6 +38,10 @@
"httpx>=0.27.0",
"websockets>=13.0",
]
+client = [
+ "websockets>=13.0",
+ "sounddevice>=0.5.0",
+]
[project.scripts]
voice-tts = "voice_tts.main:main"
diff --git a/requirements.txt b/requirements.txt
index 0fc131a..3ee11ab 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -31,3 +31,6 @@
accelerate>=0.34.0
sentencepiece>=0.2.0
bitsandbytes>=0.44.0
+
+# Optional clients (server works without them)
+# sounddevice>=0.5.0 # for examples/client_python.py