diff --git a/docs/05_usage.md b/docs/05_usage.md index 98648a6..ebcaf78 100644 --- a/docs/05_usage.md +++ b/docs/05_usage.md @@ -49,52 +49,47 @@ # {"status":"ok","backend":"f5_tts"} ``` -## Пример Python-клиента +## Клиенты -```python -import asyncio -import base64 -import json -import websockets +В директории `examples/` лежат готовые клиенты: +- `examples/client_python.py` — Python-клиент с воспроизведением через `sounddevice`. +- `examples/client_browser.html` — HTML/JS клиент для браузера с `AudioContext`. -async def main(): - uri = "ws://localhost:8765/ws" - async with websockets.connect(uri) as ws: - await ws.send(json.dumps({ - "type": "init", - "session_id": "demo", - "voice_ref": "voices/default_neutral.wav", - "language": "ru", - "speed": 1.0, - "emotion": "neutral", - "seq": 1, - })) +### Python-клиент - for chunk in ["Привет, ", "как ", "дела?"]: - await ws.send(json.dumps({ - "type": "text", - "payload": chunk, - "seq": 2, - })) - await asyncio.sleep(0.2) - - await ws.send(json.dumps({"type": "flush", "seq": 3})) - - while True: - msg = json.loads(await ws.recv()) - print(msg) - if msg["type"] == "audio": - pcm = base64.b64decode(msg["data"]) - # отправить pcm на воспроизведение - if msg["type"] == "status" and msg["event"] == "stopped": - break - - -if __name__ == "__main__": - asyncio.run(main()) +```bash +pip install websockets sounddevice +python examples/client_python.py --uri ws://localhost:8765/ws "Привет, мир!" ``` +Опции: + +```bash +python examples/client_python.py \ + --uri ws://localhost:8765/ws \ + --voice-ref voices/rick_ref_clean.wav \ + --language ru \ + --speed 1.0 \ + "Это тестовая фраза для проверки." +``` + +Клиент: +1. Отправляет `init` с настройками. +2. Разбивает текст на слова и шлёт их как потоковые `text`. +3. Отправляет `flush`. +4. Получает `audio`-чанки, декодирует base64 PCM16 и складывает в аудиобуфер. +5. `sounddevice` воспроизводит аудио в реальном времени из callback. +6. По `Ctrl+C` отправляет `stop` и выходит. 
+ + ### Браузерный клиент + +Откройте `examples/client_browser.html` в браузере, укажите URI сервера +и нажмите **Connect**, затем **Speak streaming**. Браузер создаёт +`AudioContext` на 24 кГц, декодирует PCM16 из base64 и ставит буферы +в очередь для бесшовного воспроизведения. Кнопка **Stop** отправляет +`stop` на сервер. + ## Настройка через переменные окружения (.env) | Переменная | Описание | По умолчанию | diff --git a/examples/client_browser.html b/examples/client_browser.html new file mode 100644 index 0000000..49f41c6 --- /dev/null +++ b/examples/client_browser.html @@ -0,0 +1,200 @@ + + + + + + Voice TTS WebSocket Client + + + +

Voice TTS WebSocket Client

+ + + + +
+
+ + +
+
+ + +
+
+ + +
+
+ + + + + + + +
+ + + +
+ +
+
+
+
+
diff --git a/examples/client_python.py b/examples/client_python.py
new file mode 100644
index 0000000..c891ee2
--- /dev/null
+++ b/examples/client_python.py
@@ -0,0 +1,308 @@
+"""Simple Python WebSocket client for the Voice TTS server.
+
+Streams text in chunks, receives base64 PCM audio and plays it via sounddevice.
+Install dependencies:
+    pip install websockets sounddevice
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import base64
+import contextlib
+import json
+import sys
+import threading
+from typing import Any
+
+import sounddevice as sd
+import websockets
+
+
+class VoiceTTSClient:
+    """WebSocket client that speaks LLM/agent output in real time.
+
+    Audio received from the server is appended to an in-memory byte buffer by
+    the asyncio receive loop and consumed by the sounddevice callback. The
+    callback runs on sounddevice's audio thread, not the event-loop thread,
+    so every buffer access is guarded by a ``threading.Lock``.
+    """
+
+    def __init__(
+        self,
+        uri: str = "ws://localhost:8765/ws",
+        voice_ref: str | None = None,
+        voice_refs: dict[str, str] | None = None,
+        language: str = "ru",
+        speed: float = 1.0,
+        emotion: str = "neutral",
+        sample_rate: int = 24_000,
+        block_size: int = 2048,
+    ):
+        """Store connection and voice settings; no I/O happens here.
+
+        Args:
+            uri: WebSocket endpoint of the TTS server.
+            voice_ref: Optional value sent as ``voice_ref`` in ``init``.
+            voice_refs: Optional mapping sent as ``voice_refs`` in ``init``.
+            language: Value sent as ``language`` in ``init``.
+            speed: Value sent as ``speed`` in ``init``.
+            emotion: Value sent as ``emotion`` in ``init``.
+            sample_rate: Sample rate of the output stream, in Hz.
+            block_size: Block size of the output stream, in frames.
+        """
+        self.uri = uri
+        self.voice_ref = voice_ref
+        self.voice_refs = voice_refs or {}
+        self.language = language
+        self.speed = speed
+        self.emotion = emotion
+        self.sample_rate = sample_rate
+        self.block_size = block_size
+
+        self._ws: Any = None
+        self._seq = 0
+        self._audio_buffer: bytearray = bytearray()
+        self._current_segment_seq: int | None = None
+        self._stream: sd.RawOutputStream | None = None
+        # A thread lock, not asyncio.Lock: the audio callback touches the
+        # buffer from outside the event loop.
+        self._lock = threading.Lock()
+        # Strong references to fire-and-forget send tasks (see _send_dict).
+        self._pending_sends: set[asyncio.Task[None]] = set()
+
+    def _next_seq(self) -> int:
+        """Return the next outgoing sequence number (the first one is 1)."""
+        self._seq += 1
+        return self._seq
+
+    async def _send_json(self, payload: dict[str, Any]) -> None:
+        """Serialize *payload* to JSON and send it over the open socket."""
+        if self._ws is None:
+            raise RuntimeError("WebSocket is not connected")
+        await self._ws.send(json.dumps(payload, ensure_ascii=False))
+
+    def _send_dict(self, payload: dict[str, Any]) -> None:
+        """Schedule *payload* for sending without waiting for completion.
+
+        Raises:
+            RuntimeError: If the WebSocket is not connected.
+        """
+        if self._ws is None:
+            raise RuntimeError("WebSocket is not connected")
+        task = asyncio.create_task(self._send_json(payload))
+        # The event loop only keeps weak references to tasks; hold one here
+        # so the send is not garbage-collected before it completes.
+        self._pending_sends.add(task)
+        task.add_done_callback(self._pending_sends.discard)
+
+    async def connect(self) -> None:
+        """Open the WebSocket, start audio playback and send ``init``."""
+        self._ws = await websockets.connect(self.uri)
+
+        # Raw PCM16 mono output; ``sample_rate`` has to match the rate of the
+        # audio the server sends, since the bytes are played back as-is.
+        self._stream = sd.RawOutputStream(
+            samplerate=self.sample_rate,
+            channels=1,
+            dtype="int16",
+            blocksize=self.block_size,
+            callback=self._audio_callback,
+        )
+        self._stream.start()
+
+        init_msg: dict[str, Any] = {
+            "type": "init",
+            "seq": self._next_seq(),
+            "session_id": "python-client",
+            "language": self.language,
+            "speed": self.speed,
+            "emotion": self.emotion,
+        }
+        if self.voice_ref:
+            init_msg["voice_ref"] = self.voice_ref
+        if self.voice_refs:
+            init_msg["voice_refs"] = self.voice_refs
+
+        await self._send_json(init_msg)
+
+    def _audio_callback(self, outdata: Any, frames: int, _time, _status) -> None:
+        """Fill *outdata* with the next *frames* frames, padding with silence.
+
+        Invoked by sounddevice. For a ``RawOutputStream`` *outdata* is a
+        writable buffer object, so the bytes are copied in directly.
+        """
+        needed = frames * 2  # mono int16 = 2 bytes per frame
+        with self._lock:
+            chunk = bytes(self._audio_buffer[:needed])
+            # Consume in place so producers keep appending to the same object.
+            del self._audio_buffer[:needed]
+        # On underrun the missing tail is played as silence.
+        outdata[:] = chunk.ljust(needed, b"\x00")
+
+    async def speak_text(self, text: str, chunk_delay: float = 0.15) -> None:
+        """Simulate streaming text by sending it word-by-word, then ``flush``.
+
+        Args:
+            text: Text to speak; it is split on whitespace.
+            chunk_delay: Pause in seconds after each word.
+
+        Raises:
+            RuntimeError: If :meth:`connect` has not been called.
+        """
+        if self._ws is None:
+            raise RuntimeError("Call connect() first")
+
+        words = text.split()
+        last = len(words) - 1
+        for i, word in enumerate(words):
+            # Every word except the last keeps a trailing space.
+            payload = word if i == last else word + " "
+            await self._send_json(
+                {"type": "text", "payload": payload, "seq": self._next_seq()}
+            )
+            await asyncio.sleep(chunk_delay)
+
+        await self._send_json({"type": "flush", "seq": self._next_seq()})
+
+    async def stop(self, reason: str = "interrupt") -> None:
+        """Send ``stop`` to the server and discard audio not yet played."""
+        if self._ws is None:
+            return
+        try:
+            await self._send_json(
+                {"type": "stop", "reason": reason, "seq": self._next_seq()}
+            )
+        finally:
+            # Silence local playback even if the message could not be sent.
+            with self._lock:
+                self._audio_buffer.clear()
+
+    async def run(self, text: str) -> None:
+        """Connect, speak *text*, play the reply to the end and disconnect.
+
+        When the task is cancelled (``asyncio.run`` does this on Ctrl+C) a
+        best-effort ``stop`` is sent before the connection is closed.
+        """
+        receive_task: asyncio.Task[None] | None = None
+        try:
+            await self.connect()
+            receive_task = asyncio.create_task(self._receive_loop())
+            await self.speak_text(text)
+            await receive_task
+        except websockets.exceptions.ConnectionClosed:
+            pass
+        except asyncio.CancelledError:
+            with contextlib.suppress(
+                OSError,
+                asyncio.TimeoutError,
+                websockets.exceptions.WebSocketException,
+            ):
+                await asyncio.wait_for(self.stop(), timeout=1.0)
+            raise
+        finally:
+            try:
+                # Do not leave the receiver running if sending failed.
+                if receive_task is not None and not receive_task.done():
+                    receive_task.cancel()
+                    with contextlib.suppress(asyncio.CancelledError):
+                        await receive_task
+            finally:
+                await self.close()
+
+    async def _receive_loop(self) -> None:
+        """Handle server messages until a terminal status or disconnect."""
+        if self._ws is None:
+            raise RuntimeError("Call connect() first")
+        finished_events = {"stopped", "finished"}
+        while True:
+            try:
+                raw = await self._ws.recv()
+            except websockets.exceptions.ConnectionClosed:
+                break
+
+            try:
+                msg = json.loads(raw)
+            except ValueError as exc:
+                print(f"[error] malformed message: {exc}", file=sys.stderr)
+                continue
+            if not isinstance(msg, dict):
+                continue
+            msg_type = msg.get("type")
+
+            if msg_type == "audio":
+                try:
+                    pcm = base64.b64decode(msg["data"])
+                except (KeyError, TypeError, ValueError) as exc:
+                    print(f"[error] bad audio message: {exc}", file=sys.stderr)
+                    continue
+                with self._lock:
+                    self._audio_buffer.extend(pcm)
+
+            elif msg_type == "status":
+                event = msg.get("event")
+                if event in finished_events:
+                    # Wait for the audio buffer to drain before exiting.
+                    await self._drain()
+                    break
+                print(f"[status] {event} seq={msg.get('seq')}")
+
+            elif msg_type == "error":
+                print(f"[error] {msg.get('message')}", file=sys.stderr)
+
+    async def _drain(self) -> None:
+        """Wait until the local audio buffer has been played."""
+        while True:
+            with self._lock:
+                remaining = len(self._audio_buffer)
+            if not remaining:
+                break
+            await asyncio.sleep(0.05)
+        # Give sounddevice a little extra time to finish its current block.
+        await asyncio.sleep(0.2)
+
+    async def close(self) -> None:
+        """Stop audio output and close the WebSocket; repeat calls are no-ops."""
+        stream, self._stream = self._stream, None
+        ws, self._ws = self._ws, None
+        try:
+            if stream is not None:
+                stream.stop()
+                stream.close()
+        finally:
+            # Close the socket even if the audio device raised.
+            if ws is not None:
+                await ws.close()
+
+
+def main() -> None:
+    """Parse command-line arguments and run the client once."""
+    parser = argparse.ArgumentParser(description="Voice TTS WebSocket client")
+    parser.add_argument("--uri", default="ws://localhost:8765/ws")
+    parser.add_argument("--voice-ref", default=None)
+    parser.add_argument("--language", default="ru")
+    parser.add_argument("--speed", type=float, default=1.0)
+    parser.add_argument("--emotion", default="neutral")
+    parser.add_argument("--sample-rate", type=int, default=24_000)
+    parser.add_argument("text", nargs="*", default=["Привет. Это тестовая фраза."])
+    args = parser.parse_args()
+
+    text = " ".join(args.text)
+    client = VoiceTTSClient(
+        uri=args.uri,
+        voice_ref=args.voice_ref,
+        language=args.language,
+        speed=args.speed,
+        emotion=args.emotion,
+        sample_rate=args.sample_rate,
+    )
+
+    try:
+        asyncio.run(client.run(text))
+    except KeyboardInterrupt:
+        print("Interrupted")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index 761e770..7baa355 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,6 +38,10 @@
     "httpx>=0.27.0",
     "websockets>=13.0",
 ]
+client = [
+    "websockets>=13.0",
+    "sounddevice>=0.5.0",
+]
 
 [project.scripts]
 voice-tts = "voice_tts.main:main"
diff --git a/requirements.txt b/requirements.txt
index 0fc131a..3ee11ab 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -31,3 +31,6 @@
 accelerate>=0.34.0
 sentencepiece>=0.2.0
 bitsandbytes>=0.44.0
+
+# Optional clients (server works without them)
+# sounddevice>=0.5.0  # for examples/client_python.py