diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..c662d52 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,87 @@ +## General + +- **Communicate with the user in Russian.** All explanations, reasoning, and feedback should be written in Russian unless explicitly asked otherwise. + +## Quick Commands + +- **Run server (Fish Speech default):** `python -m voice_tts.main` +- **Dummy backend for fast local tests:** `TTS_BACKEND=dummy python -m voice_tts.main` +- **XTTS-v2 backend:** `TTS_BACKEND=xtts_v2 python -m voice_tts.main` +- **Console script (installed):** `voice-tts` +- **Health check:** `curl http://localhost:8765/health` +- **Browser test client:** `cd examples && python -m http.server 8080` → open `http://localhost:8080/client_browser.html` +- **Browser test (dummy):** `TTS_BACKEND=dummy python -m voice_tts.main` + the HTTP server from `examples/` + +## Project Layout + +``` +scripts/ — standalone utilities (benchmark, download) +src/voice_tts/ — package entry points + main.py — uvicorn.run app (the console-script target) + config.py — pydantic-settings (Settings class); .env is auto-loaded here + api/server.py — FastAPI + WebSocket session loop; _create_engine() picks backend by TTS_BACKEND env var + api/protocol.py — Pydantic msg models for /ws protocol + session/state.py — SessionState, VoiceProfile + tts/engine.py — TTSEngine ABC, DummyTTSEngine + tts/fish_speech_backend.py — Fish Speech 1.5 implementation + tts/f5_backend.py — F5-TTS v1 implementation + tts/xtts_backend.py — XTTS-v2 implementation (auto-downloads from Coqui) + tts/segmenter.py — sentence-break + comma fallback segmentation + tts/utils.py — preprocess_text_for_tts() + audio/formats.py — float32→PCM16→base64, WAV header generation +tests/ — pytest files +models/ — local model checkpoints (gitignored) +voices/ — reference audio (wavs/flac); .wav files gitignored but .lab files are kept and used by Fish Speech +``` + +## Python & Dependencies + +Python 3.10–3.12 is required (set in 
`pyproject.toml`). PyTorch must be installed with CUDA support before other deps: + +```bash +pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu126 +pip install -r requirements.txt +``` + +## Configuration + +All settings live in `config.py`; the Settings class auto-loads from `.env` via pydantic-settings. + +Key variables: + +| Variable | Default | Notes | +|---|---|---| +| `TTS_BACKEND` | `fish_speech` | One of: dummy / f5_tts / xtts_v2 / fish_speech. Switching backends requires a clean restart (engine is built lazily on first connection). | +| `TTS_MODEL_PATH` | — | Fish Speech checkpoint folder (contains model.pth, firefly-gan-vq-fsq-8x1024-21hz-generator.pth, tokenizer.tiktoken, config.json) | +| `TTS_VOCAB_PATH` | — | Fish Speech v1.5 source tree path (used to import firefly_gan / FSQ modules) | +| `TTS_MODEL_NAME` | `tts_models/multilingual/multi-dataset/xtts_v2` | Coqui model manager path; xtts_v2 downloads this on first use | +| `FISH_COMPILE` | `false` | Avoid setting to true. Enables torch.compile but causes CUDAGraphs tensor-overwrite errors on repeated inference. | +| `FISH_CHUNK_LENGTH` | 200 | Chunk length for Fish Speech (100–300). Higher = more GPU work per call, higher latency. | + +## WebSocket Protocol (`/ws`) + +- Server at `ws://localhost:8765/ws` +- Messages are JSON, client-sent types: `init`, `text`, `flush`, `stop`, `emotion`, `config` +- Server sends back: `status` (session_ready / segment_started / stopped / config_updated), `audio` (sample_rate + base64 data), plus error messages on failure. + +## Testing + +```bash +pytest tests/ # asyncio_mode = auto, paths in tests/ +``` + +Fixtures and reference audio live in `tests/`. No external services required — dummy backend works for unit-level tests without GPU. Fish Speech backends need the local checkpoint in `models/fishaudio_fish-speech-1.5/` (gitignored). 
+ +## Scripts + +- `scripts/benchmark_backends.py` — compare inference times across backends +- `scripts/download_f5_tts.py` — downloads F5-TTS v1 model files into `models/F5TTS_v1_Base/` +- `scripts/benchmark_compile.py` — torch.compile benchmarking utility + +## Important Gotchas + +- **Engine is built lazily on first `/ws` connection** in `_create_engine()` inside `api/server.py`. Changing `TTS_BACKEND` requires a full server restart, not just a message-level config change. +- **All GPU calls are serialized through one `_synth_lock`.** Concurrent sessions share a single inference thread — this exists to avoid CUDA contention and OOM on multi-gpu setups. +- `.env` is gitignored but `.env.example` is the source of truth for supported variables. `config.py` line 50 sets `env_file = ".env"`. +- The dummy backend runs via a transient event loop (see `_sync_synthesize` in server.py:291), which means if your test modifies global asyncio state it can break other tests — run tests independently or set `asyncio_mode=auto`. +- **Space insertion between text payloads.** In `_handle_text` (server.py:152–157), a space is automatically inserted between consecutive payloads if neither side has whitespace at the join point. This prevents word merging when clients send word-by-word without trailing spaces (e.g. the browser client). Clients do not need to add leading/trailing spaces to payloads, but each payload must end on a word boundary — a word split across two payloads gets a space inserted in the middle. diff --git a/docs/05_usage.md b/docs/05_usage.md index ebcaf78..41105ee 100644 --- a/docs/05_usage.md +++ b/docs/05_usage.md @@ -3,7 +3,7 @@ ## Установка > Рекомендуется Python 3.11. Python 3.14+ пока не имеет совместимых wheel для -> `torch` / `f5-tts`, поэтому используйте 3.10–3.12. +> `torch`, поэтому используйте 3.10–3.12. 
```bash # Клонировать / перейти в директорию проекта @@ -26,16 +26,21 @@ ## Запуск сервера ```bash -# Основной режим: F5-TTS на GPU +# Основной режим: Fish Speech 1.5 на GPU (по умолчанию) python -m voice_tts.main # С настроенным референсом и warm-up (рекомендуется) -TTS_BACKEND=f5_tts \ -DEFAULT_VOICE_REF=voices/rick_ref_clean.wav \ -DEFAULT_REF_TEXT="Ва-ба-ла-ба-дап-дап! Рикки-тики-тави, сученька! И вот такие у нас новости! Иди." \ +TTS_BACKEND=fish_speech \ +DEFAULT_VOICE_REF=voices/self_ref_clean.wav \ +DEFAULT_REF_TEXT="Добрый вечер, меня зовут Евгений." \ WARMUP=true \ python -m voice_tts.main +# Быстрый вариант XTTS-v2 +TTS_BACKEND=xtts_v2 \ +DEFAULT_VOICE_REF=voices/self_ref_clean.wav \ +python -m voice_tts.main + # Тестовый режим без модели TTS_BACKEND=dummy python -m voice_tts.main ``` @@ -46,7 +51,7 @@ ```bash curl http://localhost:8765/health -# {"status":"ok","backend":"f5_tts"} +# {"status":"ok","backend":"fish_speech"} ``` ## Клиенты @@ -68,7 +73,7 @@ ```bash python examples/client_python.py \ --uri ws://localhost:8765/ws \ - --voice-ref voices/rick_ref_clean.wav \ + --voice-ref voices/self_ref_clean.wav \ --language ru \ --speed 1.0 \ "Это тестовая фраза для проверки." @@ -84,11 +89,24 @@ ### Браузерный клиент -Откройте `examples/client_browser.html` в браузере, укажите URI сервера -и нажмите **Connect**, затем **Speak streaming**. Браузер создаст -`AudioContext` на 24 кГц, декодирует PCM16 из base64 и ставит буферы -в очередь для бесшовного воспроизведения. Кнопка **Stop** отправляет -`stop` на сервер. +```bash +# 1. Терминал 1: запустить TTS сервер (dummy — без GPU, любой бэкенд по выбору) +TTS_BACKEND=dummy python -m voice_tts.main + +# 2. Терминал 2: открыть клиент через HTTP (file:// не даёт WebSocket в некоторых браузерах) +python -m http.server 8080 --directory examples/ +# Открыть http://localhost:8080/client_browser.html +``` + +Нажмите **Connect**, затем **Speak streaming**. Клиент: + +1. Шлёт `init` с настройками (язык, скорость, эмоция). 
+2. Шлёт слова по одному как потоковые `text` с задержкой 120 мс. +3. Завершает `flush`. +4. Получает `audio`-чанки с динамическим `sample_rate` (поддерживается любой бэкенд). +5. Декодирует PCM16 из base64 и ставит в очередь `AudioBuffer` для бесшовного воспроизведения. + +Кнопка **Stop** отправляет `stop`. Кнопка **Test audio** проверяет звук в браузере независимо от сервера. ## Настройка через переменные окружения (.env) @@ -97,38 +115,43 @@ | `HOST` | Хост сервера | `0.0.0.0` | | `PORT` | Порт сервера | `8765` | | `LOG_LEVEL` | Уровень логирования | `INFO` | -| `TTS_BACKEND` | Бэкенд (`dummy` / `f5_tts`) | `f5_tts` | -| `TTS_SAMPLE_RATE` | Частота дискретизации | `24000` | +| `TTS_BACKEND` | Бэкенд (`dummy` / `fish_speech` / `xtts_v2` / `f5_tts`) | `fish_speech` | +| `TTS_MODEL_PATH` | Папка с checkpoint Fish Speech / XTTS | — | +| `TTS_VOCAB_PATH` | Исходники Fish Speech v1.5.1 | `models/fish-speech-v1.5.1` | +| `TTS_SAMPLE_RATE` | Частота дискретизации | `44100` | +| `TTS_SPEED` | Множитель скорости речи | `1.2` | | `VOICES_DIR` | Директория с референсами | `voices` | +| `DEFAULT_VOICE_REF` | Референс по умолчанию | — | +| `DEFAULT_REF_TEXT` | Точный текст референса (skip Whisper) | — | | `MIN_SEGMENT_LENGTH` | Мин. длина сегмента | `30` | | `MAX_SEGMENT_LENGTH` | Макс. длина сегмента | `200` | | `MAX_BUFFER_WAIT_MS` | Макс. ожидание перед flush | `500` | | `DEVICE` | `cuda` или `cpu` | `cuda` | -| `DTYPE` | `bfloat16` / `float16` | `bfloat16` | -| `DEFAULT_VOICE_REF` | Путь к референсу по умолчанию | — | -| `DEFAULT_REF_TEXT` | Точный текст референса (skip Whisper) | — | +| `DTYPE` | `bfloat16` / `float16` / `float32` | `bfloat16` | +| `FISH_COMPILE` | `torch.compile` для Fish Speech | `false` | +| `FISH_CHUNK_LENGTH` | Длина LLM-чанка Fish Speech | `200` | +| `FISH_USE_MEMORY_CACHE` | Кэшировать VQ референса | `on` | | `WARMUP` | Прогреть CUDA и кэшировать референс | `false` | | `WARMUP_TEXT` | Текст для warm-up | `Привет. 
Это тестовая фраза.` | -## Загрузка модели +## Модели -Если `TTS_BACKEND=f5_tts` (по умолчанию), при первом старте сервер автоматически -скачает нужный checkpoint из Hugging Face в кэш. Чтобы скачать модель -заранее: +### Fish Speech 1.5 -```bash -python scripts/download_f5_tts.py --model F5TTS_v1_Base -``` +По умолчанию используется локальный checkpoint `models/fishaudio_fish-speech-1.5/`: -Поддерживаемые варианты: `F5TTS_v1_Base`, `F5TTS_Base`, `E2TTS_Base`. -Модель сохраняется в `models/F5TTS_v1_Base/`. +- `model.pth` — LLaMA языковая модель, +- `firefly-gan-vq-fsq-8x1024-21hz-generator.pth` — VQ-GAN декодер, +- `tokenizer.tiktoken`, `config.json`, `special_tokens.json`. -## Тесты +Исходный код Fish Speech v1.5.1 должен лежать в `models/fish-speech-v1.5.1/`, +чтобы Python мог импортировать нужные модули. -```bash -# Быстрые тесты без загрузки F5-TTS -TTS_BACKEND=dummy python -m pytest tests/ -v -``` +### XTTS-v2 + +Coqui-модель `tts_models/multilingual/multi-dataset/xtts_v2` скачивается +автоматически при первом запуске `TTS_BACKEND=xtts_v2`. Можно указать +локальный путь через `TTS_MODEL_PATH`. ## Референсные аудио @@ -136,6 +159,8 @@ ``` voices/ +├── self_ref_clean.wav +├── self_ref_clean.lab ├── default_neutral.wav ├── default_happy.wav ├── default_sad.wav @@ -145,7 +170,21 @@ Требования к референсу: - WAV или другой формат, читаемый `torchaudio`. - Моно, 16+ кГц. -- Длина 3–10 секунд (для F5-TTS). +- Длина 5–15 секунд для Fish Speech, 3–10 секунд для XTTS-v2. - Чистая речь одного спикера без фонового шума. -- Для мгновенного старта задайте точный `DEFAULT_REF_TEXT` — иначе сервер - будет транскрибировать референс через Whisper при первом запуске (5–6 с). +- Для Fish Speech рядом с `.wav` можно положить `.lab` с точным транскриптом. + Иначе сервер использует `DEFAULT_REF_TEXT` или Whisper-транскрипцию. 
+ +## Тесты + +```bash +# Быстрые тесты без загрузки тяжёлых моделей +python -m pytest tests/ -v +``` + +Для запуска в CI рекомендуется отдельно прогонять быстрые и тяжёлые тесты: + +```bash +python -m pytest tests/test_segmenter.py tests/test_server.py -v +python -m pytest tests/test_fish_speech_backend.py -v +``` diff --git a/examples/client_browser.html b/examples/client_browser.html index e24cb4e..a7f237e 100644 --- a/examples/client_browser.html +++ b/examples/client_browser.html @@ -32,10 +32,6 @@
@@ -48,8 +44,8 @@
- - + + @@ -80,10 +76,10 @@ const ensureAudioContext = async () => { const AudioContextCtor = window.AudioContext || window.webkitAudioContext; if (!audioCtx) { - audioCtx = new AudioContextCtor({ sampleRate: 24000 }); + audioCtx = new AudioContextCtor({ sampleRate: 44100 }); } if (audioCtx.state === 'closed') { - audioCtx = new AudioContextCtor({ sampleRate: 24000 }); + audioCtx = new AudioContextCtor({ sampleRate: 44100 }); } if (audioCtx.state === 'suspended') { log(`Resuming AudioContext (state=${audioCtx.state}) ...`); @@ -98,13 +94,15 @@ } }; + const sampleRate = 44100; + const playTone = async (freq = 440, duration = 0.5, amplitude = 0.5) => { await ensureAudioContext(); - const samplesCount = Math.ceil(24000 * duration); - const buffer = audioCtx.createBuffer(1, samplesCount, 24000); + const samplesCount = Math.ceil(sampleRate * duration); + const buffer = audioCtx.createBuffer(1, samplesCount, sampleRate); const channel = buffer.getChannelData(0); for (let i = 0; i < samplesCount; i++) { - const t = i / 24000; + const t = i / sampleRate; channel[i] = amplitude * Math.sin(2 * Math.PI * freq * t) * (1 - t / duration); } const source = audioCtx.createBufferSource(); @@ -115,31 +113,34 @@ log(`Playing test tone at ${freq} Hz for ${duration}s`); }; - const playPcm16 = async (base64Data, seq) => { + const base64ToBytes = (base64) => { + const binary = atob(base64); + const bytes = new Uint8Array(binary.length); + for (let i = 0; i < binary.length; i++) { + bytes[i] = binary.charCodeAt(i); + } + return bytes; + }; + + const playPcm16 = async (base64Data, seq, serverSampleRate = sampleRate) => { await ensureAudioContext(); if (audioCtx.state !== 'running') { throw new Error(`AudioContext not running (state=${audioCtx.state})`); } - const raw = atob(base64Data); - if (raw.length === 0) { + const bytes = base64ToBytes(base64Data); + if (bytes.length === 0) { throw new Error('Empty audio data'); } - if (raw.length % 2 !== 0) { - throw new Error(`Odd raw audio length: 
${raw.length}`); + if (bytes.length % 2 !== 0) { + throw new Error(`Odd raw audio length: ${bytes.length}`); } - const sampleCount = raw.length / 2; - const samples = new Int16Array(sampleCount); - const view = new DataView(samples.buffer); - for (let i = 0; i < raw.length; i += 2) { - // little-endian PCM16 - samples[i / 2] = view.getInt16(i, true); - } - - // Convert to float32 AudioBuffer - const buffer = audioCtx.createBuffer(1, sampleCount, 24000); + const sampleCount = bytes.length / 2; + const view = new DataView(bytes.buffer); + const buffer = audioCtx.createBuffer(1, sampleCount, serverSampleRate); const channel = buffer.getChannelData(0); for (let i = 0; i < sampleCount; i++) { - channel[i] = samples[i] / 32768.0; + // little-endian PCM16 -> float32 in [-1, 1] + channel[i] = view.getInt16(i * 2, true) / 32768.0; } const source = audioCtx.createBufferSource(); @@ -153,7 +154,7 @@ const startAt = nextStartTime; source.start(startAt); nextStartTime += buffer.duration; - log(`audio queued seq=${seq} raw=${raw.length} samples=${sampleCount} duration=${buffer.duration.toFixed(2)}s startAt=${startAt.toFixed(3)} ctxState=${audioCtx.state}`); + log(`audio queued seq=${seq} raw=${bytes.length} samples=${sampleCount} duration=${buffer.duration.toFixed(2)}s startAt=${startAt.toFixed(3)} ctxState=${audioCtx.state}`); source.onended = () => { log(`audio ended seq=${seq}`); @@ -204,7 +205,7 @@ if (!msg.data || String(msg.data).trim() === '') { throw new Error('Server sent empty audio data'); } - await playPcm16(msg.data, msg.seq); + await playPcm16(msg.data, msg.seq, msg.sample_rate); } catch (err) { log(`audio playback error: ${err.message}`); } diff --git a/src/voice_tts/api/server.py b/src/voice_tts/api/server.py index 49ca640..27e0d57 100644 --- a/src/voice_tts/api/server.py +++ b/src/voice_tts/api/server.py @@ -26,13 +26,17 @@ from voice_tts.session.state import SessionState, VoiceProfile from voice_tts.tts.engine import DummyTTSEngine, TTSEngine from 
voice_tts.tts.f5_backend import F5TTSEngine +from voice_tts.tts.fish_speech_backend import FishSpeechEngine from voice_tts.tts.segmenter import Segmenter +from voice_tts.tts.xtts_backend import XTTSv2Engine # Supported TTS backends _BACKEND_MAP: dict[str, type[TTSEngine]] = { "dummy": DummyTTSEngine, "f5_tts": F5TTSEngine, + "xtts_v2": XTTSv2Engine, + "fish_speech": FishSpeechEngine, } @@ -145,6 +149,11 @@ # After a stop, new text implicitly resets the stop flag self.state.reset_stop() + # Ensure a space between consecutive payloads so words don't merge + # (clients often send word-by-word without leading/trailing spaces). + if self.state.text_buffer and msg.payload: + if not self.state.text_buffer[-1].isspace() and not msg.payload[0].isspace(): + self.state.text_buffer += " " self.state.text_buffer += msg.payload if msg.emotion: self.state.emotion = msg.emotion @@ -305,7 +314,7 @@ return asyncio.run_coroutine_threadsafe(_run(), loop).result() return asyncio.run(_run()) - # F5-TTS exposes an async synthesize method that blocks on CPU/CUDA work. + # Backends expose an async synthesize method that blocks on CPU/CUDA work. # Inside a thread from asyncio.to_thread there is no running loop, so we # drive the coroutine with a fresh transient event loop. 
kwargs: dict = dict( @@ -317,6 +326,8 @@ ) if isinstance(engine, F5TTSEngine) and settings.default_ref_text: kwargs["ref_text"] = settings.default_ref_text + if isinstance(engine, FishSpeechEngine) and settings.default_ref_text: + kwargs["ref_text"] = settings.default_ref_text return asyncio.run(engine.synthesize(**kwargs)) def _stop_all(self) -> None: @@ -343,6 +354,21 @@ model=settings.tts_model_name, sample_rate=settings.tts_sample_rate, ) + elif engine_cls is XTTSv2Engine: + engine = engine_cls( + model_name=settings.tts_model_name, + sample_rate=settings.tts_sample_rate, + ) + elif engine_cls is FishSpeechEngine: + engine = engine_cls( + checkpoint_path=settings.tts_model_path, + source_root=settings.tts_vocab_path, + sample_rate=settings.tts_sample_rate, + device=settings.device, + compile=settings.fish_compile, + use_memory_cache=settings.fish_use_memory_cache, + chunk_length=settings.fish_chunk_length, + ) else: engine = engine_cls(sample_rate=settings.tts_sample_rate) if hasattr(engine, "load"):